From: Shaun Tancheff Date: Thu, 30 May 2024 22:11:42 +0000 (-0600) Subject: LU-16350 ldiskfs: Server support for LTS linux v6.6 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=6d4d1589a452f159fc4c63a594786dea266b1d7f;p=fs%2Flustre-release.git LU-16350 ldiskfs: Server support for LTS linux v6.6 Migrate uapi ext4 headers into staging for ldiskfs Updated patch series for Linux LTS v6.6.10 ext4-attach-jinode-in-writepages.patch ext4-dont-check-before-replay.patch ext4-mballoc-pa-free-mismatch.patch ext4-pdirop.patch ext4-prealloc.patch ext4-corrupted-inode-block-bitmaps-handling-patches.patch ext4-delayed-iput.patch ext4-encdata.patch ext4-ialloc-uid-gid-and-pass-owner-down.patch ext4-mballoc-extra-checks.patch Dropped: ext4-add-periodic-superblock-update.patch Test-Parameters: trivial HPE-bug-id: LUS-11376 Signed-off-by: Shaun Tancheff Change-Id: I2a0a5d4be1e724ed1936178ccc3f7a7e7a2672c7 --- diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index d35c936..d2827584 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -186,9 +186,13 @@ AS_IF([test -z "$LDISKFS_SERIES"], AS_VERSION_COMPARE([$LINUXRELEASE],[6.1.0], [ LDISKFS_SERIES="5.10.0-ml.series"], [ LDISKFS_SERIES="6.1.38-ml.series"], [ - LDISKFS_SERIES="6.1.38-ml.series"] - )] # 6.1 LTS - )] # 5.10 LTS + AS_VERSION_COMPARE([$LINUXRELEASE],[6.6.0], [ + LDISKFS_SERIES="6.1.38-ml.series"], [ + LDISKFS_SERIES="6.6-ml.series"], [ + LDISKFS_SERIES="6.6-ml.series"] + )] # 6.6 + )] # 6.1 + )] # 5.10 )] # 5.4 LTS )], []) diff --git a/ldiskfs/Makefile.in b/ldiskfs/Makefile.in index db4ba95..668d17e 100644 --- a/ldiskfs/Makefile.in +++ b/ldiskfs/Makefile.in @@ -7,6 +7,7 @@ backfs_extra := $(wildcard @LINUX@/fs/ext4/Makefile) backfs_headers := $(wildcard @EXT4_SRC_DIR@/*.h) linux_headers := $(wildcard @LINUX@/include/linux/ext4*.h) +uapi_linux_headers := $(wildcard @LINUX@/include/uapi/linux/ext4*.h) linux_new_headers := htree_lock.h trace_headers := 
$(wildcard @LINUX@/include/trace/events/ext4*.h) diff --git a/ldiskfs/autoMakefile.am b/ldiskfs/autoMakefile.am index 3bd7483..487d50a 100644 --- a/ldiskfs/autoMakefile.am +++ b/ldiskfs/autoMakefile.am @@ -10,9 +10,12 @@ ldiskfs$(KMODEXT): sources endif endif -ldiskfs_linux_headers := $(addprefix linux/,$(subst ext4,ldiskfs,$(notdir $(linux_headers)))) +ldiskfs_linux_headers := \ + $(addprefix linux/,$(subst ext4,ldiskfs,$(notdir $(linux_headers)))) \ + $(addprefix uapi/linux/,$(subst ext4,ldiskfs,$(notdir $(uapi_linux_headers)))) -$(filter %.c,$(ldiskfs_patched_sources)): sources $(ldiskfs_linux_headers) $(filter %.h,$(ldiskfs_patched_sources)) +$(filter %.c,$(ldiskfs_patched_sources)): \ + sources $(ldiskfs_linux_headers) $(filter %.h,$(ldiskfs_patched_sources)) # Convert LDISKFS_SUPER_MAGIC back to EXT4_SUPER_MAGIC so that the ldiskfs # code can use the existing kernel headers instead of defining this itself. @@ -29,19 +32,28 @@ ldiskfs_sed_flags = \ sed $(strip $(ldiskfs_sed_flags)) $< > $@ linux/ldiskfs%.h: linux-stage/include/linux/ext4%.h + @echo sed $(strip $(ldiskfs_sed_flags)) $< '=>' $@ + sed $(strip $(ldiskfs_sed_flags)) $< > $@ + +uapi/linux/ldiskfs%.h: linux-stage/include/uapi/linux/ext4%.h + @echo sed $(strip $(ldiskfs_sed_flags)) $< '=>' $@ sed $(strip $(ldiskfs_sed_flags)) $< > $@ series := @top_srcdir@/ldiskfs/kernel_patches/series/ldiskfs-$(LDISKFS_SERIES) patches := @top_srcdir@/ldiskfs/kernel_patches/patches -sources: $(backfs_sources) $(backfs_headers) $(linux_headers) $(series) $(trace_headers) - rm -rf linux-stage linux sources trace $(ldiskfs_SOURCES) +sources: $(backfs_sources) $(backfs_headers) $(linux_headers) $(uapi_linux_headers) $(series) $(trace_headers) + rm -rf linux-stage uapi linux sources trace $(ldiskfs_SOURCES) mkdir -p linux-stage/fs/ext4 linux-stage/include/linux \ + linux-stage/include/uapi/linux \ linux-stage/include/trace/events cp $(backfs_sources) $(backfs_headers) $(backfs_extra) linux-stage/fs/ext4 if test -n 
"$(linux_headers)" ; then \ cp $(linux_headers) linux-stage/include/linux; \ fi + if test -n "$(uapi_linux_headers)" ; then \ + cp $(uapi_linux_headers) linux-stage/include/uapi/linux; \ + fi if test -n "$(trace_headers)" ; then \ cp $(trace_headers) linux-stage/include/trace/events; \ fi @@ -57,7 +69,7 @@ else done @echo endif - mkdir -p linux trace/events + mkdir -p uapi/linux linux trace/events @echo -n "Replacing 'ext4' with 'ldiskfs':" for i in $(notdir $(backfs_headers) $(backfs_sources)) $(new_sources) ; do \ [ -f linux-stage/fs/ext4/$$i ] || continue; \ @@ -71,6 +83,12 @@ endif mv ext4$$i ldiskfs$$i ; \ fi ; \ done + for i in $(subst ext4,,$(notdir $(uapi_linux_headers))) ; do \ + echo -n " ext4$$i" ; \ + sed $(strip $(ldiskfs_sed_flags)) \ + linux-stage/include/uapi/linux/ext4$$i \ + > uapi/linux/ldiskfs$$i ; \ + done for i in $(subst ext4,,$(notdir $(linux_headers) $(new_headers))) ; do \ echo -n " ext4$$i" ; \ sed $(strip $(ldiskfs_sed_flags)) \ @@ -86,10 +104,9 @@ endif for i in $(notdir $(linux_new_headers)) ; do \ echo -n " $$i"; \ sed $(strip $(ldiskfs_sed_flags)) \ - linux-stage/include/linux/$$i \ - > linux/$$i ; \ + linux-stage/include/linux/$$i \ + > linux/$$i ; \ done - @echo touch sources diff --git a/ldiskfs/kernel_patches/patches/linux-6.2/ext4-attach-jinode-in-writepages.patch b/ldiskfs/kernel_patches/patches/linux-6.2/ext4-attach-jinode-in-writepages.patch new file mode 100644 index 0000000..099e4bd --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.2/ext4-attach-jinode-in-writepages.patch @@ -0,0 +1,68 @@ +commit 66153d87190a3547099446d222f36114d3eeffad +Author: Yang Sheng +AuthorDate: Tue Jan 24 03:31:27 2017 +0800 +Subject: LU-9031 osd: handle jinode change for ldiskfs + +We need take care of jinode for ldiskfs. Since we +didn't got inode from syscall like sys_open(). So +have to initailize it in OSD by ourselves. 
+ +Signed-off-by: Yang Sheng +Change-Id: Iec6db290c3779a8f7c98e5d1356b71fd928d7c88 +Reviewed-on: https://review.whamcloud.com/24941 +Reviewed-by: Andreas Dilger +Reviewed-by: Bob Glossman +--- + fs/ext4/ext4.h | 1 + + fs/ext4/inode.c | 9 ++++++++- + 2 files changed, 9 insertions(+), 1 deletion(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 4450196..8743d31 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3120,6 +3120,7 @@ extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, + int len, int state); + + /* inode.c */ ++#define HAVE_LDISKFS_INFO_JINODE + void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei); + int ext4_inode_is_fast_symlink(struct inode *inode); +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 400a6c7..1cd709a 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -746,6 +746,10 @@ out_sem: + (loff_t)map->m_lblk << inode->i_blkbits; + loff_t length = (loff_t)map->m_len << inode->i_blkbits; + ++ ret = ext4_inode_attach_jinode(inode); ++ if (ret) ++ return ret; ++ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode, + start_byte, length); +@@ -2999,7 +3003,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + mpd->first_page = wbc->range_start >> PAGE_SHIFT; + mpd->last_page = wbc->range_end >> PAGE_SHIFT; + } +- ++ ret = ext4_inode_attach_jinode(inode); ++ if (ret) ++ goto out_writepages; + ext4_io_submit_init(&mpd->io_submit, wbc); + retry: + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) +@@ -4513,6 +4519,7 @@ int ext4_inode_attach_jinode(struct inode *inode) + jbd2_free_inode(jinode); + return 0; + } ++EXPORT_SYMBOL(ext4_inode_attach_jinode); + + /* + * ext4_truncate() +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.5/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-data-in-dirent.patch new file mode 100644 index 0000000..d031553 --- /dev/null +++ 
b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-data-in-dirent.patch @@ -0,0 +1,849 @@ + commit 2db3b2b33ee796f4ea61316773452d936303ad27 + Author: Pravin Shelar + AuthorDate: Sun Oct 4 18:13:14 2009 +0000 + Subject: ext4: add ext4-data-in-dirent patch + + Allows ext4 to store extra data records inside the ext4_dirent + along with the regular directory entry (type, length, filename). + Data is stored in ext4 dirent after filename, with a bit flag in + de->file_type to indicate if any record after de->name is used. + Each in-use record is variable length and must store a 1-byte + length (including the length byte itself) at the start so that it + can be skipped if the record type is unknown/unneeded. The record + starts after a NUL byte terminator for the filename. This extra + space is accounted in de->rec_len but not de->name_len. + + Flag EXT4_DIRENT_LUFID is used for a 128-bit file identifier. + Make use of dentry->d_fsdata to pass LUFID to ext4, so no changes + in ext4_add_entry() interface are required. 
+ Bugzilla-ID: b=17670 + Signed-off-by: Pravin Shelar + Reviewed-by: Huang Hua + Signed-off-by: Andreas Dilger +--- + fs/ext4/dir.c | 9 +- + fs/ext4/ext4.h | 107 ++++++++++++++- + fs/ext4/fast_commit.c | 2 +- + fs/ext4/inline.c | 8 +- + fs/ext4/namei.c | 295 ++++++++++++++++++++++++++++++++++++------ + fs/ext4/super.c | 4 +- + 6 files changed, 369 insertions(+), 56 deletions(-) + +diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c +index 3985f8c3..b8e4df14 100644 +--- a/fs/ext4/dir.c ++++ b/fs/ext4/dir.c +@@ -465,12 +465,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 0; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + ent_name->len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -479,7 +484,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, ent_name->name, ent_name->len); ++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); + + while (*p) { + parent = *p; +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index fb78a390..3f72f8c9 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1155,6 +1155,7 @@ struct ext4_inode_info { + __u32 i_csum_seed; + + kprojid_t i_projid; ++ void *i_dirdata; + }; + + /* +@@ -1176,6 +1177,7 @@ struct ext4_inode_info { + * Mount flags set via mount options or defaults + */ + #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ ++#define EXT4_MOUNT_DIRDATA 0x00002 /* Data in directory entries */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with 
directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +@@ -2171,6 +2173,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ +@@ -2382,6 +2385,42 @@ struct ext4_dir_entry_tail { + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. 
++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __packed; ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param *p) ++{ ++ if (!ext4_has_feature_dirdata(sb)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + #define EXT4_FT_DIR_CSUM 0xDE + +@@ -2393,6 +2432,17 @@ struct ext4_dir_entry_tail { + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) + #define EXT4_MAX_REC_LEN ((1<<16)-1) ++#define EXT4_DIR_REC_LEN_(name_len, i_dir) \ ++ ext4_dir_rec_len((name_len), (i_dir)) ++#define EXT4_DIR_ENTRY_LEN_(de, i_dir) \ ++ (EXT4_DIR_REC_LEN_((de)->name_len + ext4_get_dirent_data_len(de), \ ++ (i_dir))) ++/* ldiskfs */ ++#define EXT4_DIR_REC_LEN(name_len, i_dir) EXT4_DIR_REC_LEN_((name_len), (i_dir)) ++#define EXT4_DIR_ENTRY_LEN(de, i_dir) EXT4_DIR_ENTRY_LEN_((de), (i_dir)) ++/* lustre osd_handler compat -- ifdef LDISKFS_DIR_REC_LEN_WITH_DIR */ ++#define EXT4_DIR_REC_LEN_WITH_DIR 1 ++#define __EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len), NULL) + + /* + * The rec_len is dependent on the type of directory. Directories that are +@@ -2400,10 +2450,10 @@ struct ext4_dir_entry_tail { + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. 
+ */ +-static inline unsigned int ext4_dir_rec_len(__u8 name_len, ++static inline unsigned int ext4_dir_rec_len(__u32 name_len, + const struct inode *dir) + { +- int rec_len = (name_len + 8 + EXT4_DIR_ROUND); ++ __u32 rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); +@@ -2825,11 +2875,13 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de); ++ struct ext4_dir_entry_2 **dest_de, ++ int dlen); + void ext4_insert_dentry(struct inode *dir, struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname); ++ struct ext4_filename *fname, ++ void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { + if (!ext4_has_feature_dir_index(inode->i_sb) && +@@ -2845,10 +2897,17 @@ static const unsigned char ext4_filetype_table[] = { + + static inline unsigned char get_dtype(struct super_block *sb, int filetype) + { +- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) ++ int fl_index = filetype & EXT4_FT_MASK; ++ ++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) + return DT_UNKNOWN; + +- return ext4_filetype_table[filetype]; ++ if (!test_opt(sb, DIRDATA)) ++ return ext4_filetype_table[fl_index]; ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); +@@ -3055,9 +3114,13 @@ extern int ext4_ind_migrate(struct inode *inode); + + /* namei.c */ + extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode); ++ struct inode *inode, ++ const void *data1, const void *data2); + extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode 
*dir, ++ struct inode *inode, ++ const void *data1, const void *data2); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern struct inode *ext4_create_inode(handle_t *handle, +@@ -3861,6 +3924,36 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) + return buffer_uptodate(bh); + } + ++/* ++ * Compute the total directory entry data length. ++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. ++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de; ++ ++ if (!t->det_reserved_zero1 && ++ le16_to_cpu(t->det_rec_len) == ++ sizeof(struct ext4_dir_entry_tail) && ++ !t->det_reserved_zero2 && ++ t->det_reserved_ft == EXT4_FT_DIR_CSUM) ++ return 0; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ +diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c +index b06de728..332a0925 100644 +--- a/fs/ext4/fast_commit.c ++++ b/fs/ext4/fast_commit.c +@@ -1653,7 +1653,7 @@ static int ext4_fc_replay_create(struct super_block *sb, + ext4_debug("Dir %d not found.", darg.ino); + goto out; + } +- ret = ext4_init_new_dir(NULL, dir, inode); ++ ret = ext4_init_new_dir(NULL, dir, inode, NULL, NULL); + iput(dir); + if (ret) { + ret = 0; +diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c +index 012d9259..b86441a6 100644 +--- a/fs/ext4/inline.c ++++ b/fs/ext4/inline.c +@@ -1013,7 
+1013,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, +- inline_size, fname, &de); ++ inline_size, fname, &de, 0); + if (err) + return err; + +@@ -1022,7 +1022,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, + EXT4_JTR_NONE); + if (err) + return err; +- ext4_insert_dentry(dir, inode, de, inline_size, fname); ++ ext4_insert_dentry(dir, inode, de, inline_size, fname, NULL); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + +@@ -1381,7 +1381,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(fake.name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(&fake, NULL), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +@@ -1391,7 +1391,7 @@ int ext4_inlinedir_to_tree(struct file *dir_file, + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(fake.name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(&fake, NULL), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 8fafbe65..38858511 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -290,7 +290,8 @@ static unsigned dx_get_count(struct dx_entry *entries); + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, +@@ -437,23 +438,24 @@ static struct dx_countlimit *get_dx_countlimit(struct 
inode *inode, + { + struct ext4_dir_entry *dp; + struct dx_root_info *root; +- int count_offset; ++ int count_offset, dot_rec_len, dotdot_rec_len; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); + + if (rlen == blocksize) + count_offset = 8; +- else if (rlen == 12) { +- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); +- if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) ++ else { ++ dot_rec_len = le16_to_cpu(dirent->rec_len); ++ dp = (struct ext4_dir_entry *)(((void *)dirent) + dot_rec_len); ++ if (le16_to_cpu(dp->rec_len) != (blocksize - dot_rec_len)) + return NULL; +- root = (struct dx_root_info *)(((void *)dp + 12)); ++ dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct ext4_dir_entry_2 *)dp, NULL); ++ root = (struct dx_root_info *)(((void *)dp + dotdot_rec_len)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; +- count_offset = 32; +- } else +- return NULL; ++ count_offset = 8 + dot_rec_len + dotdot_rec_len; ++ } + + if (offset) + *offset = count_offset; +@@ -558,11 +560,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize) + */ + struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) + { ++ BUG_ON(de->name_len != 1); + /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de, NULL)); + + /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de, NULL)); + + return (struct dx_root_info *)de; + } +@@ -607,11 +610,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned 
dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned int entry_space = dir->i_sb->s_blocksize - +- ext4_dir_rec_len(1, NULL) - +- ext4_dir_rec_len(2, NULL) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); ++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_ENTRY_LEN(dot_de, NULL) - ++ EXT4_DIR_ENTRY_LEN(dotdot_de, NULL) - infosize; + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +@@ -731,7 +739,7 @@ static struct stats dx_show_leaf(struct inode *dir, + (unsigned) ((char *) de - base)); + #endif + } +- space += ext4_dir_rec_len(de->name_len, dir); ++ space += EXT4_DIR_ENTRY_LEN(de, dir); + names++; + } + de = ext4_next_entry(de, size); +@@ -887,11 +895,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + + entries = (struct dx_entry *)(((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, +- info->info_length)) { ++ if (dx_get_limit(entries) != ++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, info->info_length)); ++ dx_root_limit(dir, ++ (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)); + goto fail; + } + +@@ -1949,7 +1960,7 @@ dx_move_dirents(struct inode *dir, char *from, char *to, + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = ext4_dir_rec_len(de->name_len, dir); ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); + + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = +@@ -1982,7 +1993,7 @@ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + 
if (de->inode && de->name_len) { +- rec_len = ext4_dir_rec_len(de->name_len, dir); ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -2125,10 +2136,11 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de) ++ struct ext4_dir_entry_2 **dest_de, ++ int dlen) + { + struct ext4_dir_entry_2 *de; +- unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); ++ unsigned short reclen = ext4_dir_rec_len(fname_len(fname) + dlen, dir); + int nlen, rlen; + unsigned int offset = 0; + char *top; +@@ -2141,7 +2153,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, + return -EFSCORRUPTED; + if (ext4_match(dir, fname, de)) + return -EEXIST; +- nlen = ext4_dir_rec_len(de->name_len, dir); ++ nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? 
rlen - nlen : rlen) >= reclen) + break; +@@ -2159,12 +2171,13 @@ void ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname) ++ struct ext4_filename *fname, ++ void *data) + { + + int nlen, rlen; + +- nlen = ext4_dir_rec_len(de->name_len, dir); ++ nlen = EXT4_DIR_ENTRY_LEN(de, dir); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = +@@ -2185,6 +2198,12 @@ void ext4_insert_dentry(struct inode *dir, + EXT4_DIRENT_HASHES(de)->minor_hash = + cpu_to_le32(hinfo->minor_hash); + } ++ if (data) { ++ de->name[fname_len(fname)] = 0; ++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + } + + /* +@@ -2202,14 +2221,19 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + { + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; +- int err, err2; ++ int err, err2, dlen = 0; ++ unsigned char *data; + ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ EXT4_I(inode)->i_dirdata); + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { ++ if (data) ++ dlen = (*data) + 1; + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, +- blocksize - csum_size, fname, &de); ++ blocksize - csum_size, fname, &de, dlen); + if (err) + return err; + } +@@ -2222,7 +2246,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + } + + /* By now the buffer is marked for journaling */ +- ext4_insert_dentry(dir, inode, de, blocksize, fname); ++ ext4_insert_dentry(dir, inode, de, blocksize, fname, data); + + /* + * XXX shouldn't update any times until successful +@@ -2339,7 +2363,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + entries = (void *)dx_info + sizeof(*dx_info); + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- 
dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = dx_info->hash_version; +@@ -2390,7 +2415,106 @@ out_frames: + return retval; + } + +-/* update ".." entry */ ++static int ext4_expand_dotdot(struct inode *dir, ++ struct buffer_head *bh, ++ int dlen) ++{ ++ struct ext4_dir_entry_2 *dot_de; ++ struct ext4_dir_entry_2 *dotdot_de; ++ int len; ++ unsigned blocksize = dir->i_sb->s_blocksize; ++ ++ dot_de = (struct ext4_dir_entry_2 *)bh->b_data; ++ dotdot_de = ext4_next_entry(dot_de, blocksize); ++ ++ if (is_dx(dir)) { ++ struct dx_entry *entries; ++ struct dx_root_info *dx_info; ++ int limit, count; ++ int entry_space; ++ ++ len = EXT4_DIR_REC_LEN(2 + dlen, NULL) - ++ EXT4_DIR_ENTRY_LEN(dotdot_de, NULL); ++ ++ dx_info = dx_get_dx_info(dot_de); ++ entries = (struct dx_entry *)((char *)dx_info + ++ sizeof(*dx_info)); ++ count = dx_get_count(entries); ++ ++ /* ++ * figure out new limit with dlen, ++ * check if we have enough space ++ */ ++ entry_space = blocksize; ++ entry_space -= (char *)dotdot_de - (char *)dot_de + ++ EXT4_DIR_REC_LEN(2 + dlen, NULL) + ++ sizeof(*dx_info); ++ if (ext4_has_metadata_csum(dir->i_sb)) ++ entry_space -= sizeof(struct dx_tail); ++ limit = entry_space / sizeof(struct dx_entry); ++ if (count > limit) ++ return -ENOSPC; ++ ++ /* set the new limit, move dx_info and the entries */ ++ dx_set_limit(entries, limit); ++ memmove((char *)dx_info + len, dx_info, ++ sizeof(*dx_info) + count * sizeof(struct dx_entry)); ++ } else { ++ struct ext4_dir_entry_2 *next, *to, *prev, *de; ++ char *top = (char *)bh->b_data + blocksize; ++ int space = 0; ++ unsigned rec_len = 0; ++ ++ len = EXT4_DIR_REC_LEN(2 + dlen, NULL) - ++ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize); ++ ++ if (ext4_has_metadata_csum(dir->i_sb)) ++ top -= sizeof(struct ext4_dir_entry_tail); ++ ++ de = ext4_next_entry(dotdot_de, 
blocksize); ++ while ((char *)de < top) { ++ space += ext4_rec_len_from_disk(de->rec_len, blocksize) - ++ EXT4_DIR_ENTRY_LEN(de, dir); ++ de = ext4_next_entry(de, blocksize); ++ } ++ ++ if (space < len) ++ return -ENOSPC; ++ ++ /* pack all the entries after dotdot */ ++ de = ext4_next_entry(dotdot_de, blocksize); ++ prev = to = de; ++ while ((char *)de < top) { ++ next = ext4_next_entry(de, blocksize); ++ if (de->inode && de->name_len) { ++ rec_len = EXT4_DIR_ENTRY_LEN(de, dir); ++ if (de > to) ++ memmove(to, de, rec_len); ++ to->rec_len = ext4_rec_len_to_disk(rec_len, ++ blocksize); ++ prev = to; ++ to = (struct ext4_dir_entry_2 *) ++ (((char *)to) + rec_len); ++ } ++ de = next; ++ } ++ /* fix up rec_len for the last entry */ ++ prev->rec_len = ext4_rec_len_to_disk(top - (char *)prev - len, ++ blocksize); ++ /* move all the entries after dotdot to make space */ ++ de = ext4_next_entry(dotdot_de, blocksize); ++ memmove((char *)de + len, de, (char *)prev - (char *)de + ++ EXT4_DIR_ENTRY_LEN(prev, dir)); ++ /* fix the rec_len for dotdot */ ++ dotdot_de->rec_len = ext4_rec_len_to_disk( ++ EXT4_DIR_REC_LEN(2 + dlen, NULL), ++ blocksize); ++ } ++ ++ return 0; ++} ++ ++/* update ".." 
entry, try to expand the entry if necessary */ + static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + struct inode *inode) + { +@@ -2399,6 +2523,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + struct ext4_dir_entry_2 *dot_de, *dotdot_de; + unsigned int offset; + int retval = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2439,6 +2565,30 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry, + + dotdot_de->inode = cpu_to_le32(inode->i_ino); + ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *)dentry->d_fsdata); ++ if (data != NULL) { ++ dlen = *data + 1; ++ if (is_dx(dir)) { ++ if (ext4_get_dirent_data_len(dotdot_de) < dlen) { ++ if (ext4_expand_dotdot(dir, bh, dlen) < 0) ++ dlen = 0; ++ } ++ } else { ++ if (ext4_rec_len_from_disk(dotdot_de->rec_len, ++ dir->i_sb->s_blocksize) < ++ EXT4_DIR_REC_LEN(2 + dlen, NULL)) { ++ if (ext4_expand_dotdot(dir, bh, dlen) < 0) ++ dlen = 0; ++ } ++ } ++ } ++ if (dlen) { ++ dotdot_de->name[2] = 0; ++ memcpy(&dotdot_de->name[2 + 1], data, *data); ++ dotdot_de->file_type |= LDISKFS_DIRENT_LUFID; ++ } ++ + ext4_mark_inode_dirty(handle, dir); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + if (is_dx(dir)) { +@@ -2476,6 +2626,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + ext4_lblk_t block, blocks; + int csum_size = 0; + ++ EXT4_I(inode)->i_dirdata = dentry->d_fsdata; + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + +@@ -3049,38 +3200,73 @@ err_unlock_inode: + return err; + } + ++struct tp_block { ++ struct inode *inode; ++ void *data1; ++ void *data2; ++}; ++ + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) + { ++ void *data1 = NULL, *data2 = NULL; ++ int dot_reclen = 0; ++ ++ if 
(dotdot_real_len == 10) { ++ struct tp_block *tpb = (struct tp_block *)inode; ++ data1 = tpb->data1; ++ data2 = tpb->data2; ++ inode = tpb->inode; ++ dotdot_real_len = 0; ++ } + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + ++ /* get packed fid data*/ ++ data1 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de, NULL)); ++ ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; ++ ++ strcpy(de->name, ".."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - +- (csum_size + ext4_dir_rec_len(1, NULL)), +- blocksize); ++ (csum_size + dot_reclen), blocksize); + else + de->rec_len = ext4_rec_len_to_disk( +- ext4_dir_rec_len(de->name_len, NULL), ++ EXT4_DIR_ENTRY_LEN(de, NULL), + blocksize); +- strcpy(de->name, ".."); +- ext4_set_de_type(inode->i_sb, de, S_IFDIR); + + return ext4_next_entry(de, blocksize); + } + + int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { ++ struct tp_block param; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + ext4_lblk_t block = 0; +@@ -3104,7 +3290,11 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct 
ext4_dir_entry_2 *)dir_block->b_data; +- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); ++ param.inode = inode; ++ param.data1 = (void *)data1; ++ param.data2 = (void *)data2; ++ ext4_init_dot_dotdot((struct inode *)(&param), de, blocksize, ++ csum_size, dir->i_ino, 10); + set_nlink(inode, 2); + if (csum_size) + ext4_initialize_dirent_tail(dir_block, blocksize); +@@ -3146,7 +3336,7 @@ retry: + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; +- err = ext4_init_new_dir(handle, dir, inode); ++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); +@@ -3184,6 +3374,29 @@ out_retry: + return err; + } + ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. */ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, ++ const void *data1, const void *data2) ++{ ++ int rc; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ ext4_handle_sync(handle); ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); ++ if (!rc) ++ rc = ext4_mark_inode_dirty(handle, inode); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ + /* + * routine to check that the specified directory is empty (for rmdir) + */ +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 2c059117..8510a9b5 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1665,7 +1665,7 @@ enum { + Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, + Opt_inlinecrypt, + Opt_usrjquota, Opt_grpjquota, Opt_quota, +- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, ++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata, + Opt_usrquota, Opt_grpquota, Opt_prjquota, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, + Opt_stripe,
Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, +@@ -1779,6 +1779,7 @@ static const struct fs_parameter_spec ext4_param_specs[] = { + fsparam_u32 ("stripe", Opt_stripe), + fsparam_flag ("delalloc", Opt_delalloc), + fsparam_flag ("nodelalloc", Opt_nodelalloc), ++ fsparam_flag ("dirdata", Opt_dirdata), + fsparam_flag ("warn_on_error", Opt_warn_on_error), + fsparam_flag ("nowarn_on_error", Opt_nowarn_on_error), + fsparam_u32 ("debug_want_extra_isize", +@@ -1908,6 +1909,7 @@ static const struct mount_opts { + MOPT_CLEAR | MOPT_Q}, + {Opt_usrjquota, 0, MOPT_Q}, + {Opt_grpjquota, 0, MOPT_Q}, ++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET}, + {Opt_jqfmt, 0, MOPT_QFMT}, + {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, + {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.5/ext4-dont-check-before-replay.patch b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-dont-check-before-replay.patch new file mode 100644 index 0000000..8594bec --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-dont-check-before-replay.patch @@ -0,0 +1,40 @@ +commit a70b020e5b2f1bbe3b759232852beaac4f0852b5 +Author: Lokesh Nagappa Jaliminche +AuthorDate: Fri Nov 25 16:17:09 2016 +0530 +LU-8364 ext4: fixes for failover mode. + +When ext4 runs in failover mode with read-only disk, +it may lose part of allocation updates and fail while +mounting fs due to group descriptor checks before journal +replay not being valid after journal replay is complete. +Don't produce panics with on disk checks in read-only mode. 
+ +Seagate-bug-id: MRP-797 +Change-Id: I54bee3a0aeb9a15f5ee2a79f7a2a2a905f19af1a +Signed-off-by: Alexey Lyashkov +Signed-off-by: Lokesh Nagappa Jaliminche +Reviewed-on: https://review.whamcloud.com/21141 +--- + fs/ext4/super.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 734c88b..7459777 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -5391,6 +5391,12 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + needs_recovery = 0; + } + ++ if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ++ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ++ err = -EFSCORRUPTED; ++ goto failed_mount3a; ++ } ++ + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.5/ext4-mballoc-pa-free-mismatch.patch b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-mballoc-pa-free-mismatch.patch new file mode 100644 index 0000000..d0ddd27 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-mballoc-pa-free-mismatch.patch @@ -0,0 +1,119 @@ +commit 2d3aaef4122c11dcb6d892da89522ffa37036136 +Author: Fan Yong +AuthorDate: Thu Feb 25 00:32:12 2010 -0800 +Subject: ext4: diagnostic patch to verify lustre read-only device mechanism + +Diagnostic patch to check whether lustre read-only device mechanism works well or not. 
+Signed-off-by: Fan Yong +Reviewed-by: Alex Zhuravlev +Reviewed-by: Rahul Deshmukh +--- + fs/ext4/mballoc.c | 43 +++++++++++++++++++++++++++++++++++++------ + fs/ext4/mballoc.h | 1 + + 2 files changed, 38 insertions(+), 6 deletions(-) + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 839cf8f..540c6c1 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -5042,6 +5042,7 @@ adjust_bex: + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_INODE_PA; ++ pa->pa_error = 0; + + mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); +@@ -5093,6 +5094,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) + INIT_LIST_HEAD(&pa->pa_group_list); + pa->pa_deleted = 0; + pa->pa_type = MB_GROUP_PA; ++ pa->pa_error = 0; + + mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); +@@ -5146,7 +5148,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + unsigned long long grp_blk_start; + int free = 0; + ++ assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); + BUG_ON(pa->pa_deleted == 0); ++ BUG_ON(pa->pa_inode == NULL); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); + BUG_ON(group != e4b->bd_group && pa->pa_len != 0); +@@ -5169,12 +5173,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); + bit = next + 1; + } +- if (free != pa->pa_free) { +- ext4_msg(e4b->bd_sb, KERN_CRIT, +- "pa %p: logic %lu, phys. 
%lu, len %d", +- pa, (unsigned long) pa->pa_lstart, +- (unsigned long) pa->pa_pstart, +- pa->pa_len); ++ ++ /* "free < pa->pa_free" means we maybe double alloc the same blocks, ++ * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ ++ if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { ++ ext4_error(sb, "pa free mismatch: [pa %p] " ++ "[phy %lu] [logic %lu] [len %u] [free %u] " ++ "[error %u] [inode %d] [freed %u]", pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ pa->pa_len, (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, pa->pa_inode->i_ino, ++ free); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", + free, pa->pa_free); + /* +@@ -5182,6 +5192,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + * from the bitmap and continue. + */ + } ++ /* do not verify if the file system is being umounted */ ++ BUG_ON(atomic_read(&sb->s_active) > 0 && pa->pa_free != free); + atomic_add(free, &sbi->s_mb_discarded); + + return 0; +@@ -6023,6 +6035,25 @@ errout: + ac->ac_b_ex.fe_len = 0; + ar->len = 0; + ext4_mb_show_ac(ac); ++ if (ac->ac_pa) { ++ struct ext4_prealloc_space *pa = ac->ac_pa; ++ ++ /* We can not make sure whether the bitmap has ++ * been updated or not when fail case. So can ++ * not revert pa_free back, just mark pa_error*/ ++ pa->pa_error++; ++ ext4_error(sb, ++ "Updating bitmap error: [err %d] " ++ "[pa %p] [phy %lu] [logic %lu] " ++ "[len %u] [free %u] [error %u] " ++ "[inode %lu]", *errp, pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, ++ (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, ++ pa->pa_inode ? 
pa->pa_inode->i_ino : 0); ++ } + } + ext4_mb_release_context(ac); + kmem_cache_free(ext4_ac_cachep, ac); +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index 74b25d6..fb5a2c6 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -126,6 +126,7 @@ struct ext4_prealloc_space { + ext4_grpblk_t pa_len; /* len of preallocated chunk */ + ext4_grpblk_t pa_free; /* how many blocks are free */ + unsigned short pa_type; /* pa type. inode or group */ ++ unsigned short pa_error; /* error count */ + union { + rwlock_t *inode_lock; /* locks the rbtree holding this PA */ + spinlock_t *lg_lock; /* locks the lg list holding this PA */ +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.5/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-pdirop.patch new file mode 100644 index 0000000..208d9dc --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-pdirop.patch @@ -0,0 +1,943 @@ +LU-50 ldiskfs: pdirops patch for ldiskfs + +Single directory performance is critical for HPC workloads. In a +typical use case an application creates a separate output file for +each node and task in a job. As nodes and tasks increase, hundreds +of thousands of files may be created in a single directory within +a short window of time. +Today, both filename lookup and file system modifying operations +(such as create and unlink) are protected with a single lock for +an entire ldiskfs directory. PDO project will remove this +bottleneck by introducing a parallel locking mechanism for entire +ldiskfs directories. This work will enable multiple application +threads to simultaneously lookup, create and unlink in parallel. 
+ +This patch contains: + - pdirops support for ldiskfs + - N-level htree directory + - integrate with osd-ldiskfs + +Signed-off-by: Liang Zhen +Change-Id: I269c0e3112e68f3acd79e860dab052a68c7d7aaa +Reviewed-on: http://review.whamcloud.com/375 +Reviewed-by: Andreas Dilger +--- + fs/ext4/Makefile | 1 + + fs/ext4/ext4.h | 78 ++++++++ + fs/ext4/namei.c | 467 ++++++++++++++++++++++++++++++++++++++++++----- + fs/ext4/super.c | 1 + + 4 files changed, 505 insertions(+), 42 deletions(-) + +diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile +index 72206a29..1d15a3af 100644 +--- a/fs/ext4/Makefile ++++ b/fs/ext4/Makefile +@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ ++ htree_lock.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index b7999363..f9757e9e 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1001,6 +1002,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +@@ -2564,6 +2568,72 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define 
ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *ext4_find_entry_locked(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); ++ + struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; +@@ -2891,12 +2961,20 @@ void ext4_insert_dentry(struct inode *dir, struct inode *inode, + void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... 
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } ++#endif + } + static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 17e95435..98d27781 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -56,6 +56,7 @@ struct buffer_head *ext4_append(handle_t *handle, + { + struct ext4_map_blocks map; + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -63,6 +64,10 @@ struct buffer_head *ext4_append(handle_t *handle, + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + map.m_lblk = *block; + map.m_len = 1; +@@ -74,15 +79,18 @@ struct buffer_head *ext4_append(handle_t *handle, + */ + err = ext4_map_blocks(NULL, inode, &map, 0); + if (err < 0) +- return ERR_PTR(err); ++ goto err_unlock; + if (err) { + EXT4_ERROR_INODE(inode, "Logical block already allocated"); +- return ERR_PTR(-EFSCORRUPTED); ++ err = -EFSCORRUPTED; ++ goto err_unlock; + } + + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); +- if (IS_ERR(bh)) ++ if (IS_ERR(bh)) { ++ up(&ei->i_append_sem); + return bh; ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_mark_inode_dirty(handle, inode); +@@ -93,11 +101,14 @@ struct buffer_head *ext4_append(handle_t *handle, + EXT4_JTR_NONE); + if (err) + goto out; ++ up(&ei->i_append_sem); + return bh; + + out: + brelse(bh); + ext4_std_error(inode->i_sb, err); ++err_unlock: ++ up(&ei->i_append_sem); + return ERR_PTR(err); + } + +@@ -296,7 +307,8 @@ static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame 
*dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame); ++ struct dx_frame *frame, ++ struct htree_lock *lck); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, +@@ -312,12 +324,13 @@ static void dx_insert_block(struct dx_frame *frame, + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir); ++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode); ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck); + + /* checksumming functions */ + void ext4_initialize_dirent_tail(struct buffer_head *bh, +@@ -806,6 +819,227 @@ static inline void htree_rep_invariant_check(struct dx_entry *at, + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ ext4_find_entry_locked(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ ext4_add_entry_locked(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ 
u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. 
*/ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call mutiple times or even it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ 
ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
+ * +@@ -817,10 +1051,11 @@ static inline void htree_rep_invariant_check(struct dx_entry *at, + */ + static struct dx_frame * + dx_probe(struct ext4_filename *fname, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck) + { + unsigned count, indirect, level, i; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); +@@ -910,8 +1145,16 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + level = 0; + blocks[0] = 0; + while (1) { ++ if (indirect == level) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation ++ */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); +@@ -938,6 +1181,74 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + frame->entries = entries; + frame->at = at; + ++ if (indirect == level) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return ++ */ ++ 
ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to abtain DX-lock ++ */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it ++ */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block ++ */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block ++ */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ + block = dx_get_block(at); + for (i = 0; i <= level; i++) { + if (blocks[i] == block) { +@@ -947,8 +1258,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + goto fail; + } + } +- if (++level > indirect) +- return frame; ++ ++level; + blocks[level] = block; + frame++; + frame->bh = ext4_read_dirblock(dir, block, INDEX); +@@ -1019,7 +1329,7 @@ static void dx_release(struct dx_frame *frames) + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -1034,12 
+1344,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -1062,6 +1382,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -1070,6 +1397,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1236,10 +1564,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); + if (IS_ERR(frame)) + return PTR_ERR(frame); +- + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1279,7 +1607,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1604,7 +1932,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + static struct buffer_head *__ext4_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1645,7 +1973,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, + goto restart; + } + if (is_dx(dir)) { +- ret = ext4_dx_find_entry(dir, fname, res_dir); ++ ret = ext4_dx_find_entry(dir, fname, res_dir, lck); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -1655,6 +1983,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, + goto cleanup_and_exit; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + ret = NULL; + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); +@@ -1745,10 +2074,10 @@ cleanup_and_exit: + return ret; + } + +-static struct buffer_head *ext4_find_entry(struct inode *dir, ++struct buffer_head *ext4_find_entry_locked(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + int err; + struct ext4_filename fname; +@@ -1760,12 +2089,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir, + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ++ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); + + ext4_fname_free_filename(&fname); + return bh; + } + ++EXPORT_SYMBOL(ext4_find_entry_locked); ++ + static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct dentry *dentry, + struct ext4_dir_entry_2 **res_dir) +@@ -1781,7 +2112,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ++ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); + + ext4_fname_free_filename(&fname); + return bh; +@@ -1789,7 +2120,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block * sb = dir->i_sb; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; +@@ -1800,7 +2132,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + #ifdef CONFIG_FS_ENCRYPTION + *res_dir = NULL; + #endif +- 
frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; + do { +@@ -1822,7 +2154,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning_inode(dir, + "error %d reading directory index block", +@@ -2011,8 +2343,9 @@ static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned continued; +@@ -2089,8 +2422,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, +- blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(dir, data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(dir, data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(dir, data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -2108,12 +2447,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + +- /* Which block gets the new entry? 
*/ +- if (hinfo->hash >= hash2) { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirblock(handle, dir, bh2); + if (err) + goto journal_error; +@@ -2395,7 +2743,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + if (retval) + goto out_frames; + +- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto out_frames; +@@ -2612,8 +2960,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; +@@ -2663,9 +3011,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + return ext4_update_dotdot(handle, dentry, inode); + + if (is_dx(dir)) { +- retval = ext4_dx_add_entry(handle, &fname, dir, inode); ++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + /* Can we just ignore htree data? 
*/ + if (ext4_has_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, +@@ -2728,12 +3077,14 @@ out: + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(ext4_add_entry_locked); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode) ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2745,7 +3096,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + + again: + restart = 0; +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; +@@ -2780,6 +3131,12 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } ++ + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2883,8 +3240,32 @@ again: + restart = 1; + goto journal_error; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count for twice, the first time before ++ * having DX lock, the second time after holding DX lock. ++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. 
*/ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; +@@ -2895,6 +3276,8 @@ again: + journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 8510a9b5..1212e0b3 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1403,6 +1403,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + return NULL; + + inode_set_iversion(&ei->vfs_inode, 1); ++ sema_init(&ei->i_append_sem, 1); + ei->i_flags = 0; + spin_lock_init(&ei->i_raw_lock); + ei->i_prealloc_node = RB_ROOT; +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.5/ext4-prealloc.patch b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-prealloc.patch new file mode 100644 index 0000000..10f378f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.5/ext4-prealloc.patch @@ -0,0 +1,409 @@ +commit d8d8fd9192a54c7b8caef8cca9b7a1eb5e5e3298 +Author: Alex Zhuravlev +AuthorDate: Thu Oct 23 10:02:19 2008 +0000 +Subject: ext4: support for tunable preallocation window +Add support for tunable preallocation window and new tunables +for large/small requests. 
+Bugzilla-ID: b=12800 +Signed-off-by: Alex Zhuravlev +Reviewed-by: Kalpak Shah +Reviewed-by: Andreas Dilger +--- + fs/ext4/ext4.h | 7 +- + fs/ext4/inode.c | 3 + + fs/ext4/mballoc.c | 220 +++++++++++++++++++++++++++++++++++----------- + fs/ext4/sysfs.c | 8 +- + 4 files changed, 182 insertions(+), 56 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index f9a929d9..9e14787b 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1270,6 +1270,8 @@ extern void mb_set_bits(void *bm, int cur, int len); + #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ + #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + ++#define EXT4_MAX_PREALLOC_TABLE 64 ++ + /* + * Behaviour when detecting errors + */ +@@ -1577,11 +1579,13 @@ struct ext4_sb_info { + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; +- unsigned int s_mb_stream_request; ++ unsigned long s_mb_small_req; ++ unsigned long s_mb_large_req; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; ++ unsigned long *s_mb_prealloc_table; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; + /* where last allocation was done - for stream allocation */ +@@ -2896,6 +2900,7 @@ int ext4_fc_record_regions(struct super_block *sb, int ino, + int len, int replay); + + /* mballoc.c */ ++extern const struct proc_ops ext4_seq_prealloc_table_fops; + extern const struct seq_operations ext4_mb_seq_groups_ops; + extern const struct seq_operations ext4_mb_seq_structs_summary_ops; + extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 24e1e488..dbdaf5ca 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -2593,6 +2593,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + PAGE_SIZE >> inode->i_blkbits); + } + ++ if (wbc->nr_to_write < sbi->s_mb_small_req) ++ wbc->nr_to_write = sbi->s_mb_small_req; ++ + if 
(wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 3711be69..48e799a4 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -3234,6 +3234,99 @@ const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .show = ext4_mb_seq_structs_summary_show, + }; + ++static int ext4_mb_check_and_update_prealloc(struct ext4_sb_info *sbi, ++ char *str, size_t cnt, ++ int update) ++{ ++ unsigned long value; ++ unsigned long prev = 0; ++ char *cur; ++ char *next; ++ char *end; ++ int num = 0; ++ ++ cur = str; ++ end = str + cnt; ++ while (cur < end) { ++ while ((cur < end) && (*cur == ' ')) cur++; ++ value = simple_strtol(cur, &next, 0); ++ if (value == 0) ++ break; ++ if (cur == next) ++ return -EINVAL; ++ ++ cur = next; ++ ++ if (value > (sbi->s_blocks_per_group - 1 - 1 - sbi->s_itb_per_group)) ++ return -EINVAL; ++ ++ /* they should add values in order */ ++ if (value <= prev) ++ return -EINVAL; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = value; ++ ++ prev = value; ++ num++; ++ } ++ ++ if (num > EXT4_MAX_PREALLOC_TABLE - 1) ++ return -EOVERFLOW; ++ ++ if (update) ++ sbi->s_mb_prealloc_table[num] = 0; ++ ++ return 0; ++} ++ ++static ssize_t ext4_mb_prealloc_table_proc_write(struct file *file, ++ const char __user *buf, ++ size_t cnt, loff_t *pos) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(pde_data(file_inode(file))); ++ char str[128]; ++ int rc; ++ ++ if (cnt >= sizeof(str)) ++ return -EINVAL; ++ if (copy_from_user(str, buf, cnt)) ++ return -EFAULT; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 0); ++ if (rc) ++ return rc; ++ ++ rc = ext4_mb_check_and_update_prealloc(sbi, str, cnt, 1); ++ return rc ? 
rc : cnt; ++} ++ ++static int mb_prealloc_table_seq_show(struct seq_file *m, void *v) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(m->private); ++ int i; ++ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE && ++ sbi->s_mb_prealloc_table[i] != 0; i++) ++ seq_printf(m, "%ld ", sbi->s_mb_prealloc_table[i]); ++ seq_printf(m, "\n"); ++ ++ return 0; ++} ++ ++static int mb_prealloc_table_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, mb_prealloc_table_seq_show, pde_data(inode)); ++} ++ ++const struct proc_ops ext4_seq_prealloc_table_fops = { ++ .proc_open = mb_prealloc_table_seq_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++ .proc_write = ext4_mb_prealloc_table_proc_write, ++}; ++ + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) + { + int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; +@@ -3550,7 +3643,7 @@ static void ext4_discard_work(struct work_struct *work) + int ext4_mb_init(struct super_block *sb) + { + struct ext4_sb_info *sbi = EXT4_SB(sb); +- unsigned i, j; ++ unsigned i, j, k, l; + unsigned offset, offset_incr; + unsigned max; + int ret; +@@ -3638,7 +3731,6 @@ int ext4_mb_init(struct super_block *sb) + sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; + sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; + sbi->s_mb_stats = MB_DEFAULT_STATS; +- sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; + sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; + sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; + +@@ -3664,9 +3756,29 @@ int ext4_mb_init(struct super_block *sb) + * RAID stripe size so that preallocations don't fragment + * the stripes. 
+ */ +- if (sbi->s_stripe > 1) { +- sbi->s_mb_group_prealloc = roundup( +- sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe)); ++ ++ /* Allocate table once */ ++ sbi->s_mb_prealloc_table = kzalloc( ++ EXT4_MAX_PREALLOC_TABLE * sizeof(unsigned long), GFP_NOFS); ++ if (sbi->s_mb_prealloc_table == NULL) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ if (sbi->s_stripe == 0) { ++ for (k = 0, l = 4; k <= 9; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = 256; ++ sbi->s_mb_large_req = 1024; ++ sbi->s_mb_group_prealloc = 512; ++ } else { ++ for (k = 0, l = sbi->s_stripe; k <= 2; ++k, l *= 2) ++ sbi->s_mb_prealloc_table[k] = l; ++ ++ sbi->s_mb_small_req = sbi->s_stripe; ++ sbi->s_mb_large_req = sbi->s_stripe * 8; ++ sbi->s_mb_group_prealloc = sbi->s_stripe * 4; + } + + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); +@@ -3702,6 +3814,7 @@ out: + kfree(sbi->s_mb_avg_fragment_size_locks); + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); ++ kfree(sbi->s_mb_prealloc_table); + kfree(sbi->s_mb_offsets); + sbi->s_mb_offsets = NULL; + kfree(sbi->s_mb_maxs); +@@ -3972,7 +4085,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + int err, len; + + BUG_ON(ac->ac_status != AC_STATUS_FOUND); +- BUG_ON(ac->ac_b_ex.fe_len <= 0); + + sb = ac->ac_sb; + sbi = EXT4_SB(sb); +@@ -4405,10 +4517,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + { + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_super_block *es = sbi->s_es; +- int bsbits, max; +- loff_t size, start_off, end; ++ int bsbits, i, wind; ++ loff_t size, end; + loff_t orig_size __maybe_unused; + ext4_lblk_t start; ++ unsigned long value, last_non_zero; + + /* do normalize only data requests, metadata requests + do not need preallocation */ +@@ -4437,51 +4550,46 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + size = size << bsbits; + if (size < i_size_read(ac->ac_inode)) + size = 
i_size_read(ac->ac_inode); +- orig_size = size; ++ size = (size + ac->ac_sb->s_blocksize - 1) >> bsbits; ++ ++ start = wind = 0; ++ value = last_non_zero = 0; + +- /* max size of free chunks */ +- max = 2 << bsbits; +- +-#define NRL_CHECK_SIZE(req, size, max, chunk_size) \ +- (req <= (size) || max <= (chunk_size)) +- +- /* first, try to predict filesize */ +- /* XXX: should this table be tunable? */ +- start_off = 0; +- if (size <= 16 * 1024) { +- size = 16 * 1024; +- } else if (size <= 32 * 1024) { +- size = 32 * 1024; +- } else if (size <= 64 * 1024) { +- size = 64 * 1024; +- } else if (size <= 128 * 1024) { +- size = 128 * 1024; +- } else if (size <= 256 * 1024) { +- size = 256 * 1024; +- } else if (size <= 512 * 1024) { +- size = 512 * 1024; +- } else if (size <= 1024 * 1024) { +- size = 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (21 - bsbits)) << 21; +- size = 2 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (22 - bsbits)) << 22; +- size = 4 * 1024 * 1024; +- } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), +- (8<<20)>>bsbits, max, 8 * 1024)) { +- start_off = ((loff_t)ac->ac_o_ex.fe_logical >> +- (23 - bsbits)) << 23; +- size = 8 * 1024 * 1024; ++ /* let's choose preallocation window depending on file size */ ++ for (i = 0; i < EXT4_MAX_PREALLOC_TABLE; i++) { ++ value = sbi->s_mb_prealloc_table[i]; ++ if (value == 0) ++ break; ++ else ++ last_non_zero = value; ++ ++ if (size <= value) { ++ wind = value; ++ break; ++ } ++ } ++ ++ if (wind == 0) { ++ if (last_non_zero != 0) { ++ __u64 tstart, tend; ++ /* file is quite large, we now preallocate with ++ * the biggest configured window with regart to ++ * logical offset */ ++ wind = last_non_zero; ++ tstart = ac->ac_o_ex.fe_logical; ++ do_div(tstart, wind); ++ start = tstart * wind; ++ tend = ac->ac_o_ex.fe_logical + 
ac->ac_o_ex.fe_len - 1; ++ do_div(tend, wind); ++ tend = tend * wind + wind; ++ size = tend - start; ++ } + } else { +- start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; +- size = (loff_t) EXT4_C2B(sbi, +- ac->ac_o_ex.fe_len) << bsbits; ++ size = wind; + } +- size = size >> bsbits; +- start = start_off >> bsbits; ++ ++ ++ orig_size = size; + + /* + * For tiny groups (smaller than 8MB) the chosen allocation +@@ -4536,7 +4644,6 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, + (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); + } +- BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + + /* now prepare goal request */ + +@@ -5774,8 +5881,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + inode_pa_eligible = false; + + size = max(size, isize); +- /* Don't use group allocation for large files */ +- if (size > sbi->s_mb_stream_request) ++ if ((ac->ac_o_ex.fe_len >= sbi->s_mb_small_req) || ++ (size >= sbi->s_mb_large_req)) + group_pa_eligible = false; + + if (!group_pa_eligible) { +@@ -5786,6 +5893,13 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) + return; + } + ++ /* ++ * request is so large that we don't care about ++ * streaming - it overweights any possible seek ++ */ ++ if (ac->ac_o_ex.fe_len >= sbi->s_mb_large_req) ++ return; ++ + BUG_ON(ac->ac_lg != NULL); + /* + * locality group prealloc space are per cpu. 
The reason for having +diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c +index 6d332dff..4bd58d79 100644 +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -212,7 +212,8 @@ EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); +-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); ++EXT4_RW_ATTR_SBI_UI(mb_small_req, s_mb_small_req); ++EXT4_RW_ATTR_SBI_UI(mb_large_req, s_mb_large_req); + EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); + EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); + EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); +@@ -262,7 +263,8 @@ static struct attribute *ext4_attrs[] = { + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), + ATTR_LIST(mb_order2_req), +- ATTR_LIST(mb_stream_req), ++ ATTR_LIST(mb_small_req), ++ ATTR_LIST(mb_large_req), + ATTR_LIST(mb_group_prealloc), + ATTR_LIST(mb_max_linear_groups), + ATTR_LIST(max_writeback_mb_bump), +@@ -548,6 +550,8 @@ int ext4_register_sysfs(struct super_block *sb) + ext4_fc_info_show, sb); + proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_ops, sb); ++ proc_create_data("prealloc_table", S_IRUGO, sbi->s_proc, ++ &ext4_seq_prealloc_table_fops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.6/ext4-corrupted-inode-block-bitmaps-handling-patches.patch b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-corrupted-inode-block-bitmaps-handling-patches.patch new file mode 100644 index 0000000..b3c49a8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-corrupted-inode-block-bitmaps-handling-patches.patch @@ -0,0 +1,305 @@ +commit 2963f3d09eb3a0817f87386c0bd7be7ce086809d +Author: Wang 
Shilong +AuthorDate: Tue Sep 8 21:54:29 2015 +0800 +LU-7114 ldiskfs: corrupted bitmaps handling patches + +This patch backported the following patches from upstream: + +163a203ddb36c36d4a1c942aececda0cc8d06aa7 +ext4: mark block group as corrupt on block bitmap error + +87a39389be3e3b007d341be510a7e4a0542bdf05 +ext4: mark block group as corrupt on inode bitmap error + +bdfb6ff4a255dcebeb09a901250e13a97eff75af +ext4: mark group corrupt on group descriptor checksum + +Also use ext4_warning() instead of ext4_error() so that +filesystem doesn't become RO by default, and together +with these patches, FS will still be usable even if such +bad things happen. + +Signed-off-by: Wang Shilong +Change-Id: Ib4075aba7df6f7f59e89a90475405080acd43dd0 +Reviewed-on: http://review.whamcloud.com/16312 +Reviewed-by: Andreas Dilger +Reviewed-by: Yang Sheng + +NOTE: Ported to linux 6.6 keeps the ext4_warning() updates. +--- + fs/ext4/balloc.c | 18 ++++++------- + fs/ext4/ialloc.c | 6 ++--- + fs/ext4/mballoc.c | 64 +++++++++++++++++++---------------------------- + 3 files changed, 37 insertions(+), 51 deletions(-) + +diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c +index 79b20d6a..8809fdb5 100644 +--- a/fs/ext4/balloc.c ++++ b/fs/ext4/balloc.c +@@ -418,7 +418,7 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + if (unlikely(!ext4_block_bitmap_csum_verify(sb, desc, bh) || + ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ++ ext4_warning(sb, "bg %u: bad block bitmap checksum", block_group); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -426,8 +426,8 @@ static int ext4_validate_block_bitmap(struct super_block *sb, + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: block %llu: invalid block bitmap", +- block_group, 
blk); ++ ext4_warning(sb, "bg %u: block %llu: invalid block bitmap", ++ block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; +@@ -514,18 +514,16 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, + goto out; + } + err = ext4_init_block_bitmap(sb, bh, block_group, desc); +- if (err) { +- ext4_unlock_group(sb, block_group); +- unlock_buffer(bh); +- ext4_error(sb, "Failed to init block bitmap for group " +- "%u: %d", block_group, err); +- goto out; +- } + set_bitmap_uptodate(bh); + set_buffer_uptodate(bh); + set_buffer_verified(bh); + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); ++ if (err) { ++ ext4_warning(sb, "Failed to init block bitmap for group " ++ "%u: %d", block_group, err); ++ goto out; ++ } + return bh; + } + ext4_unlock_group(sb, block_group); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index a9d3d2fc..af739fad 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -102,8 +102,8 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + EXT4_INODES_PER_GROUP(sb) / 8) || + ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_CRC)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, " +- "inode_bitmap = %llu", block_group, blk); ++ ext4_warning(sb, "Corrupt inode bitmap - block_group = %u, " ++ "inode_bitmap = %llu", block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + return -EFSBADCRC; +@@ -353,7 +353,7 @@ out: + if (!fatal) + fatal = err; + } else { +- ext4_error(sb, "bit already cleared for inode %lu", ino); ++ ext4_warning(sb, "bit already cleared for inode %lu", ino); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_IBITMAP_CORRUPT); + } +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index 4cf0c725..cfe5abc5 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -1212,10 +1212,14 @@ int 
ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, 0, 0, +- "block bitmap and bg descriptor " +- "inconsistent: %u vs %u free clusters", +- free, grp->bb_free); ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ ext4_warning(sb, "group %lu: block bitmap and bg descriptor " ++ "inconsistent: %u vs %u free clusters " ++ "%u in gd, %lu pa's", ++ (long unsigned int)group, free, grp->bb_free, ++ ext4_free_group_clusters(sb, gdp), ++ grp->bb_prealloc_nr); + /* + * If we intend to continue, we consider group descriptor + * corrupt and update bb_free using bitmap value +@@ -1566,7 +1570,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + int block; + int pnum; + int poff; +- struct page *page; ++ struct page *page = NULL; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -1594,7 +1598,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, + */ + ret = ext4_mb_init_group(sb, group, gfp); + if (ret) +- return ret; ++ goto err; + } + + /* +@@ -1705,6 +1709,7 @@ err: + + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; ++ ext4_warning(sb, "Error loading buddy information for %u", group); + return ret; + } + +@@ -5131,9 +5136,11 @@ int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, + } + + if (free != free_in_gdp) { +- ext4_error(sb, "on-disk bitmap for group %d" ++ ext4_warning(sb, "on-disk bitmap for group %d" + "corrupted: %u blocks free in bitmap, %u - in gd\n", + group, free, free_in_gdp); ++ ext4_mark_group_bitmap_corrupted(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EIO; + } + return 0; +@@ -5544,16 +5551,8 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + /* "free < pa->pa_free" means we maybe double alloc the same blocks, + * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ + if 
((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { +- ext4_error(sb, "pa free mismatch: [pa %p] " +- "[phy %lu] [logic %lu] [len %u] [free %u] " +- "[error %u] [inode %d] [freed %u]", pa, +- (unsigned long)pa->pa_pstart, +- (unsigned long)pa->pa_lstart, +- pa->pa_len, (unsigned)pa->pa_free, +- (unsigned)pa->pa_error, pa->pa_inode->i_ino, +- free); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", +- free, pa->pa_free); ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. +@@ -5620,16 +5619,11 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, +- "Error %d reading block bitmap for %u", +- err, group); + goto out_dbg; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- err, group); + put_bh(bitmap_bh); + goto out_dbg; + } +@@ -5794,17 +5788,12 @@ repeat: + + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + return; +- } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", +- err, group); + ext4_mb_unload_buddy(&e4b); + continue; + } +@@ -6109,11 +6098,8 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + group = ext4_get_group_number(sb, pa->pa_pstart); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error_err(sb, -err, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + continue; +- } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_get_group_info(sb, group)->bb_prealloc_nr--; +@@ -6477,7 +6463,7 @@ 
errout: + * been updated or not when fail case. So can + * not revert pa_free back, just mark pa_error*/ + pa->pa_error++; +- ext4_error(sb, ++ ext4_warning(sb, + "Updating bitmap error: [err %d] " + "[pa %p] [phy %lu] [logic %lu] " + "[len %u] [free %u] [error %u] " +@@ -6488,6 +6474,7 @@ errout: + (unsigned)pa->pa_free, + (unsigned)pa->pa_error, + pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ ext4_mark_group_bitmap_corrupted(sb, 0, 0); + } + } + ext4_mb_release_context(ac); +@@ -6755,7 +6742,7 @@ do_more: + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * We need to make sure we don't reuse the freed block until after the +@@ -6848,8 +6835,10 @@ do_more: + goto do_more; + } + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); ++ return; + } + + /** +@@ -7047,7 +7036,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * need to update group_info->bb_free and bitmap +@@ -7086,8 +7075,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, + err = ret; + + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return err; + } + +@@ -7223,8 +7213,6 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- ret, group); + return ret; + } + +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.6/ext4-delayed-iput.patch b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-delayed-iput.patch new file mode 100644 index 0000000..8062f0d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-delayed-iput.patch @@ -0,0 +1,185 @@ +commit e239a14001b62d96c186ae2c9f58402f73e63dcc +Author: 
Andrew Perepechko +AuthorDate: Mon Jan 31 19:55:31 2022 +0300 +LU-15404 ldiskfs: truncate during setxattr leads to kernel panic + +When changing a large xattr value to a different large xattr value, +the old xattr inode is freed. Truncate during the final iput causes +current transaction restart. Eventually, parent inode bh is marked +dirty and kernel panic happens when jbd2 figures out that this bh +belongs to the committed transaction. + +A possible fix is to call this final iput in a separate thread. +This way, setxattr transactions will never be split into two. +Since the setxattr code adds xattr inodes with nlink=0 into the +orphan list, old xattr inodes will be properly cleaned up in +any case. + +Change-Id: Idd70befa6a83818ece06daccf9bb6256812674b9 +Signed-off-by: Andrew Perepechko +HPE-bug-id: LUS-10534 +Reviewed-on: https://review.whamcloud.com/46358 +Reviewed-by: Andreas Dilger +Reviewed-by: Alexander Zarochentsev +--- + fs/ext4/ext4.h | 7 +++++-- + fs/ext4/page-io.c | 2 +- + fs/ext4/super.c | 15 ++++++++------- + fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- + 4 files changed, 51 insertions(+), 12 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 816398bc..fbd00726 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1649,8 +1649,11 @@ struct ext4_sb_info { + struct flex_groups * __rcu *s_flex_groups; + ext4_group_t s_flex_groups_allocated; + +- /* workqueue for reserved extent conversions (buffered io) */ +- struct workqueue_struct *rsv_conversion_wq; ++ /* ++ * workqueue for reserved extent conversions (buffered io) ++ * and large ea inodes reclaim ++ */ ++ struct workqueue_struct *s_misc_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; +diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c +index dfdd7e5c..1489b640 100644 +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -230,7 +230,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) + WARN_ON(!(io_end->flag & 
EXT4_IO_END_UNWRITTEN)); + WARN_ON(!io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); +- wq = sbi->rsv_conversion_wq; ++ wq = sbi->s_misc_wq; + if (list_empty(&ei->i_rsv_conversion_list)) + queue_work(wq, &ei->i_rsv_conversion_work); + list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index f47468f5..c63806fa 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1304,10 +1304,11 @@ static void ext4_put_super(struct super_block *sb) + &sb->s_uuid); + + ext4_unregister_li_request(sb); ++ flush_workqueue(sbi->s_misc_wq); + ext4_quotas_off(sb, EXT4_MAXQUOTAS); + + flush_work(&sbi->s_sb_upd_work); +- destroy_workqueue(sbi->rsv_conversion_wq); ++ destroy_workqueue(sbi->s_misc_wq); + ext4_release_orphan_info(sb); + + if (sbi->s_journal) { +@@ -5469,9 +5470,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + * The maximum number of concurrent works can be high and + * concurrency isn't really necessary. Limit it to 1. 
+ */ +- EXT4_SB(sb)->rsv_conversion_wq = +- alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); +- if (!EXT4_SB(sb)->rsv_conversion_wq) { ++ EXT4_SB(sb)->s_misc_wq = ++ alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); ++ if (!EXT4_SB(sb)->s_misc_wq) { + printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); + err = -ENOMEM; + goto failed_mount4; +@@ -5645,8 +5646,8 @@ failed_mount4a: + sb->s_root = NULL; + failed_mount4: + ext4_msg(sb, KERN_ERR, "mount failed"); +- if (EXT4_SB(sb)->rsv_conversion_wq) +- destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); ++ if (EXT4_SB(sb)->s_misc_wq) ++ destroy_workqueue(EXT4_SB(sb)->s_misc_wq); + failed_mount_wq: + ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); + sbi->s_ea_inode_cache = NULL; +@@ -6336,7 +6337,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) + return 0; + + trace_ext4_sync_fs(sb, wait); +- flush_workqueue(sbi->rsv_conversion_wq); ++ flush_workqueue(sbi->s_misc_wq); + /* + * Writeback quota in non-journalled quota case - journalled quota has + * no dirty dquots +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 30ece5e3..596149e6 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -1655,6 +1655,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, + return 0; + } + ++struct delayed_iput_work { ++ struct work_struct work; ++ struct inode *inode; ++}; ++ ++static void delayed_iput_fn(struct work_struct *work) ++{ ++ struct delayed_iput_work *diwork; ++ ++ diwork = container_of(work, struct delayed_iput_work, work); ++ iput(diwork->inode); ++ kfree(diwork); ++} ++ ++static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) ++{ ++ if (!inode) { ++ kfree(work); ++ return; ++ } ++ ++ if (!work) { ++ iput(inode); ++ } else { ++ INIT_WORK(&work->work, delayed_iput_fn); ++ work->inode = inode; ++ queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); ++ } ++} ++ + /* + * Reserve min(block_size/8, 1024) bytes 
for xattr entries/names if ea_inode + * feature is enabled. +@@ -1672,6 +1702,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + int in_inode = i->in_inode; + struct inode *old_ea_inode = NULL; + struct inode *new_ea_inode = NULL; ++ struct delayed_iput_work *diwork = NULL; + size_t old_size, new_size; + int ret; + +@@ -1748,7 +1779,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, + * Finish that work before doing any modifications to the xattr data. + */ + if (!s->not_found && here->e_value_inum) { +- ret = ext4_xattr_inode_iget(inode, ++ diwork = kmalloc(sizeof(*diwork), GFP_NOFS); ++ if (!diwork) ++ ret = -ENOMEM; ++ else ++ ret = ext4_xattr_inode_iget(inode, + le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), + &old_ea_inode); +@@ -1915,7 +1950,7 @@ update_hash: + + ret = 0; + out: +- iput(old_ea_inode); ++ delayed_iput(old_ea_inode, diwork); + iput(new_ea_inode); + return ret; + } +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.6/ext4-encdata.patch b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-encdata.patch new file mode 100644 index 0000000..3b50734 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-encdata.patch @@ -0,0 +1,484 @@ +commit d0a722cb8fb886380e24e8261e8efca09a3262d6 +Author: Sebastien Buisson +AuthorDate: Tue Dec 20 15:40:52 2022 +0100 +Commit: Oleg Drokin +CommitDate: Thu Aug 31 06:28:45 2023 +0000 +LU-16374 ldiskfs: implement security.encdata xattr + +security.encdata is a virtual xattr containing information related +to encrypted files. It is expressed as ASCII text with a "key: value" +format, and space as field separator. For instance: + + { encoding: base64url, size: 3012, enc_ctx: YWJjZGVmZ2hpamtsbW + 5vcHFyc3R1dnd4eXphYmNkZWZnaGlqa2xtbg, enc_name: ZmlsZXdpdGh2ZX + J5bG9uZ25hbWVmaWxld2l0aHZlcnlsb25nbmFtZWZpbGV3aXRodmVyeWxvbmdu + YW1lZmlsZXdpdGg } + +'encoding' is the encoding method used for binary data, assume name +can be up to 255 chars. 
+'size' is the clear text file data length in bytes. +'enc_ctx' is encoded encryption context, 40 bytes for v2. +'enc_name' is encoded encrypted name, 256 bytes max. +So on overall, this xattr is at most 727 chars plus terminating '\0'. + +On get, the value of the security.encdata xattr is computed from +encrypted file's information. +On set, encrypted file's information is restored from xattr value. +The encrypted name is stored temporarily in a dedicated xattr +LDISKFS_XATTR_NAME_RAWENCNAME, that will be used to set correct name +at linkat. + +Signed-off-by: Sebastien Buisson +Change-Id: Ia318c39d403b1c448e71bcd5b29862d022d05d0a +Reviewed-on: https://review.whamcloud.com/49456 +Reviewed-by: Andreas Dilger +Reviewed-by: Li Dongyang +--- + fs/ext4/encdata.h | 128 ++++++++++++++++++++++++ + fs/ext4/inode.c | 6 ++ + fs/ext4/super.c | 8 ++ + fs/ext4/xattr.h | 2 + + fs/ext4/xattr_security.c | 209 ++++++++++++++++++++++++++++++++++++++- + 5 files changed, 352 insertions(+), 1 deletion(-) + create mode 100644 fs/ext4/encdata.h + +diff --git a/fs/ext4/encdata.h b/fs/ext4/encdata.h +new file mode 100644 +index 00000000..aa83832f +--- /dev/null ++++ b/fs/ext4/encdata.h +@@ -0,0 +1,128 @@ ++/* ++ * encdata.h ++ * ++ * Copyright (c) 2022 Whamcloud ++ */ ++ ++#ifndef _ENCDATA_H ++#define _ENCDATA_H ++ ++/* Define a fixed 4096-byte encryption unit size */ ++/* Must be identical to LUSTRE_ENCRYPTION_UNIT_SIZE */ ++#define EXT4_ENCRYPTION_BLOCKBITS 12 ++#define EXT4_ENCRYPTION_UNIT_SIZE ((size_t)1 << EXT4_ENCRYPTION_BLOCKBITS) ++#define EXT4_ENCRYPTION_MASK (~(EXT4_ENCRYPTION_UNIT_SIZE - 1)) ++#define LLCRYPT_SET_CONTEXT_MAX_SIZE 40 ++#define ENCDATA_XATTR_FMT_1 "{ encoding: " ++#define ENCDATA_XATTR_FMT_2 ", size: " ++#define ENCDATA_XATTR_FMT_3 ", enc_ctx: " ++#define ENCDATA_XATTR_FMT_4 ", enc_name: " ++#define ENCDATA_XATTR_FMT_END " }" ++#define ENCDATA_XATTR_FMT_COMP ENCDATA_XATTR_FMT_1 ENCDATA_XATTR_FMT_2 \ ++ ENCDATA_XATTR_FMT_3 ENCDATA_XATTR_FMT_4 \ ++ 
ENCDATA_XATTR_FMT_END ++ ++extern char encdata_xattr_fmt[NAME_MAX]; ++ ++/* ++ * base64url encoding, lifted from fs/crypto/fname.c. ++ */ ++ ++static const char base64url_table[] = ++ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; ++ ++#define BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) ++ ++/** ++ * base64url_encode() - base64url-encode some binary data ++ * @src: the binary data to encode ++ * @srclen: the length of @src in bytes ++ * @dst: (output) the base64url-encoded string. Not NUL-terminated. ++ * ++ * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL ++ * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, ++ * as it's unneeded and not required by the RFC. base64url is used instead of ++ * base64 to avoid the '/' character, which isn't allowed in filenames. ++ * ++ * Return: the length of the resulting base64url-encoded string in bytes. ++ * This will be equal to BASE64URL_CHARS(srclen). ++ */ ++static inline int base64url_encode(const u8 *src, int srclen, char *dst) ++{ ++ u32 ac = 0; ++ int bits = 0; ++ int i; ++ char *cp = dst; ++ ++ for (i = 0; i < srclen; i++) { ++ ac = (ac << 8) | src[i]; ++ bits += 8; ++ do { ++ bits -= 6; ++ *cp++ = base64url_table[(ac >> bits) & 0x3f]; ++ } while (bits >= 6); ++ } ++ if (bits) ++ *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; ++ return cp - dst; ++} ++ ++/** ++ * base64url_decode() - base64url-decode a string ++ * @src: the string to decode. Doesn't need to be NUL-terminated. ++ * @srclen: the length of @src in bytes ++ * @dst: (output) the decoded binary data ++ * ++ * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with ++ * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't ++ * accepted, nor are non-encoding characters such as whitespace. ++ * ++ * This implementation hasn't been optimized for performance. 
++ * ++ * Return: the length of the resulting decoded binary data in bytes, ++ * or -1 if the string isn't a valid base64url string. ++ */ ++static inline int base64url_decode(const char *src, int srclen, u8 *dst) ++{ ++ u32 ac = 0; ++ int bits = 0; ++ int i; ++ u8 *bp = dst; ++ ++ for (i = 0; i < srclen; i++) { ++ const char *p = strchr(base64url_table, src[i]); ++ ++ if (p == NULL || src[i] == 0) ++ return -1; ++ ac = (ac << 6) | (p - base64url_table); ++ bits += 6; ++ if (bits >= 8) { ++ bits -= 8; ++ *bp++ = (u8)(ac >> bits); ++ } ++ } ++ if (ac & ((1 << bits) - 1)) ++ return -1; ++ return bp - dst; ++} ++ ++/* This version of the code uses base64url encoding for binary data. */ ++#define ENCDATA_ENCODING "base64url" ++ ++/* Wrappers to support various encodings. Add new methods in there. ++ */ ++static inline int encode(const u8 *src, int srclen, char *dst, char *encoding) ++{ ++ if (!strcmp(encoding, "base64url")) ++ return base64url_encode(src, srclen, dst); ++ return -EINVAL; ++} ++ ++static inline int decode(const char *src, int srclen, u8 *dst, char *encoding) ++{ ++ if (!strcmp(encoding, "base64url")) ++ return base64url_decode(src, srclen, dst); ++ return -EINVAL; ++} ++ ++#endif /* _ENCDATA_H */ +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 16c5c054..286dcae6 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -46,6 +46,7 @@ + #include "xattr.h" + #include "acl.h" + #include "truncate.h" ++#include "encdata.h" + + #include + +@@ -5595,6 +5596,11 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, + STATX_ATTR_NODUMP | + STATX_ATTR_VERITY); + ++ if (flags & EXT4_ENCRYPT_FL && ++ unlikely(!IS_LUSTRE_MOUNT(inode->i_sb))) ++ stat->size = round_up(stat->size, ++ EXT4_ENCRYPTION_UNIT_SIZE); ++ + generic_fillattr(idmap, request_mask, inode, stat); + return 0; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 39e4d0fb..040ee320 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -56,6 +56,7 @@ + #include "acl.h" + 
#include "mballoc.h" + #include "fsmap.h" ++#include "encdata.h" + + #define CREATE_TRACE_POINTS + #include +@@ -7350,6 +7351,7 @@ MODULE_ALIAS_FS("ext4"); + + /* Shared across all ext4 file systems */ + wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; ++char encdata_xattr_fmt[NAME_MAX]; + + static int __init ext4_init_fs(void) + { +@@ -7403,6 +7405,12 @@ static int __init ext4_init_fs(void) + if (err) + goto out; + ++ snprintf(encdata_xattr_fmt, sizeof(encdata_xattr_fmt), ++ ENCDATA_XATTR_FMT_1"%%%u[^,]"ENCDATA_XATTR_FMT_2"%%llu" ++ ENCDATA_XATTR_FMT_3"%%%us"ENCDATA_XATTR_FMT_4"%%%us", ++ NAME_MAX, BASE64URL_CHARS(LLCRYPT_SET_CONTEXT_MAX_SIZE), ++ BASE64URL_CHARS(NAME_MAX)); ++ + return 0; + out: + ext4_fc_destroy_dentry_cache(); +diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h +index 824faf0b..1e8aa6f2 100644 +--- a/fs/ext4/xattr.h ++++ b/fs/ext4/xattr.h +@@ -140,6 +140,8 @@ extern const struct xattr_handler ext4_xattr_security_handler; + extern const struct xattr_handler ext4_xattr_hurd_handler; + + #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" ++#define EXT4_XATTR_NAME_ENCDATA "encdata" ++#define EXT4_XATTR_NAME_RAWENCNAME "rawencname" + + /* + * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. +diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c +index 776cf11d..80cad6e0 100644 +--- a/fs/ext4/xattr_security.c ++++ b/fs/ext4/xattr_security.c +@@ -10,13 +10,217 @@ + #include + #include "ext4_jbd2.h" + #include "ext4.h" ++#include "critical_encode.h" ++#include "encdata.h" + #include "xattr.h" + ++/* security.encdata is a virtual xattr containing information related ++ * to encrypted files. It is expressed as ASCII text with a "key: value" ++ * format, and space as field separator. 
For instance: ++ * ++ * { encoding: base64url, size: 3012, enc_ctx: YWJjZGVmZ2hpamtsbW ++ * 5vcHFyc3R1dnd4eXphYmNkZWZnaGlqa2xtbg, enc_name: ZmlsZXdpdGh2ZX ++ * J5bG9uZ25hbWVmaWxld2l0aHZlcnlsb25nbmFtZWZpbGV3aXRodmVyeWxvbmdu ++ * YW1lZmlsZXdpdGg } ++ * ++ * 'encoding' is the encoding method used for binary data, assume name ++ * can be up to 255 chars. ++ * 'size' is the clear text file data length in bytes. ++ * 'enc_ctx' is encoded encryption context, 40 bytes for v2. ++ * 'enc_name' is encoded encrypted name, 256 bytes max. ++ * So on overall, this xattr is at most 727 chars plus terminating '\0'. ++ */ ++static int ext4_build_xattr_encdata(struct dentry *dentry, ++ struct inode *inode, ++ void *buffer, size_t size) ++{ ++ char encoded_enc_ctx[BASE64URL_CHARS(LLCRYPT_SET_CONTEXT_MAX_SIZE) + 1]; ++ unsigned char enc_ctx[LLCRYPT_SET_CONTEXT_MAX_SIZE]; ++ char encoded_name[BASE64URL_CHARS(NAME_MAX) + 1]; ++ struct ext4_filename fname = { 0 }; ++ struct inode *parent = NULL; ++ int encoded_enc_ctx_len = 0; ++ int encoded_name_len = 0; ++ char size_str[32]; ++ int retval; ++ ++ if (!IS_ENCRYPTED(inode)) { ++ retval = -ENODATA; ++ goto out; ++ } ++ ++ /* get size */ ++ retval = snprintf(size_str, sizeof(size_str), "%llu", ++ S_ISDIR(inode->i_mode) ? 0 : inode->i_size); ++ if (retval >= sizeof(size_str)) { ++ retval = -ERANGE; ++ goto out; ++ } ++ ++ /* get raw name */ ++ if (dentry && dentry->d_parent) ++ parent = dentry->d_parent->d_inode; ++ ++ retval = ext4_setup_filename(parent, &dentry->d_name, 1, &fname); ++ if (retval) ++ goto out; ++ ++ /* base64url-encode raw name */ ++ encoded_name_len = encode(fname.disk_name.name, fname.disk_name.len, ++ encoded_name, ENCDATA_ENCODING); ++ if (encoded_name_len == -EINVAL) { ++ retval = -EINVAL; ++ goto out; ++ } ++ encoded_name[encoded_name_len] = '\0'; ++ ++ if (!buffer) { ++ /* Return exact xattr length we would return if called with ++ * non-NULL buffer. 
++ */ ++ retval = sizeof(ENCDATA_XATTR_FMT_COMP) - 1 + ++ sizeof(ENCDATA_ENCODING) - 1 + strlen(size_str) + ++ BASE64URL_CHARS(LLCRYPT_SET_CONTEXT_MAX_SIZE) + ++ encoded_name_len; ++ goto out; ++ } ++ ++ /* get encryption context */ ++ retval = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, ++ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ++ enc_ctx, sizeof(enc_ctx)); ++ ++ if (retval < 0) ++ goto out; ++ ++ /* base64url-encode encryption context */ ++ encoded_enc_ctx_len = encode(enc_ctx, retval, encoded_enc_ctx, ++ ENCDATA_ENCODING); ++ if (encoded_enc_ctx_len == -EINVAL) { ++ retval = -EINVAL; ++ goto out; ++ } ++ encoded_enc_ctx[encoded_enc_ctx_len] = '\0'; ++ ++ /* write EXT4_XATTR_ENCDATA info into buffer */ ++ retval = snprintf(buffer, size, ++ ENCDATA_XATTR_FMT_1 ENCDATA_ENCODING ++ ENCDATA_XATTR_FMT_2"%s"ENCDATA_XATTR_FMT_3"%s" ++ ENCDATA_XATTR_FMT_4"%s"ENCDATA_XATTR_FMT_END, ++ size_str, encoded_enc_ctx, ++ encoded_name_len ? encoded_name : ""); ++ if (retval >= size) ++ retval = -ERANGE; ++ ++out: ++ if (fname.disk_name.name != dentry->d_name.name) ++ kfree(fname.disk_name.name); ++ ++ return retval; ++} ++ ++static int ext4_process_xattr_encdata(struct inode *inode, ++ const void *value, size_t size, ++ int flags) ++{ ++ char encoded_enc_ctx[BASE64URL_CHARS(LLCRYPT_SET_CONTEXT_MAX_SIZE) + 1]; ++ unsigned char enc_ctx[LLCRYPT_SET_CONTEXT_MAX_SIZE]; ++ char encoded_name[BASE64URL_CHARS(NAME_MAX) + 1]; ++ char encoding[NAME_MAX + 1]; ++ char name[NAME_MAX + 1]; ++ loff_t disk_size = 0; ++ char *buffer = NULL; ++ int enc_ctx_len = 0; ++ int name_len = 0; ++ int retval = 0; ++ ++ if (IS_ENCRYPTED(inode) || !value || flags & XATTR_REPLACE) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ buffer = kmalloc(size + 1, GFP_NOFS); ++ if (!buffer) { ++ retval = -ENOMEM; ++ goto out; ++ } ++ memcpy(buffer, value, size); ++ buffer[size] = '\0'; ++ ++ retval = sscanf(buffer, encdata_xattr_fmt, ++ encoding, &disk_size, encoded_enc_ctx, encoded_name); ++ if (retval < 4) { ++ 
retval = -EINVAL; ++ goto out; ++ } ++ ++ /* get former encryption context: should not exist */ ++ retval = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, ++ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); ++ if (retval != -ENODATA) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ if (strlen(encoded_enc_ctx) > ++ BASE64URL_CHARS(LLCRYPT_SET_CONTEXT_MAX_SIZE)) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ /* base64url-decode encryption context */ ++ retval = decode(encoded_enc_ctx, strlen(encoded_enc_ctx), ++ enc_ctx, encoding); ++ if (retval < 0) { ++ retval = -EINVAL; ++ goto out; ++ } ++ enc_ctx_len = retval; ++ ++ /* set encryption context, this will set encryption flag */ ++ retval = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, ++ EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ++ enc_ctx, enc_ctx_len, XATTR_CREATE); ++ if (retval < 0) ++ goto out; ++ ++ if (disk_size) { ++ /* set size on inode */ ++ spin_lock(&inode->i_lock); ++ i_size_write(inode, disk_size); ++ EXT4_I(inode)->i_disksize = disk_size; ++ spin_unlock(&inode->i_lock); ++ mark_inode_dirty(inode); ++ } ++ ++ /* put raw encrypted name in EXT4_XATTR_NAME_RAWENCNAME xattr, ++ * for later use, but base64url-decode first ++ */ ++ retval = decode(encoded_name, strlen(encoded_name), name, encoding); ++ if (retval < 0) { ++ retval = -EINVAL; ++ goto out; ++ } ++ name_len = retval; ++ ++ retval = ext4_xattr_set(inode, EXT4_XATTR_INDEX_LUSTRE, ++ EXT4_XATTR_NAME_RAWENCNAME, ++ name, name_len, XATTR_CREATE); ++ ++out: ++ kfree(buffer); ++ ++ return retval; ++} ++ + static int + ext4_xattr_security_get(const struct xattr_handler *handler, +- struct dentry *unused, struct inode *inode, ++ struct dentry *dentry, struct inode *inode, + const char *name, void *buffer, size_t size) + { ++ if (!strcmp(name, EXT4_XATTR_NAME_ENCDATA)) ++ return ext4_build_xattr_encdata(dentry, inode, buffer, size); ++ + return ext4_xattr_get(inode, EXT4_XATTR_INDEX_SECURITY, + name, buffer, size); + } +@@ -28,6 +232,9 @@ 
ext4_xattr_security_set(const struct xattr_handler *handler, + const char *name, const void *value, + size_t size, int flags) + { ++ if (!strcmp(name, EXT4_XATTR_NAME_ENCDATA)) ++ return ext4_process_xattr_encdata(inode, value, size, flags); ++ + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, + name, value, size, flags); + } +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.6/ext4-ialloc-uid-gid-and-pass-owner-down.patch b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-ialloc-uid-gid-and-pass-owner-down.patch new file mode 100644 index 0000000..023e753 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-ialloc-uid-gid-and-pass-owner-down.patch @@ -0,0 +1,145 @@ +commit 5bb641fa61175fd0fe63e830219d88304b5162c3 +Author: Shaun Tancheff +AuthorDate: Thu Dec 10 10:31:51 2020 -0600 +LU-13239 ldiskfs: pass inode timestamps at initial creation + +A previous patch https://github.com/Cray/lustre/commit/6d4fb6694 +"LUS-4880 osd-ldiskfs: pass uid/gid/xtime directly to ldiskfs" +was intended to be ported to upstream lustre but was lost. + +The patch https://review.whamcloud.com/34685/ +"LU-12151 osd-ldiskfs: pass owner down rather than transfer it" +passed the inode UID and GID down to ldiskfs at inode allocation +time to avoid the overhead of transferring quota from the inode +(initially created as root) over to the actual user of the file. + +The two patches differed slightly in that the LUS-4880 included +passing the a/m/ctimes from osd-ldiskfs to ldiskfs at inode +creation time avoids overhead of setting the timestamps afterward. 
+ +Benchmarks using MDTEST: + mdtest -f 32 -l 32 -n 16384 -i 5 -p 120 -t -u -v -d mdtest + + master patched + Operation Mean Std Dev Mean Std Dev + --------- ---- ------- ---- ------- + Directory creation: 17008.593 72.700 17099.863 155.461 + Directory stat : 170513.269 1456.002 170105.207 2349.934 + Directory removal : 80796.147 2633.832 84480.222 892.536 + File creation : 39227.419 7014.539 40429.900 6643.868 + File stat : 101761.395 2979.802 103818.800 1146.689 + File read : 86583.370 871.982 85725.254 965.862 + File removal : 74923.504 761.048 75075.180 723.966 + Tree creation : 588.570 244.534 608.332 123.939 + Tree removal : 39.874 1.873 44.357 2.350 + +This patch also reorganizes the ldiskfs patch series in +order to accommodate struct iattr being added to +ldiskfs_create_inode. +All supported server platforms RHEL 7.5+, SUSE 12+ and +ubuntu 18+ are affected. + +HPE-bug-id: LUS-7378, LUS-4880, LUS-8042, LUS-9157, LUS-8772, LUS-8769 +Signed-off-by: Shaun Tancheff +Change-Id: I87e9c792b5240820bfd3a7268e477970ebac8465 +Reviewed-on: https://review.whamcloud.com/37556 +Reviewed-by: Petros Koutoupis +Reviewed-by: Jian Yu +Reviewed-by: Wang Shilong +--- + fs/ext4/ext4.h | 8 ++++---- + fs/ext4/ialloc.c | 11 ++++++++++- + fs/ext4/namei.c | 13 +++++++++++-- + 3 files changed, 25 insertions(+), 7 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index ad9cabf8..425ffe30 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3017,15 +3017,15 @@ extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, + const struct qstr *qstr, __u32 goal, + uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, +- int nblocks); ++ int nblocks, struct iattr *iattr); + + #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ +- (goal), (owner), i_flags, 0, 0, 0) ++ (goal), (owner), i_flags, 0, 0, 0, NULL) + #define ext4_new_inode_start_handle(idmap, dir, mode, 
qstr, goal, owner, \ + type, nblocks) \ + __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ +- 0, (type), __LINE__, (nblocks)) ++ 0, (type), __LINE__, (nblocks), NULL) + + + extern void ext4_free_inode(handle_t *, struct inode *); +@@ -3219,7 +3219,7 @@ extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern struct inode *ext4_create_inode(handle_t *handle, + struct inode *dir, int mode, +- uid_t *owner); ++ struct iattr *iattr); + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index af739fad..d870e68c 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -926,7 +926,7 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, + umode_t mode, const struct qstr *qstr, + __u32 goal, uid_t *owner, __u32 i_flags, + int handle_type, unsigned int line_no, +- int nblocks) ++ int nblocks, struct iattr *iattr) + { + struct super_block *sb; + struct buffer_head *inode_bitmap_bh = NULL; +@@ -1309,6 +1309,15 @@ got: + if (err) + goto fail_drop; + ++ if (iattr) { ++ if (iattr->ia_valid & ATTR_CTIME) ++ inode_set_ctime_to_ts(inode, iattr->ia_ctime); ++ if (iattr->ia_valid & ATTR_MTIME) ++ inode->i_mtime = iattr->ia_mtime; ++ if (iattr->ia_valid & ATTR_ATIME) ++ inode->i_atime = iattr->ia_atime; ++ } ++ + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index c0700140..e8d968a4 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -3436,11 +3436,20 @@ static int ext4_add_nondir(handle_t *handle, + /* Return locked inode, then the caller can modify the inode's states/flags + * before others finding it. The caller should unlock the inode by itself. 
*/ + struct inode *ext4_create_inode(handle_t *handle, struct inode *dir, int mode, +- uid_t *owner) ++ struct iattr *iattr) + { + struct inode *inode; ++ uid_t owner[2] = {0, 0}; + +- inode = ext4_new_inode(handle, dir, mode, NULL, 0, owner, 0); ++ if (iattr) { ++ if (iattr->ia_valid & ATTR_UID) ++ owner[0] = from_kuid(&init_user_ns, iattr->ia_uid); ++ if (iattr->ia_valid & ATTR_GID) ++ owner[1] = from_kgid(&init_user_ns, iattr->ia_gid); ++ } ++ ++ inode = __ext4_new_inode(&nop_mnt_idmap, handle, dir, mode, NULL, 0, owner, 0, ++ 0, 0, 0, iattr); + if (!IS_ERR(inode)) { + if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) { + #ifdef CONFIG_LDISKFS_FS_XATTR +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.6/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-mballoc-extra-checks.patch new file mode 100644 index 0000000..c0bccf5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-mballoc-extra-checks.patch @@ -0,0 +1,325 @@ +commit f2f28f1d09c0a00b3fc569422f881931d857fac9 +Author: Alex Zhuravlev +AuthorDate: Tue Oct 28 17:59:09 2008 +0000 +Subject: ext4: detect on-disk corruption of block bitmap +Detect on-disk corruption of block bitmap and better checking of +preallocated blocks. 
+Bugzilla-ID: b=16680 +Signed-off-by: Alex Zhuravlev +Reviewed-by: Kalpak Shah +Signed-off-by: Andreas Dilger +--- + fs/ext4/ext4.h | 1 + + fs/ext4/mballoc.c | 106 ++++++++++++++++++++++++++++++++++++++++------ + fs/ext4/mballoc.h | 2 +- + 3 files changed, 95 insertions(+), 14 deletions(-) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 314a66cd..c699f2f8 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -3448,6 +3448,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c +index ad030a86..698ad923 100644 +--- a/fs/ext4/mballoc.c ++++ b/fs/ext4/mballoc.c +@@ -416,7 +416,7 @@ static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { + "ext4_groupinfo_64k", "ext4_groupinfo_128k" + }; + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); + +@@ -1177,7 +1177,7 @@ mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group, + struct ext4_group_info *grp) + { +@@ -1221,6 +1221,7 @@ void ext4_mb_generate_buddy(struct super_block *sb, + grp->bb_free = free; + ext4_mark_group_bitmap_corrupted(sb, group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); ++ return -EIO; + } + mb_set_largest_free_order(sb, grp); + mb_update_avg_fragment_size(sb, grp); +@@ -1230,6 +1231,8 @@ void ext4_mb_generate_buddy(struct super_block *sb, + period = get_cycles() - period; + atomic_inc(&sbi->s_mb_buddies_generated); + 
atomic64_add(period, &sbi->s_mb_generation_time); ++ ++ return 0; + } + + static void mb_regenerate_buddy(struct ext4_buddy *e4b) +@@ -1350,7 +1353,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + } + + first_block = page->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; +@@ -1398,7 +1401,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ++ err = ext4_mb_generate_buddy(sb, data, incore, group, grinfo); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -1413,7 +1416,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); + ext4_unlock_group(sb, group); + +@@ -1423,7 +1426,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) + incore = data; + } + } +- SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + + out: + if (bh) { +@@ -3014,9 +3018,11 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = pde_data(file_inode(seq->file)); ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err, buddy_loaded = 0; ++ int free = 0; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, +@@ -3029,7 +3035,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + 
+ group--; + if (group == 0) +- seq_puts(seq, "#group: free frags first [" ++ seq_puts(seq, "#group: bfree gfree frags first pa [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + +@@ -3049,13 +3055,19 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + buddy_loaded = 1; + } + ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_group_clusters(sb, gdp); ++ + memcpy(&sg, grinfo, i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -5078,25 +5090,75 @@ try_group_pa: + return false; + } + ++/* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be hold by the ++ * caller. 
++ */ ++static ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned int max = EXT4_CLUSTERS_PER_GROUP(sb); ++ unsigned int i, first, free = 0; ++ unsigned int free_in_gdp = ext4_free_group_clusters(sb, gdp); ++ ++ if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) ++ return 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != free_in_gdp) { ++ ext4_error(sb, "on-disk bitmap for group %d " ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, free_in_gdp); ++ return -EIO; ++ } ++ return 0; ++} ++ + /* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; ++ int skip = 0, count = 0; ++ int err; + int len; + + if (!grp) +- return; ++ return -EIO; ++ ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; + + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. 
+@@ -5113,13 +5175,23 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + mb_set_bits(bitmap, start, len); + preallocated += len; ++ count++; ++ } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; + } + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_mark_pa_deleted(struct super_block *sb, +@@ -5210,6 +5282,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + if (pa->pa_type == MB_INODE_PA) { +@@ -5338,6 +5411,7 @@ adjust_bex: + pa->pa_inode = ac->ac_inode; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + write_lock(pa->pa_node_lock.inode_lock); + ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); +@@ -5391,6 +5465,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) + pa->pa_inode = NULL; + + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + + /* + * We will later add the new pa to the right bucket +@@ -5557,6 +5632,8 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -5688,7 +5765,7 @@ repeat: + if (err) { + ext4_error_err(sb, -err, "Error %d loading buddy information for %u", + err, group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -5701,6 +5778,8 @@ repeat: + } + + 
ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); +@@ -6005,6 +6084,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + +diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h +index 56938532..6b672472 100644 +--- a/fs/ext4/mballoc.h ++++ b/fs/ext4/mballoc.h +@@ -66,7 +66,7 @@ + /* + * for which requests use 2^N search using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks +-- +2.34.1 + diff --git a/ldiskfs/kernel_patches/patches/linux-6.6/ext4-nocmtime.patch b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-nocmtime.patch new file mode 100644 index 0000000..d23ec6b3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-6.6/ext4-nocmtime.patch @@ -0,0 +1,44 @@ +commit d2c828a32a3b019194051ee24607eafee517cc43 +Author: Niu Yawei +AuthorDate: Mon Feb 9 22:21:00 2015 -0500 +LU-6137 ldiskfs: simplify nocmtime patch + +Simplify the nocmtime patch by patching only ext4_current_time(), +this fixed the defect that original patch doesn't handle setacl +code path, it can also avoid the risk of future changes adding +new places that needs to be fixed. + +Remove the obsolete xattr-no-update-ctime patch. 
+ +Signed-off-by: Niu Yawei +Change-Id: I02928c4f867e9476f0bc1815dd3256e3d79dadf7 +Reviewed-on: http://review.whamcloud.com/13705 +Reviewed-by: Bobi Jam +Reviewed-by: Bob Glossman +Reviewed-by: Fan Yong +Signed-off-by: Anreas Dilger +--- + fs/ext4/ext4.h | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index adb577aa..218da6ec 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -985,6 +985,13 @@ enum { + I_DATA_SEM_EA + }; + ++static inline struct timespec64 ext4_current_time(struct inode *inode) ++{ ++ if (IS_NOCMTIME(inode)) ++ return inode_get_ctime(inode); ++ return current_time(inode); ++} ++#define current_time(a) ext4_current_time(a) + + /* + * fourth extended file system inode data in memory +-- +2.25.1 + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-6.6-ml.series b/ldiskfs/kernel_patches/series/ldiskfs-6.6-ml.series new file mode 100644 index 0000000..3082474 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-6.6-ml.series @@ -0,0 +1,37 @@ +linux-5.16/ext4-inode-version.patch +linux-5.18/ext4-lookup-dotdot.patch +linux-5.14/ext4-print-inum-in-htree-warning.patch +linux-6.5/ext4-prealloc.patch +linux-5.16/ext4-osd-iop-common.patch +linux-5.16/ext4-misc.patch +linux-6.6/ext4-mballoc-extra-checks.patch +sles15sp4/ext4-hash-indexed-dir-dotdot-update.patch +linux-5.14/ext4-kill-dx-root.patch +linux-6.5/ext4-mballoc-pa-free-mismatch.patch +linux-6.5/ext4-data-in-dirent.patch +linux-6.6/ext4-nocmtime.patch +base/ext4-htree-lock.patch +linux-6.5/ext4-pdirop.patch +rhel9/ext4-max-dir-size.patch +linux-6.6/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +rhel9/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +linux-6.2/ext4-attach-jinode-in-writepages.patch +linux-6.5/ext4-dont-check-before-replay.patch +rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7.6/ext4-export-orphan-add.patch +linux-5.18/ext4-export-mb-stream-allocator-variables.patch 
+ubuntu19/ext4-iget-with-flags.patch +linux-5.14/export-ext4fs-dirhash-helper.patch +linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch +rhel9/ext4-dquot-commit-speedup.patch +linux-6.6/ext4-ialloc-uid-gid-and-pass-owner-down.patch +linux-5.14/ext4-projid-xattrs.patch +linux-6.6/ext4-delayed-iput.patch +rhel8/ext4-ext-merge.patch +linux-5.14/ext4-xattr-disable-credits-check.patch +rhel9.2/ext4-fiemap-kernel-data.patch +rhel8/ext4-old_ea_inodes_handling_fix.patch +ubuntu20.04.5/ext4-filename-encode.patch +rhel9.1/ext4-enc-flag.patch +linux-6.6/ext4-encdata.patch