From c93a3e5b15b72afc1eeb3f7754c15e078d098769 Mon Sep 17 00:00:00 2001 From: Mr NeilBrown Date: Tue, 14 Dec 2021 12:43:45 -0500 Subject: [PATCH] LU-14195 ldiskfs: update patches for Linux 5.10 Mostly simple conflicts due to code movement, however: ext4-data-in-dirent.patch now needs to patch fs/ext4/fast_commit.c as well, because ext4_init_new_dir() is used in that file. Since fast commit can break recovery we prevent mounting with this option. Test-Parameters: trivial Signed-off-by: Mr NeilBrown Change-Id: I59b10fdb6bb606b193472e3045ab7d9b1d0d36b5 Reviewed-on: https://review.whamcloud.com/40913 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- config/lustre-build-ldiskfs.m4 | 6 +- .../ext4-attach-jinode-in-writepages.patch | 51 ++ .../patches/linux-5.10/ext4-data-in-dirent.patch | 749 +++++++++++++++++ .../ext4-give-warning-with-dir-htree-growing.patch | 155 ++++ .../patches/linux-5.10/ext4-misc.patch | 193 +++++ .../patches/linux-5.10/ext4-pdirop.patch | 893 +++++++++++++++++++++ .../kernel_patches/series/ldiskfs-5.10.0-ml.series | 29 + lustre/osd-ldiskfs/osd_handler.c | 6 + lustre/osd-ldiskfs/osd_internal.h | 7 + 9 files changed, 2088 insertions(+), 1 deletion(-) create mode 100644 ldiskfs/kernel_patches/patches/linux-5.10/ext4-attach-jinode-in-writepages.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.10/ext4-data-in-dirent.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.10/ext4-give-warning-with-dir-htree-growing.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.10/ext4-misc.patch create mode 100644 ldiskfs/kernel_patches/patches/linux-5.10/ext4-pdirop.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index 9e2cc33..91fd5229 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -136,7 +136,11 @@ AS_IF([test -z 
"$LDISKFS_SERIES"], [AS_VERSION_COMPARE([$LINUXRELEASE],[5.9.0], [LDISKFS_SERIES="5.8.0-ml.series"], # lt [LDISKFS_SERIES="5.9.0-ml.series"], # eq - [LDISKFS_SERIES="5.9.0-ml.series"], # gt + [AS_VERSION_COMPARE([$LINUXRELEASE],[5.10.0], + [LDISKFS_SERIES="5.9.0-ml.series"], # lt + [LDISKFS_SERIES="5.10.0-ml.series"], # eq + [LDISKFS_SERIES="5.10.0-ml.series"], # gt + )] )] )] )]) diff --git a/ldiskfs/kernel_patches/patches/linux-5.10/ext4-attach-jinode-in-writepages.patch b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-attach-jinode-in-writepages.patch new file mode 100644 index 0000000..64e9834 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-attach-jinode-in-writepages.patch @@ -0,0 +1,51 @@ +From 01da8ce642e08594db95d940b3352ad7ee153b09 Mon Sep 17 00:00:00 2001 +From: Shaun Tancheff +Date: Tue, 6 Aug 2019 17:11:57 -0500 +Subject: [PATCH] + linux-5.3/ext4-attach-jinode-in-writepages + +--- + fs/ext4/ext4.h | 1 + + fs/ext4/inode.c | 8 ++++++++ + 2 files changed, 9 insertions(+) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -2972,6 +2972,7 @@ extern void ext4_mb_mark_bb(struct super + int len, int state); + + /* inode.c */ ++#define HAVE_LDISKFS_INFO_JINODE + void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, + struct ext4_inode_info *ei); + int ext4_inode_is_fast_symlink(struct inode *inode); +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -723,6 +723,10 @@ out_sem: + (loff_t)map->m_lblk << inode->i_blkbits; + loff_t length = (loff_t)map->m_len << inode->i_blkbits; + ++ ret = ext4_inode_attach_jinode(inode); ++ if (ret) ++ return ret; ++ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode, + start_byte, length); +@@ -2725,6 +2729,9 @@ static int ext4_writepages(struct addres + mpd.last_page = wbc->range_end >> PAGE_SHIFT; + } + ++ ret = ext4_inode_attach_jinode(inode); ++ if (ret) ++ goto out_writepages; + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); 
+@@ -4156,6 +4163,7 @@ int ext4_inode_attach_jinode(struct inod + jbd2_free_inode(jinode); + return 0; + } ++EXPORT_SYMBOL(ext4_inode_attach_jinode); + + /* + * ext4_truncate() diff --git a/ldiskfs/kernel_patches/patches/linux-5.10/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-data-in-dirent.patch new file mode 100644 index 0000000..63648b8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-data-in-dirent.patch @@ -0,0 +1,749 @@ +this patch implements feature which allows ext4 fs users (e.g. Lustre) +to store data in ext4 dirent. +data is stored in ext4 dirent after file-name, this space is accounted +in de->rec_len. flag EXT4_DIRENT_LUFID added to d_type if extra data +is present. + +make use of dentry->d_fsdata to pass fid to ext4. so no +changes in ext4_add_entry() interface required. + +--- + fs/ext4/dir.c | 13 ++- + fs/ext4/ext4.h | 100 +++++++++++++++++++++++- + fs/ext4/fast_commit.c | 2 + fs/ext4/inline.c | 8 - + fs/ext4/namei.c | 201 +++++++++++++++++++++++++++++++++++++++----------- + fs/ext4/super.c | 4 + 6 files changed, 270 insertions(+), 58 deletions(-) + +diff -ur a/fs/ext4/dir.c b/fs/ext4/dir.c +--- a/fs/ext4/dir.c 2021-12-14 08:28:35.514373035 -0700 ++++ b/fs/ext4/dir.c 2021-12-14 08:29:41.318242202 -0700 +@@ -78,7 +78,7 @@ + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; +- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) ++ else if (unlikely(rlen < EXT4_DIR_ENTRY_LEN(de))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(next_offset > size)) + error_msg = "directory entry overrun"; +@@ -226,7 +226,7 @@ + * failure will be detected in the + * dirent test below. 
*/ + if (ext4_rec_len_from_disk(de->rec_len, +- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) ++ sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); +@@ -449,12 +449,17 @@ + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 0; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + ent_name->len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -463,7 +468,7 @@ + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, ent_name->name, ent_name->len); ++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); + + while (*p) { + parent = *p; +diff -ur a/fs/ext4/ext4.h b/fs/ext4/ext4.h +--- a/fs/ext4/ext4.h 2021-12-14 08:28:35.622372837 -0700 ++++ b/fs/ext4/ext4.h 2021-12-14 08:30:28.938135681 -0700 +@@ -1157,6 +1157,7 @@ + __u32 i_csum_seed; + + kprojid_t i_projid; ++ void *i_dirdata; + }; + + /* +@@ -1178,6 +1179,7 @@ + * Mount flags set via mount options or defaults + */ + #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ ++#define EXT4_MOUNT_DIRDATA 0x00002 /* Data in directory entries */ + #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ + #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ + #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +@@ -2054,6 +2056,7 @@ + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD | \ +@@ -2232,6 
+2235,43 @@ + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. ++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __packed; ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param *p) ++ ++{ ++ if (!ext4_has_feature_dirdata(sb)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + #define EXT4_FT_DIR_CSUM 0xDE + +@@ -2242,8 +2282,16 @@ + */ + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ ++#define EXT4_DIR_REC_LEN_(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) ++#define EXT4_DIR_ENTRY_LEN_(de) (EXT4_DIR_REC_LEN_((de)->name_len +\ ++ ext4_get_dirent_data_len(de))) ++/* ldiskfs */ ++#define EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len)) ++#define EXT4_DIR_ENTRY_LEN(de) EXT4_DIR_ENTRY_LEN_((de)) ++/* lustre osd_handler compat */ ++#define __EXT4_DIR_REC_LEN(name_len) EXT4_DIR_REC_LEN_((name_len)) ++ + #define EXT4_MAX_REC_LEN ((1<<16)-1) + + /* +@@ -2710,11 +2758,11 @@ + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename 
*fname, +- struct ext4_dir_entry_2 **dest_de); ++ struct ext4_dir_entry_2 **dest_de, int *dlen); + void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname); ++ struct ext4_filename *fname, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { + if (!ext4_has_feature_dir_index(inode->i_sb) && +@@ -2730,10 +2778,17 @@ + + static inline unsigned char get_dtype(struct super_block *sb, int filetype) + { +- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) ++ int fl_index = filetype & EXT4_FT_MASK; ++ ++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) + return DT_UNKNOWN; + +- return ext4_filetype_table[filetype]; ++ if (!test_opt(sb, DIRDATA)) ++ return ext4_filetype_table[fl_index]; ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); +@@ -2922,7 +2977,8 @@ + + /* namei.c */ + extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode); ++ struct inode *inode, ++ const void *data1, const void *data2); + extern int ext4_dirblock_csum_verify(struct inode *inode, + struct buffer_head *bh); + extern int ext4_orphan_add(handle_t *, struct inode *); +@@ -2933,6 +2989,8 @@ + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh); ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, const void *, const void *); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern int ext4_search_dir(struct buffer_head *bh, +@@ -3725,6 +3783,36 @@ + return buffer_uptodate(bh); + } + ++/* ++ * Compute the total directory entry data length. 
++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. ++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ struct ext4_dir_entry_tail *t = (struct ext4_dir_entry_tail *)de; ++ ++ if (!t->det_reserved_zero1 && ++ le16_to_cpu(t->det_rec_len) == ++ sizeof(struct ext4_dir_entry_tail) && ++ !t->det_reserved_zero2 && ++ t->det_reserved_ft == EXT4_FT_DIR_CSUM) ++ return 0; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ +diff -ur a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c +--- a/fs/ext4/fast_commit.c 2021-12-14 08:28:35.514373035 -0700 ++++ b/fs/ext4/fast_commit.c 2021-12-14 08:29:41.326242185 -0700 +@@ -1532,7 +1532,7 @@ + jbd_debug(1, "Dir %d not found.", darg.ino); + goto out; + } +- ret = ext4_init_new_dir(NULL, dir, inode); ++ ret = ext4_init_new_dir(NULL, dir, inode, NULL, NULL); + iput(dir); + if (ret) { + ret = 0; +diff -ur a/fs/ext4/inline.c b/fs/ext4/inline.c +--- a/fs/ext4/inline.c 2021-12-14 08:28:35.518373027 -0700 ++++ b/fs/ext4/inline.c 2021-12-14 08:29:41.326242185 -0700 +@@ -1023,7 +1023,7 @@ + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, +- inline_size, fname, &de); ++ inline_size, fname, &de, NULL); + if (err) + return err; + +@@ -1031,7 +1031,7 @@ + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; +- ext4_insert_dentry(inode, de, inline_size, fname); ++ ext4_insert_dentry(inode, de, inline_size, fname, NULL); + 
+ ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + +@@ -1380,7 +1380,7 @@ + fake.name_len = 1; + strcpy(fake.name, "."); + fake.rec_len = ext4_rec_len_to_disk( +- EXT4_DIR_REC_LEN(fake.name_len), ++ EXT4_DIR_ENTRY_LEN(&fake), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +@@ -1390,7 +1390,7 @@ + fake.name_len = 2; + strcpy(fake.name, ".."); + fake.rec_len = ext4_rec_len_to_disk( +- EXT4_DIR_REC_LEN(fake.name_len), ++ EXT4_DIR_ENTRY_LEN(&fake), + inline_size); + ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); + de = &fake; +diff -ur a/fs/ext4/namei.c b/fs/ext4/namei.c +--- a/fs/ext4/namei.c 2021-12-14 08:28:35.638372808 -0700 ++++ b/fs/ext4/namei.c 2021-12-14 08:29:41.330242176 -0700 +@@ -265,7 +265,8 @@ + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, +@@ -409,22 +410,23 @@ + { + struct ext4_dir_entry *dp; + struct dx_root_info *root; +- int count_offset; ++ int count_offset, dot_rec_len, dotdot_rec_len; + + if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + count_offset = 8; +- else if (le16_to_cpu(dirent->rec_len) == 12) { +- dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); ++ else { ++ dot_rec_len = le16_to_cpu(dirent->rec_len); ++ dp = (struct ext4_dir_entry *)(((void *)dirent) + dot_rec_len); + if (le16_to_cpu(dp->rec_len) != +- EXT4_BLOCK_SIZE(inode->i_sb) - 12) ++ EXT4_BLOCK_SIZE(inode->i_sb) - dot_rec_len) + return NULL; +- root = (struct dx_root_info *)(((void *)dp + 12)); ++ dotdot_rec_len = EXT4_DIR_ENTRY_LEN((struct 
ext4_dir_entry_2 *)dp); ++ root = (struct dx_root_info *)(((void *)dp + dotdot_rec_len)); + if (root->reserved_zero || + root->info_length != sizeof(struct dx_root_info)) + return NULL; +- count_offset = 32; +- } else +- return NULL; ++ count_offset = 8 + dot_rec_len + dotdot_rec_len; ++ } + + if (offset) + *offset = count_offset; +@@ -529,11 +531,12 @@ + */ + struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) + { ++ BUG_ON(de->name_len != 1); + /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de)); + + /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_ENTRY_LEN(de)); + + return (struct dx_root_info *)de; + } +@@ -578,10 +581,16 @@ + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - +- EXT4_DIR_REC_LEN(2) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); ++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_ENTRY_LEN(dot_de) - ++ EXT4_DIR_ENTRY_LEN(dotdot_de) - infosize; + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +@@ -701,7 +710,7 @@ + (unsigned) ((char *) de - base)); + #endif + } +- space += EXT4_DIR_REC_LEN(de->name_len); ++ space += EXT4_DIR_ENTRY_LEN(de); + names++; + } + de = ext4_next_entry(de, size); +@@ -808,11 +817,14 @@ + + entries = (struct dx_entry *)(((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, +- info->info_length)) { ++ 
if (dx_get_limit(entries) != ++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, info->info_length)); ++ dx_root_limit(dir, ++ (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)); + goto fail; + } + +@@ -1798,7 +1810,7 @@ + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_ENTRY_LEN(de); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1822,7 +1834,7 @@ + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_ENTRY_LEN(de); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1959,14 +1971,16 @@ + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de) ++ struct ext4_dir_entry_2 **dest_de, int *dlen) + { + struct ext4_dir_entry_2 *de; +- unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); ++ unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)) + ++ (dlen ? *dlen : 0); + int nlen, rlen; + unsigned int offset = 0; + char *top; + ++ dlen ? *dlen = 0 : 0; /* default set to 0 */ + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { +@@ -1975,10 +1989,26 @@ + return -EFSCORRUPTED; + if (ext4_match(dir, fname, de)) + return -EEXIST; +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_ENTRY_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? 
rlen - nlen : rlen) >= reclen) + break; ++ /* Then for dotdot entries, check for the smaller space ++ * required for just the entry, no FID */ ++ if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) { ++ if ((de->inode ? rlen - nlen : rlen) >= ++ EXT4_DIR_REC_LEN(fname_len(fname))) { ++ /* set dlen=1 to indicate not ++ * enough space store fid */ ++ dlen ? *dlen = 1 : 0; ++ break; ++ } ++ /* The new ".." entry must be written over the ++ * previous ".." entry, which is the first ++ * entry traversed by this scan. If it doesn't ++ * fit, something is badly wrong, so -EIO. */ ++ return -EIO; ++ } + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } +@@ -1992,12 +2022,12 @@ + void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname) ++ struct ext4_filename *fname, void *data) + { + + int nlen, rlen; + +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_ENTRY_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = +@@ -2011,6 +2041,11 @@ + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); ++ if (data) { ++ de->name[fname_len(fname)] = 0; ++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + } + + /* +@@ -2028,14 +2063,19 @@ + { + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; +- int err, err2; ++ int err, err2, dlen = 0; ++ unsigned char *data; + ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ EXT4_I(inode)->i_dirdata); + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { ++ if (data) ++ dlen = (*data) + 1; + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, +- blocksize - csum_size, fname, &de); ++ blocksize - csum_size, fname, &de, &dlen); + 
if (err) + return err; + } +@@ -2047,7 +2087,10 @@ + } + + /* By now the buffer is marked for journaling */ +- ext4_insert_dentry(inode, de, blocksize, fname); ++ /* If writing the short form of "dotdot", don't add the data section */ ++ if (dlen == 1) ++ data = NULL; ++ ext4_insert_dentry(inode, de, blocksize, fname, data); + + /* + * XXX shouldn't update any times until successful +@@ -2152,7 +2195,8 @@ + + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = dx_info->hash_version; +@@ -2202,6 +2246,8 @@ + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + int len, journal = 0, err = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2227,11 +2273,16 @@ + goto out_journal; + + journal = 1; +- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de)); + } + +- len -= EXT4_DIR_REC_LEN(1); +- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ len -= EXT4_DIR_ENTRY_LEN(de); ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *)dentry->d_fsdata); ++ if (data) ++ dlen = *data + 1; ++ assert(len == 0 || len >= EXT4_DIR_REC_LEN(2 + dlen)); ++ + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +@@ -2248,7 +2299,12 @@ + assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); + de->name_len = 2; + strcpy(de->name, ".."); +- ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { ++ de->name[2] = 0; ++ memcpy(&de->name[2 + 1], data, *data); ++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + + out_journal: + if (journal) { +@@ -2286,6 +2342,7 @@ + ext4_lblk_t block, blocks; + int csum_size = 0; + ++ EXT4_I(inode)->i_dirdata = 
dentry->d_fsdata; + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + +@@ -2839,37 +2896,70 @@ + return err; + } + ++struct tp_block { ++ struct inode *inode; ++ void *data1; ++ void *data2; ++}; ++ + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) + { ++ void *data1 = NULL, *data2 = NULL; ++ int dot_reclen = 0; ++ ++ if (dotdot_real_len == 10) { ++ struct tp_block *tpb = (struct tp_block *)inode; ++ data1 = tpb->data1; ++ data2 = tpb->data2; ++ inode = tpb->inode; ++ dotdot_real_len = 0; ++ } + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + ++ /* get packed fid data*/ ++ data1 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_ENTRY_LEN(de)); ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; ++ strcpy(de->name, ".."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - +- (csum_size + EXT4_DIR_REC_LEN(1)), ++ (csum_size + dot_reclen), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( +- EXT4_DIR_REC_LEN(de->name_len), blocksize); +- strcpy(de->name, ".."); +- ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ EXT4_DIR_ENTRY_LEN(de), blocksize); + + return 
ext4_next_entry(de, blocksize); + } + + int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { ++ struct tp_block param; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + ext4_lblk_t block = 0; +@@ -2893,7 +2983,11 @@ + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct ext4_dir_entry_2 *)dir_block->b_data; +- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); ++ param.inode = inode; ++ param.data1 = (void *)data1; ++ param.data2 = (void *)data2; ++ ext4_init_dot_dotdot((struct inode *)(&param), de, blocksize, ++ csum_size, dir->i_ino, 10); + set_nlink(inode, 2); + if (csum_size) + ext4_initialize_dirent_tail(dir_block, blocksize); +@@ -2908,6 +3002,29 @@ + return err; + } + ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. */ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, ++ const void *data1, const void *data2) ++{ ++ int rc; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ ext4_handle_sync(handle); ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); ++ if (!rc) ++ rc = ext4_mark_inode_dirty(handle, inode); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ + static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) + { + handle_t *handle; +@@ -2934,7 +3051,7 @@ + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; +- err = ext4_init_new_dir(handle, dir, inode); ++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); +diff -ur a/fs/ext4/super.c b/fs/ext4/super.c +--- a/fs/ext4/super.c 2021-12-14 08:28:35.614372852 -0700 ++++ 
b/fs/ext4/super.c 2021-12-14 08:29:41.334242168 -0700 +@@ -1694,7 +1694,7 @@ + Opt_inlinecrypt, + Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, + Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, +- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, ++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata, + Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, + Opt_dax, Opt_dax_always, Opt_dax_inode, Opt_dax_never, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_warn_on_error, +@@ -1778,6 +1778,7 @@ + {Opt_nolazytime, "nolazytime"}, + {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"}, + {Opt_nodelalloc, "nodelalloc"}, ++ {Opt_dirdata, "dirdata"}, + {Opt_removed, "mblk_io_submit"}, + {Opt_removed, "nomblk_io_submit"}, + {Opt_block_validity, "block_validity"}, +@@ -2017,6 +2018,7 @@ + {Opt_usrjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_grpjquota, 0, MOPT_Q | MOPT_STRING}, + {Opt_offusrjquota, 0, MOPT_Q}, ++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET}, + {Opt_offgrpjquota, 0, MOPT_Q}, + {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT}, + {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, diff --git a/ldiskfs/kernel_patches/patches/linux-5.10/ext4-give-warning-with-dir-htree-growing.patch b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-give-warning-with-dir-htree-growing.patch new file mode 100644 index 0000000..6c16268 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-give-warning-with-dir-htree-growing.patch @@ -0,0 +1,155 @@ +--- + fs/ext4/ext4.h | 1 + fs/ext4/namei.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- + fs/ext4/super.c | 2 + + fs/ext4/sysfs.c | 2 + + 4 files changed, 71 insertions(+), 2 deletions(-) + +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1547,6 +1547,7 @@ struct ext4_sb_info { + unsigned int s_mb_group_prealloc; + unsigned int s_mb_max_inode_prealloc; + unsigned int s_max_dir_size_kb; ++ unsigned long s_warning_dir_size; + /* where last allocation was done - for stream 
allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -775,12 +775,20 @@ struct ext4_dir_lock_data { + #define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) + #define ext4_find_entry(dir, name, dirent, inline) \ + ext4_find_entry_locked(dir, name, dirent, inline, NULL) +-#define ext4_add_entry(handle, dentry, inode) \ +- ext4_add_entry_locked(handle, dentry, inode, NULL) + + /* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ + #define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) + ++inline int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int ret = ext4_add_entry_locked(handle, dentry, inode, NULL); ++ ++ if (ret == -ENOBUFS) ++ ret = 0; ++ return ret; ++} ++ + static void ext4_htree_event_cb(void *target, void *event) + { + u64 *block = (u64 *)target; +@@ -2657,6 +2665,54 @@ out: + return err; + } + ++static unsigned long __ext4_max_dir_size(struct dx_frame *frames, ++ struct dx_frame *frame, struct inode *dir) ++{ ++ unsigned long max_dir_size; ++ ++ if (EXT4_SB(dir->i_sb)->s_max_dir_size_kb) { ++ max_dir_size = EXT4_SB(dir->i_sb)->s_max_dir_size_kb << 10; ++ } else { ++ max_dir_size = EXT4_BLOCK_SIZE(dir->i_sb); ++ while (frame >= frames) { ++ max_dir_size *= dx_get_limit(frame->entries); ++ if (frame == frames) ++ break; ++ frame--; ++ } ++ /* use 75% of max dir size in average */ ++ max_dir_size = max_dir_size / 4 * 3; ++ } ++ return max_dir_size; ++} ++ ++/* ++ * With hash tree growing, it is easy to hit ENOSPC, but it is hard ++ * to predict when it will happen. 
let's give administrators warning ++ * when reaching 3/5 and 2/3 of limit ++ */ ++static inline bool dir_size_in_warning_range(struct dx_frame *frames, ++ struct dx_frame *frame, ++ struct inode *dir) ++{ ++ unsigned long size1, size2; ++ struct super_block *sb = dir->i_sb; ++ ++ if (unlikely(!EXT4_SB(sb)->s_warning_dir_size)) ++ EXT4_SB(sb)->s_warning_dir_size = ++ __ext4_max_dir_size(frames, frame, dir); ++ ++ size1 = EXT4_SB(sb)->s_warning_dir_size / 16 * 10; ++ size1 = size1 & ~(EXT4_BLOCK_SIZE(sb) - 1); ++ size2 = EXT4_SB(sb)->s_warning_dir_size / 16 * 11; ++ size2 = size2 & ~(EXT4_BLOCK_SIZE(sb) - 1); ++ if (in_range(dir->i_size, size1, EXT4_BLOCK_SIZE(sb)) || ++ in_range(dir->i_size, size2, EXT4_BLOCK_SIZE(sb))) ++ return true; ++ ++ return false; ++} ++ + /* + * ext4_add_entry() + * +@@ -2796,6 +2852,7 @@ static int ext4_dx_add_entry(handle_t *h + struct ext4_dir_entry_2 *de; + int restart; + int err; ++ bool ret_warn = false; + + again: + restart = 0; +@@ -2824,6 +2881,11 @@ again: + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); ++ ++ if (frame - frames + 1 >= ext4_dir_htree_level(sb) || ++ EXT4_SB(sb)->s_warning_dir_size) ++ ret_warn = dir_size_in_warning_range(frames, frame, dir); ++ + /* Need to split index? 
*/ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +@@ -2987,6 +3049,8 @@ cleanup: + */ + if (restart && err == 0) + goto again; ++ if (err == 0 && ret_warn) ++ err = -ENOBUFS; + return err; + } + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2243,6 +2243,8 @@ static int handle_mount_opt(struct super + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; ++ /* reset s_warning_dir_size and make it re-calculated */ ++ sbi->s_warning_dir_size = 0; + #ifdef CONFIG_EXT4_DEBUG + } else if (token == Opt_fc_debug_max_replay) { + sbi->s_fc_debug_max_replay = arg; +--- a/fs/ext4/sysfs.c ++++ b/fs/ext4/sysfs.c +@@ -214,6 +214,7 @@ EXT4_ATTR_OFFSET(inode_readahead_blks, 0 + EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); + EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size_kb); + EXT4_RW_ATTR_SBI_UI(max_dir_size_kb, s_max_dir_size_kb); ++EXT4_RW_ATTR_SBI_UI(warning_dir_size, s_warning_dir_size); + EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +@@ -264,6 +265,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(inode_goal), + ATTR_LIST(max_dir_size), + ATTR_LIST(max_dir_size_kb), ++ ATTR_LIST(warning_dir_size), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), diff --git a/ldiskfs/kernel_patches/patches/linux-5.10/ext4-misc.patch b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-misc.patch new file mode 100644 index 0000000..2f0cab7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-misc.patch @@ -0,0 +1,193 @@ +--- + fs/ext4/ext4.h | 29 ++++++++++++++++++++++- + fs/ext4/ialloc.c | 3 ++- + fs/ext4/inode.c | 15 +++++++++++++++ + fs/ext4/namei.c | 9 ++++++--- + fs/ext4/super.c | 10 ++-------- + 5 files changed, 47 insertions(+), 13 deletions(-) + +diff -ur a/fs/ext4/ext4.h b/fs/ext4/ext4.h +--- a/fs/ext4/ext4.h 2021-12-09 
07:30:00.134965109 -0700 ++++ b/fs/ext4/ext4.h 2021-12-09 07:32:16.038650421 -0700 +@@ -1849,6 +1849,8 @@ + + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++#define JOURNAL_START_HAS_3ARGS 1 ++ + /* + * Codes for operating systems + */ +@@ -2089,7 +2091,21 @@ + + EXTN_FEATURE_FUNCS(2) + EXTN_FEATURE_FUNCS(3) +-EXTN_FEATURE_FUNCS(4) ++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_compat & ++ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & ++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & ++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); ++} + + static inline bool ext4_has_compat_features(struct super_block *sb) + { +@@ -3554,6 +3575,11 @@ + #define EXT_MAX_BLOCKS 0xffffffff + + extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); ++extern struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block); + extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); + extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +diff -ur a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +--- a/fs/ext4/ialloc.c 2021-12-09 07:30:00.086965299 -0700 ++++ b/fs/ext4/ialloc.c 2021-12-09 07:30:47.418778395 -0700 +@@ -120,7 +120,7 @@ + * + * Return buffer_head of bitmap on success, or an ERR_PTR on error. 
+ */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -215,6 +215,7 @@ + put_bh(bh); + return ERR_PTR(err); + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! When we get the inode, we're the only people +diff -ur a/fs/ext4/inode.c b/fs/ext4/inode.c +--- a/fs/ext4/inode.c 2021-12-09 07:30:00.126965141 -0700 ++++ b/fs/ext4/inode.c 2021-12-09 07:30:47.422778379 -0700 +@@ -6197,3 +6197,19 @@ + + return ret; + } ++EXPORT_SYMBOL(ext4_map_blocks); ++EXPORT_SYMBOL(ext4_truncate); ++EXPORT_SYMBOL(ext4_iget); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_itable_unused_count); ++EXPORT_SYMBOL(ext4_force_commit); ++EXPORT_SYMBOL(__ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_get_group_desc); ++EXPORT_SYMBOL(__ext4_journal_get_write_access); ++EXPORT_SYMBOL(__ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); ++EXPORT_SYMBOL(__ext4_std_error); ++EXPORT_SYMBOL(ext4fs_dirhash); ++EXPORT_SYMBOL(ext4_get_inode_loc); ++EXPORT_SYMBOL(__ext4_journal_ensure_credits); +diff -ur a/fs/ext4/namei.c b/fs/ext4/namei.c +--- a/fs/ext4/namei.c 2021-12-09 07:30:00.138965094 -0700 ++++ b/fs/ext4/namei.c 2021-12-09 07:30:47.426778363 -0700 +@@ -50,7 +50,7 @@ + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +-static struct buffer_head *ext4_append(handle_t *handle, ++struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) + { +@@ -181,6 +181,7 @@ + } + return bh; + } ++EXPORT_SYMBOL(ext4_append); + + #ifndef assert + #define assert(test) J_ASSERT(test) +@@ -2575,23 +2576,25 @@ + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. 
+ */ +-static void ext4_inc_count(struct inode *inode) ++void ext4_inc_count(struct inode *inode) + { + inc_nlink(inode); + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); + } ++EXPORT_SYMBOL(ext4_inc_count); + + /* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +-static void ext4_dec_count(struct inode *inode) ++void ext4_dec_count(struct inode *inode) + { + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); + } ++EXPORT_SYMBOL(ext4_dec_count); + + + /* +diff -ur a/fs/ext4/super.c b/fs/ext4/super.c +--- a/fs/ext4/super.c 2021-12-09 07:30:00.050965441 -0700 ++++ b/fs/ext4/super.c 2021-12-09 07:30:47.426778363 -0700 +@@ -435,7 +435,7 @@ + return; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_update_tstamp(es, s_last_error_time); +- strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); ++ strlcpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); +@@ -496,7 +496,7 @@ + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + es->s_first_error_time_hi = es->s_last_error_time_hi; +- strncpy(es->s_first_error_func, func, ++ strlcpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; +@@ -6683,16 +6683,12 @@ + if (err) + goto out05; + +- register_as_ext3(); +- register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; + out: +- unregister_as_ext2(); +- unregister_as_ext3(); + out05: + destroy_inodecache(); + out1: +@@ -6716,8 +6712,6 @@ + static void __exit ext4_exit_fs(void) + { + ext4_destroy_lazyinit_thread(); +- unregister_as_ext2(); +- unregister_as_ext3(); + 
unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + ext4_exit_mballoc(); diff --git a/ldiskfs/kernel_patches/patches/linux-5.10/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-pdirop.patch new file mode 100644 index 0000000..2b23d52 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/linux-5.10/ext4-pdirop.patch @@ -0,0 +1,893 @@ +From 1a0f7f0b9c13ef0aa86e125f350b6733bff8db3c Mon Sep 17 00:00:00 2001 +From: Liang Zhen +Date: Fri Jul 8 13:43:08 2011 -0400 +Subject: [PATCH] LU-50 ldiskfs: pdirops patch for ldiskfs + + Single directory performance is critical for HPC workloads. + In a typical use case an application creates a separate output file for each + node and task in a job. As nodes and tasks increase, hundreds of thousands of + files may be created in a single directory within a short window of time. + Today, both filename lookup and file system modifying operations (such as + create and unlink) are protected with a single lock for an entire ldiskfs + directory. PDO project will remove this bottleneck by introducing a parallel + locking mechanism for entire ldiskfs directories. This work will enable + multiple application threads to simultaneously lookup, create and unlink in + parallel.
+ +This patch contains: + - pdirops support for ldiskfs + - integrate with osd-ldiskfs +--- + fs/ext4/Makefile | 1 + fs/ext4/ext4.h | 78 +++++++++ + fs/ext4/namei.c | 454 ++++++++++++++++++++++++++++++++++++++++++++++++++----- + fs/ext4/super.c | 1 + 4 files changed, 494 insertions(+), 40 deletions(-) + create mode 100644 fs/ext4/htree_lock.c + create mode 100644 include/linux/htree_lock.h + +--- a/fs/ext4/Makefile ++++ b/fs/ext4/Makefile +@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ ++ htree_lock.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1003,6 +1004,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +@@ -2398,6 +2402,72 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define 
ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *ext4_find_entry_locked(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); ++ + struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; +@@ -2772,12 +2842,20 @@ void ext4_insert_dentry(struct inode *in + struct ext4_filename *fname, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory here, which will ++ * increase the complexity of the code */ ++#if 0 + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this...
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } ++#endif + } + static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -55,6 +55,7 @@ struct buffer_head *ext4_append(handle_t + ext4_lblk_t *block) + { + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -62,15 +63,22 @@ struct buffer_head *ext4_append(handle_t + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); +- if (IS_ERR(bh)) ++ if (IS_ERR(bh)) { ++ up(&ei->i_append_sem); + return bh; ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); ++ up(&ei->i_append_sem); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); +@@ -271,7 +279,8 @@ static unsigned dx_node_limit(struct ino + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame); ++ struct dx_frame *frame, ++ struct htree_lock *lck); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, +@@ -285,12 +294,13 @@ static void dx_insert_block(struct dx_fr + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * 
ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir); ++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode); ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck); + + /* checksumming functions */ + void ext4_initialize_dirent_tail(struct buffer_head *bh, +@@ -754,6 +764,227 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ ext4_find_entry_locked(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ ext4_add_entry_locked(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} 
++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? 
ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? ++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call multiple times or even if it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch
(ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
+ * +@@ -765,10 +996,11 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(struct ext4_filename *fname, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck) + { + unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); +@@ -830,8 +1062,15 @@ dx_probe(struct ext4_filename *fname, st + + dxtrace(printk("Look up %x", hash)); + while (1) { ++ if (indirect == 0) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); +- if (!count || count > dx_get_limit(entries)) { ++ if (count == 0 || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); +@@ -870,8 +1109,70 @@ dx_probe(struct ext4_filename *fname, st + dx_get_block(at))); + frame->entries = entries; + frame->at = at; +- if (!indirect--) ++ ++ if (indirect == 0) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and 
return */ ++ ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to obtain DX-lock */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } + return frame; ++ } ++ dx = at; ++ indirect--; + frame++; + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(frame->bh)) { +@@ -940,7 +1241,7 @@ static void dx_release(struct dx_frame * + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -955,12 +1256,22 @@ static int ext4_htree_next_block(struct + * this loop, num_frames indicates the number of interior + * nodes need to be read.
+ */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -983,6 +1294,13 @@ static int ext4_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -991,6 +1309,7 @@ static int ext4_htree_next_block(struct + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1135,10 +1454,10 @@ int ext4_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); + if (IS_ERR(frame)) + return PTR_ERR(frame); +- + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1178,7 +1497,7 @@ int ext4_htree_fill_tree(struct file *di + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1454,7 +1773,7 @@ static int is_dx_internal_node(struct in + static struct buffer_head *__ext4_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1496,7 +1815,7 @@ static struct buffer_head *__ext4_find_e + goto restart; + } + if (is_dx(dir)) { +- ret = ext4_dx_find_entry(dir, fname, res_dir); ++ ret = ext4_dx_find_entry(dir, fname, res_dir, lck); + /* + * On success, or if the error was file not found, + * return. Otherwise, fall back to doing a search the +@@ -1506,6 +1825,7 @@ static struct buffer_head *__ext4_find_e + goto cleanup_and_exit; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + ret = NULL; + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); +@@ -1596,10 +1916,10 @@ cleanup_and_exit: + return ret; + } + +-static struct buffer_head *ext4_find_entry(struct inode *dir, ++struct buffer_head *ext4_find_entry_locked(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + int err; + struct ext4_filename fname; +@@ -1611,12 +1931,14 @@ static struct buffer_head *ext4_find_ent + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ++ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); + + ext4_fname_free_filename(&fname); + return bh; + } + ++EXPORT_SYMBOL(ext4_find_entry_locked); ++ + 
static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct dentry *dentry, + struct ext4_dir_entry_2 **res_dir) +@@ -1631,7 +1953,7 @@ static struct buffer_head *ext4_lookup_e + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ++ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); + + ext4_fname_free_filename(&fname); + return bh; +@@ -1639,7 +1961,8 @@ static struct buffer_head *ext4_lookup_e + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block * sb = dir->i_sb; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; +@@ -1650,7 +1973,7 @@ static struct buffer_head * ext4_dx_find + #ifdef CONFIG_FS_ENCRYPTION + *res_dir = NULL; + #endif +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; + do { +@@ -1672,7 +1995,7 @@ static struct buffer_head * ext4_dx_find + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning_inode(dir, + "error %d reading directory index block", +@@ -1852,8 +2175,9 @@ static struct ext4_dir_entry_2* dx_pack_ + * Returns pointer to de in block into which the new entry will be inserted. 
+ */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1924,8 +2248,14 @@ static struct ext4_dir_entry_2 *do_split + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, +- blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -1943,12 +2273,21 @@ static struct ext4_dir_entry_2 *do_split + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirblock(handle, dir, bh2); + if (err) + goto journal_error; +@@ -2218,7 +2557,7 @@ static int make_indexed_dir(handle_t *ha + if (retval) + goto out_frames; + +- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto out_frames; +@@ -2328,8 +2667,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; +@@ -2375,9 +2714,10 @@ static int ext4_add_entry(handle_t *hand + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, &fname, dir, inode); ++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + /* Can we just ignore htree data? 
*/ + if (ext4_has_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, +@@ -2440,12 +2780,14 @@ out: + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(ext4_add_entry_locked); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode) ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2457,7 +2799,7 @@ static int ext4_dx_add_entry(handle_t *h + + again: + restart = 0; +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; +@@ -2492,6 +2834,12 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } ++ + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2594,8 +2942,32 @@ again: + restart = 1; + goto journal_error; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count for twice, the first time before ++ * having DX lock, the second time after holding DX lock. ++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. 
*/ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; +@@ -2606,6 +2978,8 @@ again: + journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1297,6 +1297,7 @@ static struct inode *ext4_alloc_inode(st + + inode_set_iversion(&ei->vfs_inode, 1); + spin_lock_init(&ei->i_raw_lock); ++ sema_init(&ei->i_append_sem, 1); + INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); + spin_lock_init(&ei->i_prealloc_lock); diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series new file mode 100644 index 0000000..6add51a --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-ml.series @@ -0,0 +1,29 @@ +rhel8/ext4-inode-version.patch +linux-5.4/ext4-lookup-dotdot.patch +suse15/ext4-print-inum-in-htree-warning.patch +linux-5.8/ext4-prealloc.patch +ubuntu18/ext4-osd-iop-common.patch +linux-5.10/ext4-misc.patch +linux-5.8/ext4-mballoc-extra-checks.patch +linux-5.4/ext4-hash-indexed-dir-dotdot-update.patch +linux-5.8/ext4-kill-dx-root.patch +linux-5.8/ext4-mballoc-pa-free-mismatch.patch +linux-5.10/ext4-data-in-dirent.patch +rhel8/ext4-nocmtime.patch +base/ext4-htree-lock.patch +linux-5.10/ext4-pdirop.patch +linux-5.8/ext4-max-dir-size.patch +linux-5.8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch 
+linux-5.10/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +linux-5.10/ext4-attach-jinode-in-writepages.patch +rhel8/ext4-dont-check-before-replay.patch +rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7.6/ext4-export-orphan-add.patch +linux-5.8/ext4-export-mb-stream-allocator-variables.patch +ubuntu19/ext4-iget-with-flags.patch +linux-5.4/export-ext4fs-dirhash-helper.patch +linux-5.9/ext4-simple-blockalloc.patch +rhel8.3/ext4-xattr-disable-credits-check.patch +linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch +rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index bf3e463..cdf0a37 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -8025,6 +8025,12 @@ static int osd_mount(const struct lu_env *env, GOTO(out_mnt, rc = -EINVAL); } + if (ldiskfs_has_feature_fast_commit(o->od_mnt->mnt_sb)) { + CERROR("%s: device %s is mounted with fast_commit that breaks recovery\n", + name, dev); + GOTO(out_mnt, rc = -EOPNOTSUPP); + } + #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 53, 0) #ifdef LDISKFS_MOUNT_DIRDATA if (ldiskfs_has_feature_dirdata(o->od_mnt->mnt_sb)) diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 9cc3528..752685f 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -1555,7 +1555,14 @@ osd_index_backup(const struct lu_env *env, struct osd_device *osd, bool backup) # define ldiskfs_has_feature_project(sb) \ LDISKFS_HAS_RO_COMPAT_FEATURE(sb, LDISKFS_FEATURE_RO_COMPAT_PROJECT) # endif +# ifdef LDISKFS_FEATURE_COMPAT_FAST_COMMIT +# define ldiskfs_has_feature_fast_commit(sb) \ + LDISKFS_HAS_COMPAT_FEATURE(sb, LDISKFS_FEATURE_COMPAT_FAST_COMMIT) +# endif +#endif +#ifndef ldiskfs_has_feature_fast_commit +#define ldiskfs_has_feature_fast_commit(sb) false #endif int osd_trunc_lock(struct osd_object *obj, struct
osd_thandle *oh, -- 1.8.3.1