From fc82aef82daa2cb9c43f83431174319f076c0aa7 Mon Sep 17 00:00:00 2001 From: Xinliang Liu Date: Mon, 21 Nov 2022 03:36:38 +0000 Subject: [PATCH] LU-16481 build: add server support for openEuler openEuler uses dnf as its rpm package manager; it is somewhat like RHEL/Fedora. The current openEuler LTS 22.03 kernel is based on Linux 5.10.0. The ldiskfs patches are based on ldiskfs-5.10.0-ml.series; the patches that differ from ldiskfs-5.10.0-ml.series are: oe2203/ext4-misc.patch oe2203/ext4-pdirop.patch used because the openEuler kernel backports new bugfixes, and, based on ldiskfs-5.14.21-sles15sp4.series: linux-5.16/ext4-inode-version.patch ubuntu20.04.3/ext4-simple-blockalloc.patch linux-5.14/ext4-xattr-disable-credits-check.patch used because the openEuler kernel backports new bugfixes. This patch also fixes lbuild so that a kernel config file is no longer required for a patchless-server build, and adds a check that a patched-server build has a patch series. Test notes ---------- This patch was tested with the lbuild command below: ../lustre-release/contrib/lbuild/lbuild --ccache --kerneldir=/home/openeuler/kernel-src-rpm/ --kernelrpm=/home/openeuler/kernel-src-rpm/ --lustre=/home/openeuler/lustre-release/lustre-2.15.54_1_xxx.tar.gz --patchless-server --disable-zfs Note that, because the zfs openEuler build support patches[1] haven't been backported to the stable release branch zfs-2.1-release and tag 2.1.5, the current lbuild doesn't support building zfs rpms for openEuler; you need to build the zfs rpms in the zfs source tree individually with cmd 'make rpms'. Until the openEuler gcc issue[2] is fixed, you need to apply the Lustre rpm spec patch[3]. Until the openEuler kernel symbols providing issue[4] is fixed, you need to install the kmod rpms with cmd 'sudo rpm -ivh --nodeps ./*.aarch64.rpm ' [1] https://github.com/openzfs/zfs/pulls?q=is%3Apr+is%3Aclosed+openeuler [2] https://gitee.com/openeuler/gcc/issues/I5XMD0 [3] diff lustre.spec.in ... 
-%define optflags -g -O2 -Werror +%define optflags -g -O2 -Werror -Wno-stringop-overflow [4] https://gitee.com/src-openeuler/kernel/issues/I6DQDX Test-Parameters: trivial Change-Id: Ie00e7d37ba3965e409b924109085a675bf3f7f4f Signed-off-by: Xinliang Liu Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49652 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Jian Yu Reviewed-by: Shaun Tancheff Reviewed-by: Oleg Drokin --- config/lustre-build-ldiskfs.m4 | 6 +- config/lustre-build-linux.m4 | 14 +- config/lustre-build.m4 | 2 +- contrib/lbuild/lbuild | 11 +- contrib/lbuild/lbuild-oe2203 | 12 + contrib/lbuild/lbuild-rhel | 2 + .../kernel_patches/patches/oe2203/ext4-misc.patch | 202 +++++ .../patches/oe2203/ext4-pdirop.patch | 925 +++++++++++++++++++++ .../series/ldiskfs-5.10.0-oe2203.series | 34 + lustre/ChangeLog | 3 +- .../kernel_patches/targets/5.10-oe2203.target.in | 2 +- 11 files changed, 1201 insertions(+), 12 deletions(-) create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch create mode 100644 ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch create mode 100644 ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index b24493b..c8ce4c2 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -135,9 +135,13 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ [LDISKFS_SERIES="5.8.0-ml.series"])], [LDISKFS_SERIES="5.11.0-40-ubuntu20.series"], [LDISKFS_SERIES="5.11.0-40-ubuntu20.series"]) +], [test x$OPENEULER_KERNEL = xyes], [ + case $OPENEULER_VERSION_NO in + 2203.0) LDISKFS_SERIES="5.10.0-oe2203.series" ;; + esac ]) ]) -# Not RHEL/SLES or Ubuntu .. probably mainline +# Not RHEL/SLES/openEuler or Ubuntu .. 
probably mainline AS_IF([test -z "$LDISKFS_SERIES"], [ AS_VERSION_COMPARE([$LINUXRELEASE],[5.4.0],[], diff --git a/config/lustre-build-linux.m4 b/config/lustre-build-linux.m4 index 63e212a..1ff6c47 100644 --- a/config/lustre-build-linux.m4 +++ b/config/lustre-build-linux.m4 @@ -174,15 +174,19 @@ AC_DEFUN([LB_LINUX_RELEASE], [ # Check for openEuler AS_IF([test "x$KERNEL_FOUND" = "xno"], [ - AC_CACHE_CHECK([for openEuler kernel signature], lb_cv_openeuler_kernel_sig, [ - lb_cv_openeuler_kernel_sig="no" - AS_IF([fgrep -q 'openEuler' $LINUX_OBJ/include/linux/kabi.h], [ - lb_cv_openeuler_kernel_sig="yes" + AC_CACHE_CHECK([for openEuler kernel version number], lb_cv_openeuler_kernel_version, [ + lb_cv_openeuler_kernel_version="" + AS_IF([fgrep -q OPENEULER_VERSION $LINUX_OBJ/include/$VERSION_HDIR/version.h], [ + lb_cv_openeuler_kernel_version=$(awk '/ OPENEULER_MAJOR / { print [$]3 }' \ + $LINUX_OBJ/include/$VERSION_HDIR/version.h).$(awk \ + '/ OPENEULER_MINOR / { print [$]3 }' \ + $LINUX_OBJ/include/$VERSION_HDIR/version.h) ]) ]) - AS_IF([test "x$lb_cv_openeuler_kernel_sig" = "xyes"], [ + AS_IF([test -n "$lb_cv_openeuler_kernel_version"], [ OPENEULER_KERNEL="yes" KERNEL_FOUND="yes" + OPENEULER_VERSION_NO=$lb_cv_openeuler_kernel_version ]) ]) diff --git a/config/lustre-build.m4 b/config/lustre-build.m4 index 93a4693..412abf8 100644 --- a/config/lustre-build.m4 +++ b/config/lustre-build.m4 @@ -452,7 +452,7 @@ AM_CONDITIONAL([DOC], [test x$ENABLE_DOC = x1]) AM_CONDITIONAL([MANPAGES], [test x$enable_manpages = xyes]) AM_CONDITIONAL([LINUX], [test x$lb_target_os = xlinux]) AM_CONDITIONAL([USE_QUILT], [test x$use_quilt = xyes]) -AM_CONDITIONAL([RHEL], [test -f /etc/redhat-release]) +AM_CONDITIONAL([RHEL], [test -f /etc/redhat-release -o -f /etc/openEuler-release]) AM_CONDITIONAL([SUSE], [test -f /etc/SUSE-brand -o -f /etc/SuSE-release]) AM_CONDITIONAL([UBUNTU], [test x$UBUNTU_KERNEL = xyes]) AM_CONDITIONAL([BUILD_LUTF], [test x$enable_lutf = xyes]) diff --git 
a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild index a21d17b..cbe5f58 100755 --- a/contrib/lbuild/lbuild +++ b/contrib/lbuild/lbuild @@ -672,6 +672,12 @@ load_target() { . "$TARGET_FILE" + + # patched kernel build needs a series patches + if ! $PATCHLESS && ! $PATCHLESS_SERVER && [ ! "$SERIES" ]; then + fatal 1 "Distro $DISTRO doesn't support patched kernel server build!! Please use option --patchless-server" + fi + if [ -n "$env_OFED_VERSION" ]; then OFED_VERSION="$env_OFED_VERSION" fi @@ -729,7 +735,7 @@ load_target() { # a new variable. PRISTINE_EXTRA_VERSION=$EXTRA_VERSION - if ! $PATCHLESS && [ ! -f "$CONFIG_FILE" ]; then + if ! $PATCHLESS && ! $PATCHLESS_SERVER && [ ! -f "$CONFIG_FILE" ]; then fatal 1 "Config file for target $TARGET missing from $TOPDIR/lustre/lustre/kernel_patches/kernel_configs/." fi @@ -1746,8 +1752,7 @@ build_with_srpm() { fatal 1 "Could not find the Linux tree in $kernelrpm" fi # download and unpack kernel-debuginfo-common (only in EL) - if [[ $DISTROMAJ =~ rhel ]]; then - local KERNEL_DEBUGINFO="kernel-debuginfo-common-${TARGET_ARCH}-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" + if [[ -n "$KERNEL_DEBUGINFO" ]]; then download_debuginfo_common "$KERNEL_DEBUGINFO" if ! lnxrel="$lnxrel" unpack_linux_devel_rpm \ "$KERNELRPMSBASE/$lnxmaj/$DISTROMAJ/$TARGET_ARCH/$KERNEL_DEBUGINFO"; then diff --git a/contrib/lbuild/lbuild-oe2203 b/contrib/lbuild/lbuild-oe2203 index 2abdcb6..d49a1e6 100644 --- a/contrib/lbuild/lbuild-oe2203 +++ b/contrib/lbuild/lbuild-oe2203 @@ -8,6 +8,8 @@ SPEC_NAME="kernel.spec" DEVEL_PATH_ARCH_DELIMETER="." 
USE_KABI=false RPM_HELPERS_DIR="/usr/lib/rpm/openEuler" +# Pkg which contains ext4 source code +KERNEL_DEBUGINFO="kernel-debugsource-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" # force local definition of %dist into ~/.rpmmacros # to avoid verbose extended strings like ".el9.centos" @@ -30,6 +32,16 @@ find_linux_rpm-oe2203() { find_linux_rpm-rhel "$prefix" "$wanted_kernel" "$pathtorpms" } +kernel_debuginfo_location() { + local base_os="https://repo.openeuler.org/openEuler-22.03-LTS" + + echo "$base_os/update/$TARGET_ARCH/Packages/" +} + +cleanup_rpmmacros() { + sed -i "/^%kernel_module_package/,/^)}$/d" $RMAC +} + apply_kmod_requires_conflicts() { if $PATCHLESS; then # don't allow the patched kernel to be considered as diff --git a/contrib/lbuild/lbuild-rhel b/contrib/lbuild/lbuild-rhel index b5058c4..a052207 100644 --- a/contrib/lbuild/lbuild-rhel +++ b/contrib/lbuild/lbuild-rhel @@ -11,6 +11,8 @@ BUILD_GEN+=".2" # LU-9850 DEVEL_KERNEL_TYPE="devel" RPM_HELPERS_DIR="/usr/lib/rpm/redhat" +# Pkg which contains ext4 source code +KERNEL_DEBUGINFO="kernel-debuginfo-common-${TARGET_ARCH}-${lnxmaj}-${lnxrel}.${TARGET_ARCH}.rpm" # a method which can be overriden by the release specific code get_rpmbuildopts() { diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch new file mode 100644 index 0000000..0e1528f --- /dev/null +++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-misc.patch @@ -0,0 +1,202 @@ +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 57169367362a..f3e369fc4dae 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -1870,6 +1870,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode) + + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++#define JOURNAL_START_HAS_3ARGS 1 ++ + /* + * Codes for operating systems + */ +@@ -2110,7 +2112,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl + + EXTN_FEATURE_FUNCS(2) + EXTN_FEATURE_FUNCS(3) 
+-EXTN_FEATURE_FUNCS(4) ++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_compat & ++ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & ++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & ++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); ++} + + static inline bool ext4_has_compat_features(struct super_block *sb) + { +@@ -3570,6 +3586,13 @@ struct ext4_extent; + #define EXT_MAX_BLOCKS 0xffffffff + + extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); ++extern void ext4_inc_count(struct inode *inode); ++extern void ext4_dec_count(struct inode *inode); ++extern struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block); + extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); + extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c +index 875af329c43e..646c00c81479 100644 +--- a/fs/ext4/ialloc.c ++++ b/fs/ext4/ialloc.c +@@ -120,7 +120,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb, + * + * Return buffer_head of bitmap on success, or an ERR_PTR on error. 
+ */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -215,6 +215,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + put_bh(bh); + return ERR_PTR(err); + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! When we get the inode, we're the only people +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index e868b33ed8f5..70f3bb4ef5eb 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -6215,3 +6215,19 @@ vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) + + return ret; + } ++EXPORT_SYMBOL(ext4_map_blocks); ++EXPORT_SYMBOL(ext4_truncate); ++EXPORT_SYMBOL(ext4_iget); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_itable_unused_count); ++EXPORT_SYMBOL(ext4_force_commit); ++EXPORT_SYMBOL(__ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_get_group_desc); ++EXPORT_SYMBOL(__ext4_journal_get_write_access); ++EXPORT_SYMBOL(__ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); ++EXPORT_SYMBOL(__ext4_std_error); ++EXPORT_SYMBOL(ext4fs_dirhash); ++EXPORT_SYMBOL(ext4_get_inode_loc); ++EXPORT_SYMBOL(__ext4_journal_ensure_credits); +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index f4dd3a7ee965..708822bdbfc1 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -50,7 +50,7 @@ + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +-static struct buffer_head *ext4_append(handle_t *handle, ++struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) + { +@@ -100,6 +100,8 @@ static struct buffer_head *ext4_append(handle_t *handle, + return ERR_PTR(err); + } + ++#define assert(test) J_ASSERT(test) ++ + static int ext4_dx_csum_verify(struct inode *inode, + struct ext4_dir_entry *dirent); + +@@ -209,6 +211,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, + } + return bh; 
+ } ++EXPORT_SYMBOL(ext4_append); + + #ifndef assert + #define assert(test) J_ASSERT(test) +@@ -2626,23 +2629,25 @@ static int ext4_delete_entry(handle_t *handle, + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. + */ +-static void ext4_inc_count(struct inode *inode) ++void ext4_inc_count(struct inode *inode) + { + inc_nlink(inode); + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); + } ++EXPORT_SYMBOL(ext4_inc_count); + + /* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. + */ +-static void ext4_dec_count(struct inode *inode) ++void ext4_dec_count(struct inode *inode) + { + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); + } ++EXPORT_SYMBOL(ext4_dec_count); + + + /* +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index a461beea2aca..00437b8cd097 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -5607,7 +5607,7 @@ static void ext4_update_super(struct super_block *sb) + __ext4_update_tstamp(&es->s_first_error_time, + &es->s_first_error_time_hi, + sbi->s_first_error_time); +- strncpy(es->s_first_error_func, sbi->s_first_error_func, ++ strlcpy(es->s_first_error_func, sbi->s_first_error_func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = + cpu_to_le32(sbi->s_first_error_line); +@@ -5621,7 +5621,7 @@ static void ext4_update_super(struct super_block *sb) + __ext4_update_tstamp(&es->s_last_error_time, + &es->s_last_error_time_hi, + sbi->s_last_error_time); +- strncpy(es->s_last_error_func, sbi->s_last_error_func, ++ strlcpy(es->s_last_error_func, sbi->s_last_error_func, + sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(sbi->s_last_error_line); + es->s_last_error_ino = cpu_to_le32(sbi->s_last_error_ino); +@@ -6837,8 +6837,6 @@ static int __init ext4_init_fs(void) + if (err) + 
goto out05; + +- register_as_ext3(); +- register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; +@@ -6848,8 +6846,6 @@ static int __init ext4_init_fs(void) + printk(KERN_ERR "EXT4-fs: Cannot create netlink socket.\n"); + return 0; + out: +- unregister_as_ext2(); +- unregister_as_ext3(); + ext4_fc_destroy_dentry_cache(); + out05: + destroy_inodecache(); +@@ -6874,8 +6870,6 @@ static int __init ext4_init_fs(void) + static void __exit ext4_exit_fs(void) + { + ext4_destroy_lazyinit_thread(); +- unregister_as_ext2(); +- unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + ext4_fc_destroy_dentry_cache(); + destroy_inodecache(); diff --git a/ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch new file mode 100644 index 0000000..0c99ef8 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/oe2203/ext4-pdirop.patch @@ -0,0 +1,925 @@ +Single directory performance is a critical for HPC workloads. In a +typical use case an application creates a separate output file for +each node and task in a job. As nodes and tasks increase, hundreds +of thousands of files may be created in a single directory within +a short window of time. +Today, both filename lookup and file system modifying operations +(such as create and unlink) are protected with a single lock for +an entire ldiskfs directory. PDO project will remove this +bottleneck by introducing a parallel locking mechanism for entire +ldiskfs directories. This work will enable multiple application +threads to simultaneously lookup, create and unlink in parallel. 
+ +This patch contains: + - pdirops support for ldiskfs + - integrate with osd-ldiskfs +--- + fs/ext4/Makefile | 1 + + fs/ext4/ext4.h | 78 ++++++++ + fs/ext4/namei.c | 464 +++++++++++++++++++++++++++++++++++++++++++---- + fs/ext4/super.c | 1 + + 4 files changed, 505 insertions(+), 39 deletions(-) + +diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile +index 49e7af6..f7ced03 100644 +--- a/fs/ext4/Makefile ++++ b/fs/ext4/Makefile +@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ ++ htree_lock.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ +diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h +index 3c6fa2b..c4c5aae 100644 +--- a/fs/ext4/ext4.h ++++ b/fs/ext4/ext4.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1013,6 +1014,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. 
Constant across the lifetime of the inode, +@@ -2429,6 +2433,72 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define 
ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *ext4_find_entry_locked(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); ++ + struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; +@@ -2803,12 +2873,20 @@ void ext4_insert_dentry(struct inode *inode, + struct ext4_filename *fname, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!ext4_has_feature_dir_index(inode->i_sb) && + ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { + /* ext4_iget() should have caught this... 
*/ + WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); + } ++#endif + } + static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index 24e1276..4bf1d99 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -56,6 +56,7 @@ struct buffer_head *ext4_append(handle_t *handle, + { + struct ext4_map_blocks map; + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -63,6 +64,10 @@ struct buffer_head *ext4_append(handle_t *handle, + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + map.m_lblk = *block; + map.m_len = 1; +@@ -73,19 +78,25 @@ struct buffer_head *ext4_append(handle_t *handle, + * directory. 
+ */ + err = ext4_map_blocks(NULL, inode, &map, 0); +- if (err < 0) ++ if (err < 0) { ++ up(&ei->i_append_sem); + return ERR_PTR(err); ++ } + if (err) { ++ up(&ei->i_append_sem); + EXT4_ERROR_INODE(inode, "Logical block already allocated"); + return ERR_PTR(-EFSCORRUPTED); + } + + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); +- if (IS_ERR(bh)) ++ if (IS_ERR(bh)) { ++ up(&ei->i_append_sem); + return bh; ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + err = ext4_mark_inode_dirty(handle, inode); ++ up(&ei->i_append_sem); + if (err) + goto out; + BUFFER_TRACE(bh, "get_write_access"); +@@ -301,7 +312,8 @@ static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame); ++ struct dx_frame *frame, ++ struct htree_lock *lck); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct inode *dir, struct buffer_head *bh, + struct dx_hash_info *hinfo, +@@ -315,12 +327,13 @@ static void dx_insert_block(struct dx_frame *frame, + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir); ++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode); ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck); + + /* checksumming functions */ + void ext4_initialize_dirent_tail(struct buffer_head *bh, +@@ -784,6 +797,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + } + #endif /* DX_DEBUG */ + ++/* private data for htree_lock */ ++struct 
ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ ext4_find_entry_locked(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ ext4_add_entry_locked(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock 
*/ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's change operation, otherwise ++ * relock it with PR mode. It's noop if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? 
++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call mutiple times or even it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ 
ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
+ * +@@ -795,10 +1029,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, + */ + static struct dx_frame * + dx_probe(struct ext4_filename *fname, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck) + { + unsigned count, indirect, level, i; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); +@@ -864,8 +1099,16 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + level = 0; + blocks[0] = 0; + while (1) { ++ if (indirect == level) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be noop if ++ * DX-lock flag is not set for current operation ++ */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); + if (!count || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); +@@ -905,6 +1148,74 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + frame->entries = entries; + frame->at = at; + ++ if (indirect == level) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return ++ */ ++ 
ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to obtain DX-lock ++ */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, so @at can't be trusted ++ * after I release spinlock so I have to save it ++ */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block ++ */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * valid DE-block ++ */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } ++ return frame; ++ } ++ dx = at; ++ + block = dx_get_block(at); + for (i = 0; i <= level; i++) { + if (blocks[i] == block) { +@@ -914,8 +1225,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, + goto fail; + } + } +- if (++level > indirect) +- return frame; ++ ++level; + blocks[level] = block; + frame++; + frame->bh = ext4_read_dirblock(dir, block, INDEX); +@@ -986,7 +1296,7 @@ static void dx_release(struct dx_frame *frames) + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -1001,12 
+1311,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -1029,6 +1349,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collision again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -1037,6 +1364,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1181,10 +1509,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); + if (IS_ERR(frame)) + return PTR_ERR(frame); +- + /* Add '.' and '..' 
from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1224,7 +1552,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1507,7 +1835,7 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, + static struct buffer_head *__ext4_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1549,7 +1877,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, + goto restart; + } + if (is_dx(dir)) { +- ret = ext4_dx_find_entry(dir, fname, res_dir); ++ ret = ext4_dx_find_entry(dir, fname, res_dir, lck); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -1559,6 +1887,7 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, + goto cleanup_and_exit; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + ret = NULL; + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); +@@ -1649,10 +1978,10 @@ cleanup_and_exit: + return ret; + } + +-static struct buffer_head *ext4_find_entry(struct inode *dir, ++struct buffer_head *ext4_find_entry_locked(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + int err; + struct ext4_filename fname; +@@ -1664,12 +1993,14 @@ static struct buffer_head *ext4_find_entry(struct inode *dir, + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, inlined); ++ bh = __ext4_find_entry(dir, &fname, res_dir, inlined, lck); + + ext4_fname_free_filename(&fname); + return bh; + } + ++EXPORT_SYMBOL(ext4_find_entry_locked); ++ + static struct buffer_head *ext4_lookup_entry(struct inode *dir, + struct dentry *dentry, + struct ext4_dir_entry_2 **res_dir) +@@ -1684,7 +2015,7 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + if (err) + return ERR_PTR(err); + +- bh = __ext4_find_entry(dir, &fname, res_dir, NULL); ++ bh = __ext4_find_entry(dir, &fname, res_dir, NULL, NULL); + + ext4_fname_free_filename(&fname); + return bh; +@@ -1692,7 +2023,8 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir, + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block * sb = dir->i_sb; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; +@@ -1703,7 +2035,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + #ifdef CONFIG_FS_ENCRYPTION + *res_dir = NULL; + #endif +- 
frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; + do { +@@ -1725,7 +2057,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning_inode(dir, + "error %d reading directory index block", +@@ -1912,8 +2244,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned continued; +@@ -1990,6 +2323,13 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, +- blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } ++ + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -2007,12 +2349,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + +- /* Which block gets the new entry? 
*/ +- if (hinfo->hash >= hash2) { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirblock(handle, dir, bh2); + if (err) + goto journal_error; +@@ -2283,7 +2634,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + if (retval) + goto out_frames; + +- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto out_frames; +@@ -2393,8 +2744,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int ext4_add_entry_locked(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; +@@ -2443,9 +2794,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, &fname, dir, inode); ++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + /* Can we just ignore htree data? 
*/ + if (ext4_has_metadata_csum(sb)) { + EXT4_ERROR_INODE(dir, +@@ -2508,12 +2860,14 @@ out: + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(ext4_add_entry_locked); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode) ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2525,7 +2879,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + + again: + restart = 0; +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; +@@ -2560,6 +2914,12 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } ++ + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2661,8 +3021,32 @@ again: + restart = 1; + goto journal_error; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count twice, the first time before ++ * having DX lock, the second time after holding DX lock. ++ * ++ * NB: We never free blocks for directory so far, which ++ * means value returned by dx_get_count() should be equal to ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to valid dx_entry. 
*/ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; +@@ -2673,6 +3057,8 @@ again: + journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index f7614a5..3af5d10 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -1336,6 +1336,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) + + inode_set_iversion(&ei->vfs_inode, 1); + spin_lock_init(&ei->i_raw_lock); ++ sema_init(&ei->i_append_sem, 1); + INIT_LIST_HEAD(&ei->i_prealloc_list); + atomic_set(&ei->i_prealloc_active, 0); + spin_lock_init(&ei->i_prealloc_lock); +-- +2.33.0 + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series new file mode 100644 index 0000000..b667234 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-5.10.0-oe2203.series @@ -0,0 +1,34 @@ +linux-5.16/ext4-inode-version.patch +linux-5.4/ext4-lookup-dotdot.patch +suse15/ext4-print-inum-in-htree-warning.patch +linux-5.8/ext4-prealloc.patch +ubuntu18/ext4-osd-iop-common.patch +oe2203/ext4-misc.patch +linux-5.8/ext4-mballoc-extra-checks.patch +linux-5.4/ext4-hash-indexed-dir-dotdot-update.patch +linux-5.8/ext4-kill-dx-root.patch +linux-5.8/ext4-mballoc-pa-free-mismatch.patch +linux-5.10/ext4-data-in-dirent.patch +rhel8/ext4-nocmtime.patch +base/ext4-htree-lock.patch +oe2203/ext4-pdirop.patch 
+linux-5.8/ext4-max-dir-size.patch +linux-5.8/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +linux-5.10/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +linux-5.10/ext4-attach-jinode-in-writepages.patch +rhel8/ext4-dont-check-before-replay.patch +rhel7.6/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7.6/ext4-export-orphan-add.patch +linux-5.8/ext4-export-mb-stream-allocator-variables.patch +ubuntu19/ext4-iget-with-flags.patch +linux-5.4/export-ext4fs-dirhash-helper.patch +ubuntu20.04.3/ext4-simple-blockalloc.patch +linux-5.14/ext4-xattr-disable-credits-check.patch +linux-5.8/ext4-no-max-dir-size-limit-for-iam-objects.patch +rhel8/ext4-ialloc-uid-gid-and-pass-owner-down.patch +base/ext4-projid-xattrs.patch +linux-5.8/ext4-enc-flag.patch +base/ext4-delayed-iput.patch +linux-5.10/ext4-fiemap-kernel-data.patch +rhel8/ext4-old_ea_inodes_handling_fix.patch diff --git a/lustre/ChangeLog b/lustre/ChangeLog index e2bd123..aed3e69 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -25,6 +25,7 @@ TBD Whamcloud vanilla linux 5.4.0 (ZFS + ldiskfs) vanilla linux 5.4.21 (ZFS + ldiskfs) vanilla linux 5.4.136 (ZFS + ldiskfs) + 5.10.0-60.79.0.103.oe2203 (openEuler 22.03 LTS) * ldiskfs needs an ldiskfs patch series for that kernel, ZFS does not * Client primary kernels built and tested during release cycle: 5.14.0-162.12.1.el9 (RHEL9.1) @@ -59,7 +60,7 @@ TBD Whamcloud 5.8.0-53 (Ubuntu 20.04.2 HWE) 5.11.0-31 (Ubuntu 20.04.3 HWE) 5.11.0 (vanilla kernel.org) - 5.10.0-60.56.0.84.oe2203 (openEuler 22.03 LTS) + 5.10.0 (openEuler 22.03 LTS) * Recommended e2fsprogs version: 1.46.6-wc1 or newer * Recommended ZFS version: 2.1.5 * NFS export disabled when stack size < 8192 (32-bit Lustre clients), diff --git a/lustre/kernel_patches/targets/5.10-oe2203.target.in b/lustre/kernel_patches/targets/5.10-oe2203.target.in index 5ba89d2..c049ac2 100644 --- a/lustre/kernel_patches/targets/5.10-oe2203.target.in +++ 
b/lustre/kernel_patches/targets/5.10-oe2203.target.in @@ -1,5 +1,5 @@ lnxmaj="5.10.0" -lnxrel="60.56.0.84.oe2203" +lnxrel="60.79.0.103.oe2203" KERNEL_SRPM=kernel-${lnxmaj}-${lnxrel}.src.rpm SERIES="" -- 1.8.3.1