From: Li Dongyang Date: Thu, 19 Jul 2018 16:24:36 +0000 (-0400) Subject: LU-11071 build: Add server build support for Ubuntu 18.04 X-Git-Tag: 2.11.54~42 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=e41bdca755954beeb47ee6653ed9e140b7338e96 LU-11071 build: Add server build support for Ubuntu 18.04 This enables the server build for Ubuntu 18.04 LTS. The ldiskfs patches are based on Gael's 4.12 support and apply to kernel versions 4.15.0-20.21 through 4.15.0-23.25. There is also a small fix to make dpkg happy when installing Lustre packages that require lustre-client-utils. Test-Parameters: clientdistro=ubuntu1604 trivial Signed-off-by: Li Dongyang Signed-off-by: Gael Delbary Change-Id: I65e1a5ee0d17115f23ba071ff1ab23b4fb22e78f Reviewed-on: https://review.whamcloud.com/32613 Tested-by: Jenkins Reviewed-by: Quentin Bouget Tested-by: Maloo Reviewed-by: Yang Sheng Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- diff --git a/autoMakefile.am b/autoMakefile.am index fb20ce8..aaa8328 100644 --- a/autoMakefile.am +++ b/autoMakefile.am @@ -282,8 +282,11 @@ debs: undef.h if test "x@ENABLEO2IB@" = "xno"; then \ export IB_OPTIONS="--with-o2ib=no"; \ else \ - if test -n "@O2IBPATH@"; then \ + if test "x@ENABLEO2IB@" != "xyes" && \ + test -n "@O2IBPATH@"; then \ export IB_OPTIONS="--with-o2ib=@O2IBPATH@"; \ + else \ + export IB_OPTIONS="--with-o2ib=yes"; \ fi; \ fi; \ export KSRC_TREE=$(LINUX) && \ diff --git a/config/lustre-build-ldiskfs.m4 b/config/lustre-build-ldiskfs.m4 index 7c5640d..06a9546 100644 --- a/config/lustre-build-ldiskfs.m4 +++ b/config/lustre-build-ldiskfs.m4 @@ -56,23 +56,33 @@ AS_IF([test x$RHEL_KERNEL = xyes], [ )], [LDISKFS_SERIES="4.4-sles12sp3.series"], [LDISKFS_SERIES="4.4-sles12sp3.series"]) ], [test x$UBUNTU_KERNEL = xyes], [ - AS_VERSION_COMPARE([$LINUXRELEASE],[4.4.0], - [], - [ - KPLEV=$(echo $LINUXRELEASE | sed -n 's/.*-\([0-9]\+\).*/\1/p') - AS_IF( - [test -z "$KPLEV"], [ - AC_MSG_WARN([Failed to determine Kernel patch level. Assume latest.]) - LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series" - ], - [test $KPLEV -ge 73], [LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"], - [test $KPLEV -ge 62], [LDISKFS_SERIES="4.4.0-62-ubuntu14+16.series"], - [test $KPLEV -ge 49], [LDISKFS_SERIES="4.4.0-49-ubuntu14+16.series"], - [LDISKFS_SERIES="4.4.0-45-ubuntu14+16.series"] - ) - ], - [LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"] - ) + AS_VERSION_COMPARE([$LINUXRELEASE],[4.15.0],[ + AS_VERSION_COMPARE([$LINUXRELEASE],[4.4.0], [], + [ + KPLEV=$(echo $LINUXRELEASE | sed -n 's/.*-\([0-9]\+\).*/\1/p') + AS_IF( + [test -z "$KPLEV"], [ + AC_MSG_WARN([Failed to determine Kernel patch level. Assume latest.]) + LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series" + ], + [test $KPLEV -ge 73], [LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"], + [test $KPLEV -ge 62], [LDISKFS_SERIES="4.4.0-62-ubuntu14+16.series"], + [test $KPLEV -ge 49], [LDISKFS_SERIES="4.4.0-49-ubuntu14+16.series"], + [LDISKFS_SERIES="4.4.0-45-ubuntu14+16.series"] + ) + ], + [LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"])], + [ + KPLEV=$(echo $LINUXRELEASE | sed -n 's/.*-\([0-9]\+\).*/\1/p') + AS_IF( + [test -z "$KPLEV"], [ + AC_MSG_WARN([Failed to determine Kernel patch level.
Assume latest.]) + LDISKFS_SERIES="4.15.0-20-ubuntu18.series" + ], + [test $KPLEV -ge 20], [LDISKFS_SERIES="4.15.0-20-ubuntu18.series"] + ) + ], + [LDISKFS_SERIES="4.15.0-20-ubuntu18.series"]) ]) ]) AS_IF([test -z "$LDISKFS_SERIES"], diff --git a/config/lustre-build-linux.m4 b/config/lustre-build-linux.m4 index d0e604e..ba250e3 100644 --- a/config/lustre-build-linux.m4 +++ b/config/lustre-build-linux.m4 @@ -210,11 +210,13 @@ AC_DEFUN([LB_ARG_CANON_PATH], [ # Find paths for linux, handling kernel-source rpms # AC_DEFUN([LB_LINUX_PATH], [ -for DEFAULT_LINUX in /lib/modules/$(uname -r)/{source,build} /usr/src/linux $(find /usr/src/kernels/ -maxdepth 1 -name @<:@0-9@:>@\* | xargs -r ls -d | tail -n 1); do +for DEFAULT_LINUX in /usr/src/linux-source-* /lib/modules/$(uname -r)/{source,build} /usr/src/linux $(find /usr/src/kernels/ -maxdepth 1 -name @<:@0-9@:>@\* | xargs -r ls -d | tail -n 1); do AS_IF([readlink -q -e $DEFAULT_LINUX >/dev/null], [break]) done if test "$DEFAULT_LINUX" = "/lib/modules/$(uname -r)/source"; then PATHS="/lib/modules/$(uname -r)/build" +else + PATHS="/usr/src/linux-headers-$(uname -r)" fi PATHS+=" $DEFAULT_LINUX" for DEFAULT_LINUX_OBJ in $PATHS; do diff --git a/config/lustre-build-zfs.m4 b/config/lustre-build-zfs.m4 index 0bcdf20..5120ce0 100644 --- a/config/lustre-build-zfs.m4 +++ b/config/lustre-build-zfs.m4 @@ -15,8 +15,8 @@ dnl # dnl # * /var/lib/dkms/zfs/${VERSION}/source dnl # * /usr/src/zfs-${VERSION}/${LINUXRELEASE} dnl # * /usr/src/zfs-${VERSION} -dnl # * ../spl/ -dnl # * $LINUX +dnl # * ../zfs/ +dnl # * $LINUX/zfs dnl # dnl # --with-zfs-devel=path dnl # - User provided directory where zfs development headers @@ -46,7 +46,7 @@ dnl # * /var/lib/dkms/spl/${VERSION}/source dnl # * /usr/src/spl-${VERSION}/${LINUXRELEASE} dnl # * /usr/src/spl-${VERSION} dnl # * ../spl/ -dnl # * $LINUX +dnl # * $LINUX/spl dnl # dnl # --with-spl=path - Enable spl support and use the spl headers in the dnl # provided path. No autodetection is performed. @@ -76,7 +76,7 @@ AC_DEFUN([LB_SPL], [ splsrc1="/usr/src/spl-${splver}/${LINUXRELEASE}" splsrc2="/usr/src/spl-${splver}" splsrc3="../spl/" - splsrc4="$LINUX" + splsrc4="$LINUX/spl" AC_MSG_CHECKING([spl source directory]) AS_IF([test -z "${splsrc}"], [ @@ -194,7 +194,7 @@ AC_DEFUN([LB_ZFS], [ zfssrc1="/usr/src/zfs-${zfsver}/${LINUXRELEASE}" zfssrc2="/usr/src/zfs-${zfsver}" zfssrc3="../zfs/" - zfssrc4="$LINUX" + zfssrc4="$LINUX/zfs" AC_MSG_CHECKING([zfs source directory]) AS_IF([test -z "${zfssrc}"], [ diff --git a/debian/changelog b/debian/changelog index 4747a05..5a39ecc 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +lustre (2.11.52-108-g258f0cf-dirty-1) unstable; urgency=low + + * Automated changelog entry update + + -- Brian J. Murrell Wed, 18 Jul 2018 08:36:49 -0600 + lustre (2.7.50-1) unstable; urgency=low * Automated changelog entry update diff --git a/debian/control b/debian/control index 57cf797..3cd0238 100644 --- a/debian/control +++ b/debian/control @@ -4,19 +4,19 @@ Priority: optional Maintainer: Brian J. Murrell Uploaders: Brian J. 
Murrell Standards-Version: 3.8.3 -Build-Depends: module-assistant, libreadline-dev, debhelper (>=7.0.0), dpatch, automake (>=1.7) | automake1.7 | automake1.8 | automake1.9, libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, bzip2, quilt, linux-headers-generic | linux-headers | linux-headers-amd64, rsync -Homepage: https://wiki.hpdd.intel.com/ -Vcs-Git: git://git.hpdd.intel.com/fs/lustre-release.git +Build-Depends: module-assistant, libreadline-dev, debhelper (>=9.0.0), dpatch, automake (>=1.7) | automake1.7 | automake1.8 | automake1.9, pkg-config, libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, bzip2, quilt, linux-headers-generic | linux-headers | linux-headers-amd64, rsync +Homepage: https://wiki.whamcloud.com/ +Vcs-Git: git://git.whamcloud.com/fs/lustre-release.git Package: lustre-source Section: admin Architecture: all Priority: optional -Depends: module-assistant, bzip2, debhelper (>= 7.0.0), libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, dpatch +Depends: module-assistant, bzip2, debhelper (>= 9.0.0), libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, dpatch, pkg-config Description: source for Lustre filesystem client kernel modules Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package contains the module source. The client kernel modules can be built for kernels 2.6.32+ with the use of module-assistant @@ -29,8 +29,8 @@ Priority: optional Depends: ${shlibs:Depends}, ${misc:Depends}, libyaml-0-2, libselinux1, libsnmp30, zlib1g, perl Description: Userspace utilities for the Lustre filesystem (client) Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a number of userspace utilities for accessing Lustre filesystems from a client. If you need server utils, @@ -41,13 +41,13 @@ Section: utils Architecture: i386 powerpc ppc64el amd64 ia64 arm64 Priority: optional Depends: ${shlibs:Depends}, ${misc:Depends}, libyaml-0-2, libselinux1, libsnmp30, zlib1g, perl -Provides: lustre-server-utils, lustre-client-utils +Provides: lustre-server-utils, lustre-client-utils (= ${binary:Version}) Conflicts: lustre-client-utils Replaces: lustre-client-utils Description: Userspace utilities for the Lustre filesystem (server) Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a number of userspace utilities for accessing and maintaining Lustre filesystems from a server. @@ -61,8 +61,8 @@ Priority: optional Depends: lustre-client-utils (= ${binary:Version}), python, perl, sg3-utils Description: Collection of benchmark tools for the Lustre filesystem Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . 
This package provides a collection of benchmark tools @@ -73,8 +73,8 @@ Priority: optional Depends: lustre-iokit (= ${binary:Version}), attr, rsync, perl, lsof, mpi-default-bin Description: Test suite for the Lustre filesystem Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a number of test utilities for the Lustre filesystem. @@ -85,7 +85,7 @@ Architecture: i386 powerpc ppc64el amd64 ia64 arm64 Depends: lustre-client-utils (= ${binary:Version}) Description: Development files for the Lustre filesystem Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides development libraries for the Lustre filesystem. diff --git a/debian/control.main b/debian/control.main index 57cf797..3cd0238 100644 --- a/debian/control.main +++ b/debian/control.main @@ -4,19 +4,19 @@ Priority: optional Maintainer: Brian J. Murrell Uploaders: Brian J. Murrell Standards-Version: 3.8.3 -Build-Depends: module-assistant, libreadline-dev, debhelper (>=7.0.0), dpatch, automake (>=1.7) | automake1.7 | automake1.8 | automake1.9, libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, bzip2, quilt, linux-headers-generic | linux-headers | linux-headers-amd64, rsync -Homepage: https://wiki.hpdd.intel.com/ -Vcs-Git: git://git.hpdd.intel.com/fs/lustre-release.git +Build-Depends: module-assistant, libreadline-dev, debhelper (>=9.0.0), dpatch, automake (>=1.7) | automake1.7 | automake1.8 | automake1.9, pkg-config, libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, bzip2, quilt, linux-headers-generic | linux-headers | linux-headers-amd64, rsync +Homepage: https://wiki.whamcloud.com/ +Vcs-Git: git://git.whamcloud.com/fs/lustre-release.git Package: lustre-source Section: admin Architecture: all Priority: optional -Depends: module-assistant, bzip2, debhelper (>= 7.0.0), libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, dpatch +Depends: module-assistant, bzip2, debhelper (>= 9.0.0), libtool, libyaml-dev, libselinux-dev, libsnmp-dev, mpi-default-dev, dpatch, pkg-config Description: source for Lustre filesystem client kernel modules Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package contains the module source. The client kernel modules can be built for kernels 2.6.32+ with the use of module-assistant @@ -29,8 +29,8 @@ Priority: optional Depends: ${shlibs:Depends}, ${misc:Depends}, libyaml-0-2, libselinux1, libsnmp30, zlib1g, perl Description: Userspace utilities for the Lustre filesystem (client) Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a number of userspace utilities for accessing Lustre filesystems from a client. 
If you need server utils, @@ -41,13 +41,13 @@ Section: utils Architecture: i386 powerpc ppc64el amd64 ia64 arm64 Priority: optional Depends: ${shlibs:Depends}, ${misc:Depends}, libyaml-0-2, libselinux1, libsnmp30, zlib1g, perl -Provides: lustre-server-utils, lustre-client-utils +Provides: lustre-server-utils, lustre-client-utils (= ${binary:Version}) Conflicts: lustre-client-utils Replaces: lustre-client-utils Description: Userspace utilities for the Lustre filesystem (server) Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a number of userspace utilities for accessing and maintaining Lustre filesystems from a server. @@ -61,8 +61,8 @@ Priority: optional Depends: lustre-client-utils (= ${binary:Version}), python, perl, sg3-utils Description: Collection of benchmark tools for the Lustre filesystem Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a collection of benchmark tools @@ -73,8 +73,8 @@ Priority: optional Depends: lustre-iokit (= ${binary:Version}), attr, rsync, perl, lsof, mpi-default-bin Description: Test suite for the Lustre filesystem Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides a number of test utilities for the Lustre filesystem. @@ -85,7 +85,7 @@ Architecture: i386 powerpc ppc64el amd64 ia64 arm64 Depends: lustre-client-utils (= ${binary:Version}) Description: Development files for the Lustre filesystem Lustre is a scalable, secure, robust, highly-available cluster file system. - This release is maintained by Intel Corporation and available from - https://wiki.hpdd.intel.com/ + This release is maintained by Whamcloud and available from + https://wiki.whamcloud.com/ . This package provides development libraries for the Lustre filesystem. diff --git a/debian/control.modules.in b/debian/control.modules.in index c4d10ec..33735a5 100644 --- a/debian/control.modules.in +++ b/debian/control.modules.in @@ -4,7 +4,7 @@ Priority: optional Maintainer: Brian J. Murrell Uploaders: Brian J. 
Murrell Standards-Version: 3.7.2 -Build-Depends: debhelper (>= 7.0.0), bzip2 +Build-Depends: debhelper (>= 9.0.0), bzip2 Package: lustre-client-modules-_KVERS_ Architecture: any diff --git a/debian/lustre-client-utils.conffiles b/debian/lustre-client-utils.conffiles deleted file mode 100644 index 164a7d9..0000000 --- a/debian/lustre-client-utils.conffiles +++ /dev/null @@ -1,4 +0,0 @@ -/etc/lnet.conf -/etc/lnet_routes.conf -/etc/modprobe.d/ko2iblnd.conf -/etc/udev/rules.d/99-lustre.rules diff --git a/debian/lustre-server-utils.conffiles b/debian/lustre-server-utils.conffiles deleted file mode 100644 index fd49053..0000000 --- a/debian/lustre-server-utils.conffiles +++ /dev/null @@ -1,5 +0,0 @@ -/etc/ldev.conf -/etc/lnet.conf -/etc/lnet_routes.conf -/etc/modprobe.d/ko2iblnd.conf -/etc/udev/rules.d/99-lustre.rules diff --git a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-eas.patch index a321b21..968ab15 100644 --- a/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/rhel6.3/ext4-large-eas.patch @@ -18,19 +18,6 @@ Index: linux-stage/fs/ext4/ext4.h EXT4_FEATURE_INCOMPAT_MMP| \ EXT4_FEATURE_INCOMPAT_DIRDATA) -@@ -1695,6 +1697,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb -+ * This limit is arbitrary, but is reasonable for the xattr API. -+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -1706,6 +1714,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch index afe5447..35ca4d5 100644 --- a/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/rhel7.2/ext4-large-eas.patch @@ -16,19 +16,6 @@ Index: linux-stage/fs/ext4/ext4.h EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_DIRDATA| \ EXT4_FEATURE_INCOMPAT_INLINE_DATA) -@@ -1979,6 +1980,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1MB -+ * This limit is arbitrary, but is reasonable for the xattr API. -+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -1990,6 +1997,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-large-eas.patch index bc87ecc..5ef75ea 100644 --- a/ldiskfs/kernel_patches/patches/rhel7/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-large-eas.patch @@ -16,19 +16,6 @@ Index: linux-stage/fs/ext4/ext4.h EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_DIRDATA| \ EXT4_FEATURE_INCOMPAT_INLINE_DATA) -@@ -1979,6 +1980,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb -+ * This limit is arbitrary, but is reasonable for the xattr API. 
-+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -1990,6 +1997,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch index 1c784d9..38e5ff5 100644 --- a/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/sles11sp2/ext4-large-eas.patch @@ -16,19 +16,6 @@ Index: linux-stage/fs/ext4/ext4.h EXT4_FEATURE_INCOMPAT_MMP| \ EXT4_FEATURE_INCOMPAT_DIRDATA) -@@ -1764,6 +1765,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb -+ * This limit is arbitrary, but is reasonable for the xattr API. -+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -1775,6 +1782,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/sles12/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/sles12/ext4-large-eas.patch index 8430ef1..6dfcb5c 100644 --- a/ldiskfs/kernel_patches/patches/sles12/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/sles12/ext4-large-eas.patch @@ -16,19 +16,6 @@ Index: linux-stage/fs/ext4/ext4.h EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_DIRDATA| \ EXT4_FEATURE_INCOMPAT_INLINE_DATA) -@@ -1945,6 +1946,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1MB -+ * This limit is arbitrary, but is reasonable for the xattr API. -+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -1956,6 +1963,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/sles12sp2/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/sles12sp2/ext4-large-eas.patch index 851e45f..0bf0c7d 100644 --- a/ldiskfs/kernel_patches/patches/sles12sp2/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/sles12sp2/ext4-large-eas.patch @@ -16,19 +16,6 @@ Index: linux-stage/fs/ext4/ext4.h EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_DIRDATA| \ EXT4_FEATURE_INCOMPAT_INLINE_DATA) -@@ -1979,6 +1980,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb -+ * This limit is arbitrary, but is reasonable for the xattr API. 
-+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -1990,6 +1997,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch index 1b848b9..9383900 100644 --- a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch +++ b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch @@ -10,19 +10,6 @@ index 10a2a86..217fdcc 100644 EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_DIRDATA| \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ -@@ -2241,6 +2242,12 @@ struct mmpd_data { - #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL - - /* -+ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb -+ * This limit is arbitrary, but is reasonable for the xattr API. -+ */ -+#define EXT4_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) -+ -+/* - * Function prototypes - */ - @@ -2252,6 +2259,10 @@ struct mmpd_data { # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-attach-jinode-in-writepages.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-attach-jinode-in-writepages.patch new file mode 100644 index 0000000..8354011 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-attach-jinode-in-writepages.patch @@ -0,0 +1,44 @@ +Index: linux-4.15.0/fs/ext4/inode.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/inode.c ++++ linux-4.15.0/fs/ext4/inode.c +@@ -728,6 +728,9 @@ out_sem: + !(flags & EXT4_GET_BLOCKS_ZERO) && + !ext4_is_quota_file(inode) && + ext4_should_order_data(inode)) { ++ ret = ext4_inode_attach_jinode(inode); ++ if (ret) ++ return ret; + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode); + else +@@ -2806,6 +2809,9 @@ static int ext4_writepages(struct addres + mpd.last_page = wbc->range_end >> PAGE_SHIFT; + } + ++ ret = ext4_inode_attach_jinode(inode); ++ if (ret) ++ goto out_writepages; + mpd.inode = inode; + mpd.wbc = wbc; + ext4_io_submit_init(&mpd.io_submit, wbc); +@@ -4356,6 +4362,7 @@ int ext4_inode_attach_jinode(struct inod + jbd2_free_inode(jinode); + return 0; + } ++EXPORT_SYMBOL(ext4_inode_attach_jinode); + + /* + * ext4_truncate() +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -2595,6 +2595,7 @@ extern int ext4_trim_fs(struct super_blo + extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); + + /* inode.c */ ++#define HAVE_LDISKFS_INFO_JINODE + int ext4_inode_is_fast_symlink(struct inode *inode); + struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); + struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-corrupted-inode-block-bitmaps-handling-patches.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-corrupted-inode-block-bitmaps-handling-patches.patch new file mode 100644 index 0000000..7dd6da0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-corrupted-inode-block-bitmaps-handling-patches.patch @@ -0,0 +1,444 @@ +Since we could skip corrupt block groups, this patch +uses ext4_warning() instead of ext4_error() so the FS does not +remount read-only by default; it also fixes a leftover from upstream +commit
163a203ddb36c36d4a1c942 + +Index: linux-4.15.0/fs/ext4/balloc.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/balloc.c ++++ linux-4.15.0/fs/ext4/balloc.c +@@ -185,25 +185,17 @@ static int ext4_init_block_bitmap(struct + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start, tmp; + int flex_bg = 0; +- struct ext4_group_info *grp; + + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks used to prevent allocation + * essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { +- grp = ext4_get_group_info(sb, block_group); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); +- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- int count; +- count = ext4_free_inodes_count(sb, gdp); +- percpu_counter_sub(&sbi->s_freeinodes_counter, +- count); +- } +- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_corrupted_block_group(sb, block_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT | ++ EXT4_GROUP_INFO_IBITMAP_CORRUPT, ++ "Checksum bad for group %u", ++ block_group); + return -EFSBADCRC; + } + memset(bh->b_data, 0, sb->s_blocksize); +@@ -371,7 +363,6 @@ static int ext4_validate_block_bitmap(st + { + ext4_fsblk_t blk; + struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); +- struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (buffer_verified(bh)) + return 0; +@@ -382,22 +373,19 @@ static int ext4_validate_block_bitmap(st + if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, + desc, bh))) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_corrupted_block_group(sb, block_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT, ++ "bg %u: bad block bitmap checksum", ++ block_group); + return -EFSBADCRC; + } + blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); +- ext4_error(sb, "bg %u: block %llu: invalid block bitmap", +- block_group, blk); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_corrupted_block_group(sb, block_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT, ++ "bg %u: block %llu: invalid block bitmap", ++ block_group, blk); + return -EFSCORRUPTED; + } + set_buffer_verified(bh); +@@ -451,8 +439,6 @@ ext4_read_block_bitmap_nowait(struct sup + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + if (err) { +- ext4_error(sb, "Failed to init block bitmap for group " +- "%u: %d", block_group, err); + goto out; + } + goto verify; +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -101,6 +101,18 @@ typedef __u32 ext4_lblk_t; + /* data type for block group number */ + typedef unsigned int ext4_group_t; + ++void __ext4_corrupted_block_group(struct super_block *sb, ++ ext4_group_t group, unsigned int flags, ++ const char *function, unsigned int line); ++ ++#define ext4_corrupted_block_group(sb, group, flags, fmt, ...) 
\ ++ do { \ ++ __ext4_warning(sb, __func__, __LINE__, fmt, \ ++ ##__VA_ARGS__); \ ++ __ext4_corrupted_block_group(sb, group, flags, \ ++ __func__, __LINE__); \ ++ } while (0) ++ + enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +@@ -3032,7 +3044,11 @@ struct ext4_group_info { + #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 + #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 + #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 ++#define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ ++ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) + #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 ++#define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ ++ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) + + #define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +Index: linux-4.15.0/fs/ext4/ialloc.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ialloc.c ++++ linux-4.15.0/fs/ext4/ialloc.c +@@ -72,25 +72,15 @@ static int ext4_init_inode_bitmap(struct + ext4_group_t block_group, + struct ext4_group_desc *gdp) + { +- struct ext4_group_info *grp; +- struct ext4_sb_info *sbi = EXT4_SB(sb); + J_ASSERT_BH(bh, buffer_locked(bh)); + + /* If checksum is bad mark all blocks and inodes use to prevent + * allocation, essentially implementing a per-group read-only flag. */ + if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { +- grp = ext4_get_group_info(sb, block_group); +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); +- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- int count; +- count = ext4_free_inodes_count(sb, gdp); +- percpu_counter_sub(&sbi->s_freeinodes_counter, +- count); +- } +- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_corrupted_block_group(sb, block_group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT | ++ EXT4_GROUP_INFO_IBITMAP_CORRUPT, ++ "Checksum bad for group %u", block_group); + return -EFSBADCRC; + } + +@@ -194,8 +184,6 @@ ext4_read_inode_bitmap(struct super_bloc + ext4_unlock_group(sb, block_group); + unlock_buffer(bh); + if (err) { +- ext4_error(sb, "Failed to init inode bitmap for group " +- "%u: %d", block_group, err); + goto out; + } + return bh; +@@ -371,14 +359,9 @@ out: + if (!fatal) + fatal = err; + } else { +- ext4_error(sb, "bit already cleared for inode %lu", ino); +- if (gdp && !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { +- int count; +- count = ext4_free_inodes_count(sb, gdp); +- percpu_counter_sub(&sbi->s_freeinodes_counter, +- count); +- } +- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); ++ ext4_corrupted_block_group(sb, block_group, ++ EXT4_GROUP_INFO_IBITMAP_CORRUPT, ++ "bit already cleared for inode %lu", ino); + } + + error_return: +Index: linux-4.15.0/fs/ext4/mballoc.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/mballoc.c ++++ linux-4.15.0/fs/ext4/mballoc.c +@@ -751,10 +751,18 @@ int ext4_mb_generate_buddy(struct super_ + if (free != grp->bb_free) { + struct ext4_group_desc *gdp; + gdp = ext4_get_group_desc(sb, group, NULL); +- ext4_error(sb, "group %lu: %u blocks in bitmap, %u in bb, " +- "%u in gd, %lu pa's\n", (long unsigned int)group, +- free, grp->bb_free, ext4_free_group_clusters(sb, gdp), +- grp->bb_prealloc_nr); ++ ++ ext4_corrupted_block_group(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT, ++ "group %lu: %u blocks in bitmap, %u in bb, %u in gd, %lu pa's block bitmap corrupt", ++ (unsigned long int)group, free, 
grp->bb_free, ++ ext4_free_group_clusters(sb, gdp), ++ grp->bb_prealloc_nr); ++ /* ++ * If we intend to continue, we consider group descriptor ++ * corrupt and update bb_free using bitmap value ++ */ ++ grp->bb_free = free; + return -EIO; + } + mb_set_largest_free_order(sb, grp); +@@ -1110,7 +1118,7 @@ ext4_mb_load_buddy_gfp(struct super_bloc + int block; + int pnum; + int poff; +- struct page *page; ++ struct page *page = NULL; + int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); +@@ -1136,7 +1144,7 @@ ext4_mb_load_buddy_gfp(struct super_bloc + */ + ret = ext4_mb_init_group(sb, group, gfp); + if (ret) +- return ret; ++ goto err; + } + + /* +@@ -1239,6 +1247,7 @@ err: + put_page(e4b->bd_buddy_page); + e4b->bd_buddy = NULL; + e4b->bd_bitmap = NULL; ++ ext4_warning(sb, "Error loading buddy information for %u", group); + return ret; + } + +@@ -3689,9 +3698,11 @@ int ext4_mb_check_ondisk_bitmap(struct s + } + + if (free != free_in_gdp) { +- ext4_error(sb, "on-disk bitmap for group %d" +- "corrupted: %u blocks free in bitmap, %u - in gd\n", +- group, free, free_in_gdp); ++ ext4_corrupted_block_group(sb, group, ++ EXT4_GROUP_INFO_BBITMAP_CORRUPT, ++ "on-disk bitmap for group %d corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, ++ free_in_gdp); + return -EIO; + } + return 0; +@@ -4052,16 +4063,8 @@ ext4_mb_release_inode_pa(struct ext4_bud + /* "free < pa->pa_free" means we maybe double alloc the same blocks, + * otherwise maybe leave some free blocks unavailable, no need to BUG.*/ + if ((free > pa->pa_free && !pa->pa_error) || (free < pa->pa_free)) { +- ext4_error(sb, "pa free mismatch: [pa %p] " +- "[phy %lu] [logic %lu] [len %u] [free %u] " +- "[error %u] [inode %lu] [freed %u]", pa, +- (unsigned long)pa->pa_pstart, +- (unsigned long)pa->pa_lstart, +- (unsigned)pa->pa_len, (unsigned)pa->pa_free, +- (unsigned)pa->pa_error, pa->pa_inode->i_ino, +- free); + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", +- free, pa->pa_free); ++ free, pa->pa_free); + /* + * pa is already deleted so we use the value obtained + * from the bitmap and continue. +@@ -4123,15 +4126,11 @@ ext4_mb_discard_group_preallocations(str + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error(sb, "Error %d reading block bitmap for %u", +- err, group); + return 0; + } + + err = ext4_mb_load_buddy(sb, group, &e4b); + if (err) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- err, group); + put_bh(bitmap_bh); + return 0; + } +@@ -4292,17 +4291,12 @@ repeat: + + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error(sb, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + return; +- } + + bitmap_bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bitmap_bh)) { + err = PTR_ERR(bitmap_bh); +- ext4_error(sb, "Error %d reading block bitmap for %u", +- err, group); + ext4_mb_unload_buddy(&e4b); + continue; + } +@@ -4565,11 +4559,8 @@ ext4_mb_discard_lg_preallocations(struct + group = ext4_get_group_number(sb, pa->pa_pstart); + err = ext4_mb_load_buddy_gfp(sb, group, &e4b, + GFP_NOFS|__GFP_NOFAIL); +- if (err) { +- ext4_error(sb, "Error %d loading buddy information for %u", +- err, group); ++ if (err) + continue; +- } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); + ext4_get_group_info(sb, group)->bb_prealloc_nr--; +@@ -4822,17 +4813,18 @@ errout: + * been updated or not when fail case. 
So can + * not revert pa_free back, just mark pa_error*/ + pa->pa_error++; +- ext4_error(sb, +- "Updating bitmap error: [err %d] " +- "[pa %p] [phy %lu] [logic %lu] " +- "[len %u] [free %u] [error %u] " +- "[inode %lu]", *errp, pa, +- (unsigned long)pa->pa_pstart, +- (unsigned long)pa->pa_lstart, +- (unsigned)pa->pa_len, +- (unsigned)pa->pa_free, +- (unsigned)pa->pa_error, +- pa->pa_inode ? pa->pa_inode->i_ino : 0); ++ ext4_corrupted_block_group(sb, 0, 0, ++ "Updating bitmap error: [err %d] " ++ "[pa %p] [phy %lu] [logic %lu] " ++ "[len %u] [free %u] [error %u] " ++ "[inode %lu]", *errp, pa, ++ (unsigned long)pa->pa_pstart, ++ (unsigned long)pa->pa_lstart, ++ (unsigned)pa->pa_len, ++ (unsigned)pa->pa_free, ++ (unsigned)pa->pa_error, ++ pa->pa_inode ? ++ pa->pa_inode->i_ino : 0); + } + } + ext4_mb_release_context(ac); +@@ -5118,7 +5110,7 @@ do_more: + err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, + GFP_NOFS|__GFP_NOFAIL); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * We need to make sure we don't reuse the freed block until after the +@@ -5200,8 +5192,9 @@ do_more: + goto do_more; + } + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return; + } + +@@ -5301,7 +5294,7 @@ int ext4_group_add_blocks(handle_t *hand + + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) +- goto error_return; ++ goto error_brelse; + + /* + * need to update group_info->bb_free and bitmap +@@ -5339,8 +5332,9 @@ int ext4_group_add_blocks(handle_t *hand + err = ret; + + error_return: +- brelse(bitmap_bh); + ext4_std_error(sb, err); ++error_brelse: ++ brelse(bitmap_bh); + return err; + } + +@@ -5415,11 +5409,9 @@ ext4_trim_all_free(struct super_block *s + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); +- if (ret) { +- ext4_warning(sb, "Error %d loading buddy information for %u", +- ret, group); ++ if (ret) + return ret; +- } ++ + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); +Index: linux-4.15.0/fs/ext4/super.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/super.c ++++ linux-4.15.0/fs/ext4/super.c +@@ -708,6 +708,37 @@ void __ext4_warning_inode(const struct i + va_end(args); + } + ++void __ext4_corrupted_block_group(struct super_block *sb, ext4_group_t group, ++ unsigned int flags, const char *function, ++ unsigned int line) ++{ ++ struct ext4_sb_info *sbi = EXT4_SB(sb); ++ struct ext4_group_info *grp = ext4_get_group_info(sb, group); ++ struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); ++ ++ if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT && ++ !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) { ++ percpu_counter_sub(&sbi->s_freeclusters_counter, ++ grp->bb_free); ++ set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, ++ &grp->bb_state); ++ } ++ ++ if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT && ++ !EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { ++ if (gdp) { ++ int count; ++ ++ count = ext4_free_inodes_count(sb, gdp); ++ percpu_counter_sub(&sbi->s_freeinodes_counter, ++ count); ++ } ++ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, ++ &grp->bb_state); ++ } ++ save_error_info(sb, function, line); ++} ++ + void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-data-in-dirent.patch new file mode 100644 index 
0000000..3eb83f0 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-data-in-dirent.patch @@ -0,0 +1,753 @@ +This patch implements a feature that allows ext4 fs users (e.g. Lustre) +to store data in the ext4 dirent. +The data is stored in the ext4 dirent after the file name; this space is +accounted for in de->rec_len, and the EXT4_DIRENT_LUFID flag is added to d_type +if extra data is present. + +It makes use of dentry->d_fsdata to pass the fid to ext4, so no +changes to the ext4_add_entry() interface are required. + +Index: linux-4.15.0/fs/ext4/dir.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/dir.c ++++ linux-4.15.0/fs/ext4/dir.c +@@ -68,11 +68,11 @@ int __ext4_check_dir_entry(const char *f + const int rlen = ext4_rec_len_from_disk(de->rec_len, + dir->i_sb->s_blocksize); + +- if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) ++ if (unlikely(rlen < __EXT4_DIR_REC_LEN(1))) + error_msg = "rec_len is smaller than minimal"; + else if (unlikely(rlen % 4 != 0)) + error_msg = "rec_len % 4 != 0"; +- else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) ++ else if (unlikely(rlen < EXT4_DIR_REC_LEN(de))) + error_msg = "rec_len is too small for name_len"; + else if (unlikely(((char *) de - buf) + rlen > size)) + error_msg = "directory entry across range"; +@@ -219,7 +219,7 @@ static int ext4_readdir(struct file *fil + * failure will be detected in the + * dirent test below. */ + if (ext4_rec_len_from_disk(de->rec_len, +- sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) ++ sb->s_blocksize) < __EXT4_DIR_REC_LEN(1)) + break; + i += ext4_rec_len_from_disk(de->rec_len, + sb->s_blocksize); +@@ -442,12 +442,17 @@ int ext4_htree_store_dirent(struct file + struct fname *fname, *new_fn; + struct dir_private_info *info; + int len; ++ int extra_data = 0; + + info = dir_file->private_data; + p = &info->root.rb_node; + + /* Create and allocate the fname structure */ +- len = sizeof(struct fname) + ent_name->len + 1; ++ if (dirent->file_type & EXT4_DIRENT_LUFID) ++ extra_data = ext4_get_dirent_data_len(dirent); ++ ++ len = sizeof(struct fname) + ent_name->len + extra_data + 1; ++ + new_fn = kzalloc(len, GFP_KERNEL); + if (!new_fn) + return -ENOMEM; +@@ -456,7 +461,7 @@ int ext4_htree_store_dirent(struct file + new_fn->inode = le32_to_cpu(dirent->inode); + new_fn->name_len = ent_name->len; + new_fn->file_type = dirent->file_type; +- memcpy(new_fn->name, ent_name->name, ent_name->len); ++ memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data); + new_fn->name[ent_name->len] = 0; + + while (*p) { +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -1055,6 +1055,7 @@ struct ext4_inode_info { + __u32 i_csum_seed; + + kprojid_t i_projid; ++ void *i_dirdata; + }; + + /* +@@ -1098,6 +1099,7 @@ struct ext4_inode_info { + #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ + #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ + #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ ++#define EXT4_MOUNT_DIRDATA 0x40000 /* Data in directory entries*/ + #define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ + #define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, + * enable enforcement for hidden +@@ -1768,6 +1770,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, EN + EXT4_FEATURE_INCOMPAT_FLEX_BG| \ + EXT4_FEATURE_INCOMPAT_EA_INODE| \ + EXT4_FEATURE_INCOMPAT_MMP | \ ++ EXT4_FEATURE_INCOMPAT_DIRDATA| \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA
| \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ +@@ -1943,6 +1946,43 @@ struct ext4_dir_entry_tail { + #define EXT4_FT_SYMLINK 7 + + #define EXT4_FT_MAX 8 ++#define EXT4_FT_MASK 0xf ++ ++#if EXT4_FT_MAX > EXT4_FT_MASK ++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK" ++#endif ++ ++/* ++ * d_type has 4 unused bits, so it can hold four types data. these different ++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be ++ * stored, in flag order, after file-name in ext4 dirent. ++*/ ++/* ++ * this flag is added to d_type if ext4 dirent has extra data after ++ * filename. this data length is variable and length is stored in first byte ++ * of data. data start after filename NUL byte. ++ * This is used by Lustre FS. ++ */ ++#define EXT4_DIRENT_LUFID 0x10 ++ ++#define EXT4_LUFID_MAGIC 0xAD200907UL ++struct ext4_dentry_param { ++ __u32 edp_magic; /* EXT4_LUFID_MAGIC */ ++ char edp_len; /* size of edp_data in bytes */ ++ char edp_data[0]; /* packed array of data */ ++} __packed; ++ ++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb, ++ struct ext4_dentry_param *p) ++ ++{ ++ if (!ext4_has_feature_dirdata(sb)) ++ return NULL; ++ if (p && p->edp_magic == EXT4_LUFID_MAGIC) ++ return &p->edp_len; ++ else ++ return NULL; ++} + + #define EXT4_FT_DIR_CSUM 0xDE + +@@ -1953,8 +1993,11 @@ struct ext4_dir_entry_tail { + */ + #define EXT4_DIR_PAD 4 + #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +-#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ ++#define __EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) ++#define EXT4_DIR_REC_LEN(de) (__EXT4_DIR_REC_LEN((de)->name_len +\ ++ ext4_get_dirent_data_len(de))) ++ + #define EXT4_MAX_REC_LEN ((1<<16)-1) + + /* +@@ -2357,11 +2400,11 @@ extern int ext4_find_dest_de(struct inod + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de); ++ struct ext4_dir_entry_2 **dest_de, int *dlen); + void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname); ++ struct ext4_filename *fname, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { + if (!ext4_has_feature_dir_index(inode->i_sb)) +@@ -2373,10 +2416,17 @@ static const unsigned char ext4_filetype + + static inline unsigned char get_dtype(struct super_block *sb, int filetype) + { +- if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) ++ int fl_index = filetype & EXT4_FT_MASK; ++ ++ if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX) + return DT_UNKNOWN; + +- return ext4_filetype_table[filetype]; ++ if (!test_opt(sb, DIRDATA)) ++ return ext4_filetype_table[fl_index]; ++ ++ return (ext4_filetype_table[fl_index]) | ++ (filetype & EXT4_DIRENT_LUFID); ++ + } + extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, + void *buf, int buf_size); +@@ -2525,6 +2575,8 @@ extern struct inode *ext4_create_inode(h + extern int ext4_delete_entry(handle_t *handle, struct inode * dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh); ++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, const void *, const void *); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern int ext4_search_dir(struct buffer_head *bh, +@@ -3265,6 +3317,28 @@ static inline void ext4_clear_io_unwritt + + extern const 
struct iomap_ops ext4_iomap_ops; + ++/* ++ * Compute the total directory entry data length. ++ * This includes the filename and an implicit NUL terminator (always present), ++ * and optional extensions. Each extension has a bit set in the high 4 bits of ++ * de->file_type, and the extension length is the first byte in each entry. ++ */ ++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de) ++{ ++ char *len = de->name + de->name_len + 1 /* NUL terminator */; ++ int dlen = 0; ++ __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4; ++ ++ while (extra_data_flags) { ++ if (extra_data_flags & 1) { ++ dlen += *len + (dlen == 0); ++ len += *len; ++ } ++ extra_data_flags >>= 1; ++ } ++ return dlen; ++} ++ + #endif /* __KERNEL__ */ + + #define EFSBADCRC EBADMSG /* Bad CRC detected */ +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -242,7 +242,8 @@ static unsigned dx_get_count(struct dx_e + static unsigned dx_get_limit(struct dx_entry *entries); + static void dx_set_count(struct dx_entry *entries, unsigned value); + static void dx_set_limit(struct dx_entry *entries, unsigned value); +-static unsigned dx_root_limit(struct inode *dir, unsigned infosize); ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize); + static unsigned dx_node_limit(struct inode *dir); + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, +@@ -505,11 +506,12 @@ ext4_next_entry(struct ext4_dir_entry_2 + */ + struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) + { ++ BUG_ON(de->name_len != 1); + /* get dotdot first */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); + + /* dx root info is after dotdot entry */ +- de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de)); + + return (struct dx_root_info *)de; + } +@@ -554,10 +556,16 @@ static inline void dx_set_limit(struct d + ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); + } + +-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ++static inline unsigned dx_root_limit(struct inode *dir, ++ struct ext4_dir_entry_2 *dot_de, unsigned infosize) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - +- EXT4_DIR_REC_LEN(2) - infosize; ++ struct ext4_dir_entry_2 *dotdot_de; ++ unsigned entry_space; ++ ++ BUG_ON(dot_de->name_len != 1); ++ dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize); ++ entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(dot_de) - ++ EXT4_DIR_REC_LEN(dotdot_de) - infosize; + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +@@ -566,7 +574,7 @@ static inline unsigned dx_root_limit(str + + static inline unsigned dx_node_limit(struct inode *dir) + { +- unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); ++ unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0); + + if (ext4_has_metadata_csum(dir->i_sb)) + entry_space -= sizeof(struct dx_tail); +@@ -678,7 +686,7 @@ static struct stats dx_show_leaf(struct + (unsigned) ((char *) de - base)); + #endif + } +- space += EXT4_DIR_REC_LEN(de->name_len); ++ space += EXT4_DIR_REC_LEN(de); + names++; + } + de = ext4_next_entry(de, size); +@@ -785,11 +793,14 
@@ dx_probe(struct ext4_filename *fname, st + + entries = (struct dx_entry *)(((char *)info) + info->info_length); + +- if (dx_get_limit(entries) != dx_root_limit(dir, +- info->info_length)) { ++ if (dx_get_limit(entries) != ++ dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, info->info_length)); ++ dx_root_limit(dir, ++ (struct ext4_dir_entry_2 *)frame->bh->b_data, ++ info->info_length)); + goto fail; + } + +@@ -981,7 +992,7 @@ static int htree_dirblock_to_tree(struct + de = (struct ext4_dir_entry_2 *) bh->b_data; + top = (struct ext4_dir_entry_2 *) ((char *) de + + dir->i_sb->s_blocksize - +- EXT4_DIR_REC_LEN(0)); ++ __EXT4_DIR_REC_LEN(0)); + #ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Check if the directory is encrypted */ + if (ext4_encrypted_inode(dir)) { +@@ -1653,7 +1664,7 @@ dx_move_dirents(char *from, char *to, st + while (count--) { + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) + (from + (map->offs<<2)); +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_REC_LEN(de); + memcpy (to, de, rec_len); + ((struct ext4_dir_entry_2 *) to)->rec_len = + ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1677,7 +1688,7 @@ static struct ext4_dir_entry_2* dx_pack_ + while ((char*)de < base + blocksize) { + next = ext4_next_entry(de, blocksize); + if (de->inode && de->name_len) { +- rec_len = EXT4_DIR_REC_LEN(de->name_len); ++ rec_len = EXT4_DIR_REC_LEN(de); + if (de > to) + memmove(to, de, rec_len); + to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); +@@ -1808,14 +1819,16 @@ int ext4_find_dest_de(struct inode *dir, + struct buffer_head *bh, + void *buf, int buf_size, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **dest_de) ++ struct ext4_dir_entry_2 **dest_de, int *dlen) + { + struct ext4_dir_entry_2 *de; +- unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); ++ unsigned short reclen = __EXT4_DIR_REC_LEN(fname_len(fname)) + ++ (dlen ? *dlen : 0); + int nlen, rlen; + unsigned int offset = 0; + char *top; + ++ dlen ? *dlen = 0 : 0; /* default set to 0 */ + de = (struct ext4_dir_entry_2 *)buf; + top = buf + buf_size - reclen; + while ((char *) de <= top) { +@@ -1824,10 +1837,26 @@ int ext4_find_dest_de(struct inode *dir, + return -EFSCORRUPTED; + if (ext4_match(fname, de)) + return -EEXIST; +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_REC_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if ((de->inode ? rlen - nlen : rlen) >= reclen) + break; ++ /* Then for dotdot entries, check for the smaller space ++ * required for just the entry, no FID */ ++ if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) { ++ if ((de->inode ? rlen - nlen : rlen) >= ++ __EXT4_DIR_REC_LEN(fname_len(fname))) { ++ /* set dlen=1 to indicate not ++ * enough space store fid */ ++ dlen ? *dlen = 1 : 0; ++ break; ++ } ++ /* The new ".." entry must be written over the ++ * previous ".." entry, which is the first ++ * entry traversed by this scan. If it doesn't ++ * fit, something is badly wrong, so -EIO. 
*/ ++ return -EIO; ++ } + de = (struct ext4_dir_entry_2 *)((char *)de + rlen); + offset += rlen; + } +@@ -1841,12 +1870,12 @@ int ext4_find_dest_de(struct inode *dir, + void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, +- struct ext4_filename *fname) ++ struct ext4_filename *fname, void *data) + { + + int nlen, rlen; + +- nlen = EXT4_DIR_REC_LEN(de->name_len); ++ nlen = EXT4_DIR_REC_LEN(de); + rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); + if (de->inode) { + struct ext4_dir_entry_2 *de1 = +@@ -1860,6 +1889,11 @@ void ext4_insert_dentry(struct inode *in + ext4_set_de_type(inode->i_sb, de, inode->i_mode); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); ++ if (data) { ++ de->name[fname_len(fname)] = 0; ++ memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + } + + /* +@@ -1877,14 +1911,19 @@ static int add_dirent_to_buf(handle_t *h + { + unsigned int blocksize = dir->i_sb->s_blocksize; + int csum_size = 0; +- int err; ++ int err, dlen = 0; ++ unsigned char *data; + ++ data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *) ++ EXT4_I(inode)->i_dirdata); + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + + if (!de) { ++ if (data) ++ dlen = (*data) + 1; + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, +- blocksize - csum_size, fname, &de); ++ blocksize - csum_size, fname, &de, &dlen); + if (err) + return err; + } +@@ -1896,7 +1935,10 @@ static int add_dirent_to_buf(handle_t *h + } + + /* By now the buffer is marked for journaling */ +- ext4_insert_dentry(inode, de, blocksize, fname); ++ /* If writing the short form of "dotdot", don't add the data section */ ++ if (dlen == 1) ++ data = NULL; ++ ext4_insert_dentry(inode, de, blocksize, fname, data); + + /* + * XXX shouldn't update any times until successful +@@ -2005,7 +2047,8 @@ static int make_indexed_dir(handle_t *ha + + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); ++ dx_set_limit(entries, dx_root_limit(dir, ++ dot_de, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ + fname->hinfo.hash_version = dx_info->hash_version; +@@ -2055,6 +2098,8 @@ static int ext4_update_dotdot(handle_t * + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; + int len, journal = 0, err = 0; ++ int dlen = 0; ++ char *data; + + if (IS_ERR(handle)) + return PTR_ERR(handle); +@@ -2072,19 +2117,24 @@ static int ext4_update_dotdot(handle_t * + /* the first item must be "." 
*/ + assert(de->name_len == 1 && de->name[0] == '.'); + len = le16_to_cpu(de->rec_len); +- assert(len >= EXT4_DIR_REC_LEN(1)); +- if (len > EXT4_DIR_REC_LEN(1)) { ++ assert(len >= __EXT4_DIR_REC_LEN(1)); ++ if (len > __EXT4_DIR_REC_LEN(1)) { + BUFFER_TRACE(dir_block, "get_write_access"); + err = ext4_journal_get_write_access(handle, dir_block); + if (err) + goto out_journal; + + journal = 1; +- de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1)); ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); + } + +- len -= EXT4_DIR_REC_LEN(1); +- assert(len == 0 || len >= EXT4_DIR_REC_LEN(2)); ++ len -= EXT4_DIR_REC_LEN(de); ++ data = ext4_dentry_get_data(dir->i_sb, ++ (struct ext4_dentry_param *)dentry->d_fsdata); ++ if (data) ++ dlen = *data + 1; ++ assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen)); ++ + de = (struct ext4_dir_entry_2 *) + ((char *) de + le16_to_cpu(de->rec_len)); + if (!journal) { +@@ -2098,10 +2148,15 @@ static int ext4_update_dotdot(handle_t * + if (len > 0) + de->rec_len = cpu_to_le16(len); + else +- assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2)); ++ assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2)); + de->name_len = 2; + strcpy(de->name, ".."); +- ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) { ++ de->name[2] = 0; ++ memcpy(&de->name[2 + 1], data, *data); ++ ext4_set_de_type(dir->i_sb, de, S_IFDIR); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + + out_journal: + if (journal) { +@@ -2140,6 +2195,7 @@ static int ext4_add_entry(handle_t *hand + ext4_lblk_t block, blocks; + int csum_size = 0; + ++ EXT4_I(inode)->i_dirdata = dentry->d_fsdata; + if (ext4_has_metadata_csum(inode->i_sb)) + csum_size = sizeof(struct ext4_dir_entry_tail); + +@@ -2661,37 +2717,70 @@ err_unlock_inode: + return err; + } + ++struct tp_block { ++ struct inode *inode; ++ void *data1; ++ void *data2; ++}; ++ + struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, + struct ext4_dir_entry_2 *de, + int blocksize, int csum_size, + unsigned int parent_ino, int dotdot_real_len) + { ++ void *data1 = NULL, *data2 = NULL; ++ int dot_reclen = 0; ++ ++ if (dotdot_real_len == 10) { ++ struct tp_block *tpb = (struct tp_block *)inode; ++ data1 = tpb->data1; ++ data2 = tpb->data2; ++ inode = tpb->inode; ++ dotdot_real_len = 0; ++ } + de->inode = cpu_to_le32(inode->i_ino); + de->name_len = 1; +- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), +- blocksize); + strcpy(de->name, "."); + ext4_set_de_type(inode->i_sb, de, S_IFDIR); + ++ /* get packed fid data*/ ++ data1 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data1); ++ if (data1) { ++ de->name[1] = 0; ++ memcpy(&de->name[2], data1, *(char *) data1); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } ++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de)); ++ dot_reclen = cpu_to_le16(de->rec_len); + de = ext4_next_entry(de, blocksize); + de->inode = cpu_to_le32(parent_ino); + de->name_len = 2; ++ strcpy(de->name, ".."); ++ ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ data2 = ext4_dentry_get_data(inode->i_sb, ++ (struct ext4_dentry_param *) data2); ++ if (data2) { ++ de->name[2] = 0; ++ memcpy(&de->name[3], data2, *(char *) data2); ++ de->file_type |= EXT4_DIRENT_LUFID; ++ } + if (!dotdot_real_len) + de->rec_len = ext4_rec_len_to_disk(blocksize - +- (csum_size + EXT4_DIR_REC_LEN(1)), ++ (csum_size + dot_reclen), + blocksize); + else + de->rec_len = ext4_rec_len_to_disk( +- EXT4_DIR_REC_LEN(de->name_len), blocksize); +- strcpy(de->name, ".."); +- 
ext4_set_de_type(inode->i_sb, de, S_IFDIR); ++ EXT4_DIR_REC_LEN(de), blocksize); + + return ext4_next_entry(de, blocksize); + } + + static int ext4_init_new_dir(handle_t *handle, struct inode *dir, +- struct inode *inode) ++ struct inode *inode, ++ const void *data1, const void *data2) + { ++ struct tp_block param; + struct buffer_head *dir_block = NULL; + struct ext4_dir_entry_2 *de; + struct ext4_dir_entry_tail *t; +@@ -2716,7 +2805,11 @@ static int ext4_init_new_dir(handle_t *h + if (IS_ERR(dir_block)) + return PTR_ERR(dir_block); + de = (struct ext4_dir_entry_2 *)dir_block->b_data; +- ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); ++ param.inode = inode; ++ param.data1 = (void *)data1; ++ param.data2 = (void *)data2; ++ ext4_init_dot_dotdot((struct inode *)(¶m), de, blocksize, ++ csum_size, dir->i_ino, 10); + set_nlink(inode, 2); + if (csum_size) { + t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize); +@@ -2733,6 +2826,29 @@ out: + return err; + } + ++/* Initialize @inode as a subdirectory of @dir, and add the ++ * "." and ".." entries into the first directory block. */ ++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir, ++ struct inode *inode, ++ const void *data1, const void *data2) ++{ ++ int rc; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ if (IS_DIRSYNC(dir)) ++ ext4_handle_sync(handle); ++ ++ inode->i_op = &ext4_dir_inode_operations; ++ inode->i_fop = &ext4_dir_operations; ++ rc = ext4_init_new_dir(handle, dir, inode, data1, data2); ++ if (!rc) ++ rc = ext4_mark_inode_dirty(handle, inode); ++ return rc; ++} ++EXPORT_SYMBOL(ext4_add_dot_dotdot); ++ + static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) + { + handle_t *handle; +@@ -2759,7 +2875,7 @@ retry: + + inode->i_op = &ext4_dir_inode_operations; + inode->i_fop = &ext4_dir_operations; +- err = ext4_init_new_dir(handle, dir, inode); ++ err = ext4_init_new_dir(handle, dir, inode, NULL, NULL); + if (err) + goto out_clear_inode; + err = ext4_mark_inode_dirty(handle, inode); +@@ -2811,7 +2927,7 @@ bool ext4_empty_dir(struct inode *inode) + } + + sb = inode->i_sb; +- if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { ++ if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2)) { + EXT4_ERROR_INODE(inode, "invalid size"); + return true; + } +Index: linux-4.15.0/fs/ext4/inline.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/inline.c ++++ linux-4.15.0/fs/ext4/inline.c +@@ -1017,7 +1017,7 @@ static int ext4_add_dirent_to_inline(han + struct ext4_dir_entry_2 *de; + + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, +- inline_size, fname, &de); ++ inline_size, fname, &de, NULL); + if (err) + return err; + +@@ -1025,7 +1025,7 @@ static int ext4_add_dirent_to_inline(han + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) + return err; +- ext4_insert_dentry(inode, de, inline_size, fname); ++ ext4_insert_dentry(inode, de, inline_size, fname, NULL); + + ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); + +@@ -1094,7 +1094,7 @@ static int ext4_update_inline_dir(handle + int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; + int new_size = get_max_inline_xattr_value_size(dir, iloc); + +- if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) ++ if (new_size - old_size <= __EXT4_DIR_REC_LEN(1)) + return -ENOSPC; + + ret = ext4_update_inline_data(handle, dir, +@@ -1375,7 +1375,7 @@ int htree_inlinedir_to_tree(struct file + fake.name_len = 1; + 
strcpy(fake.name, ".");
+ fake.rec_len = ext4_rec_len_to_disk(
+- EXT4_DIR_REC_LEN(fake.name_len),
++ EXT4_DIR_REC_LEN(&fake),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+@@ -1385,7 +1385,7 @@ int htree_inlinedir_to_tree(struct file
+ fake.name_len = 2;
+ strcpy(fake.name, "..");
+ fake.rec_len = ext4_rec_len_to_disk(
+- EXT4_DIR_REC_LEN(fake.name_len),
++ EXT4_DIR_REC_LEN(&fake),
+ inline_size);
+ ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+ de = &fake;
+@@ -1483,8 +1483,8 @@ int ext4_read_inline_dir(struct file *fi
+ * So we will use extra_offset and extra_size to indicate them
+ * during the inline dir iteration.
+ */
+- dotdot_offset = EXT4_DIR_REC_LEN(1);
+- dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
++ dotdot_offset = __EXT4_DIR_REC_LEN(1);
++ dotdot_size = dotdot_offset + __EXT4_DIR_REC_LEN(2);
+ extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+ extra_size = extra_offset + inline_size;
+
+@@ -1519,7 +1519,7 @@ int ext4_read_inline_dir(struct file *fi
+ * failure will be detected in the
+ * dirent test below. */
+ if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+- < EXT4_DIR_REC_LEN(1))
++ < __EXT4_DIR_REC_LEN(1))
+ break;
+ i += ext4_rec_len_from_disk(de->rec_len,
+ extra_size);
+Index: linux-4.15.0/fs/ext4/super.c
+===================================================================
+--- linux-4.15.0.orig/fs/ext4/super.c
++++ linux-4.15.0/fs/ext4/super.c
+@@ -1347,7 +1347,7 @@ enum {
+ Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+ Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+ Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+- Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
++ Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata,
+ Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_i_version, Opt_dax,
+ Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_lazytime, Opt_nolazytime, Opt_debug_want_extra_isize,
+@@ -1420,6 +1420,7 @@ static const match_table_t tokens = {
+ {Opt_nolazytime, "nolazytime"},
+ {Opt_debug_want_extra_isize, "debug_want_extra_isize=%u"},
+ {Opt_nodelalloc, "nodelalloc"},
++ {Opt_dirdata, "dirdata"},
+ {Opt_removed, "mblk_io_submit"},
+ {Opt_removed, "nomblk_io_submit"},
+ {Opt_block_validity, "block_validity"},
+@@ -1641,6 +1642,7 @@ static const struct mount_opts {
+ {Opt_usrjquota, 0, MOPT_Q},
+ {Opt_grpjquota, 0, MOPT_Q},
+ {Opt_offusrjquota, 0, MOPT_Q},
++ {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET},
+ {Opt_offgrpjquota, 0, MOPT_Q},
+ {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+ {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-dont-check-before-replay.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-dont-check-before-replay.patch
new file mode 100644
index 0000000..1d82dc2
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-dont-check-before-replay.patch
@@ -0,0 +1,36 @@
+When ldiskfs runs in failover mode with a read-only disk,
+part of the allocation updates are lost and ldiskfs may fail
+while mounting, due to the inconsistent state of the
+group descriptors. Move the group-descriptor check to after
+the journal replay.
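+A simplified sketch of the resulting mount sequence (ordering only;
+the real code is in the super.c hunk below):
+
+  ext4_fill_super()
+    read group descriptors      /* may be stale on the failover disk */
+    load and replay journal     /* brings descriptors up to date */
+    ext4_check_descriptors()    /* moved after the "no_journal:" label */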
+ +Index: linux-4.15.0/fs/ext4/super.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/super.c ++++ linux-4.15.0/fs/ext4/super.c +@@ -4051,11 +4051,6 @@ static int ext4_fill_super(struct super_ + goto failed_mount2; + } + } +- if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { +- ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); +- ret = -EFSCORRUPTED; +- goto failed_mount2; +- } + + sbi->s_gdb_count = db_count; + +@@ -4196,6 +4191,13 @@ static int ext4_fill_super(struct super_ + sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; + + no_journal: ++ ++ if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) { ++ ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); ++ ret = -EFSCORRUPTED; ++ goto failed_mount_wq; ++ } ++ + if (!test_opt(sb, NO_MBCACHE)) { + sbi->s_ea_block_cache = ext4_xattr_create_cache(); + if (!sbi->s_ea_block_cache) { diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-give-warning-with-dir-htree-growing.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-give-warning-with-dir-htree-growing.patch new file mode 100644 index 0000000..ba2e4a6 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-give-warning-with-dir-htree-growing.patch @@ -0,0 +1,156 @@ +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -1450,6 +1450,7 @@ struct ext4_sb_info { + unsigned long s_mb_prealloc_table_size; + unsigned int s_mb_group_prealloc; + unsigned int s_max_dir_size_kb; ++ unsigned long s_warning_dir_size; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -751,12 +751,20 @@ struct ext4_dir_lock_data { + #define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) + #define ext4_find_entry(dir, name, dirent, inline) \ + __ext4_find_entry(dir, name, dirent, inline, NULL) +-#define ext4_add_entry(handle, dentry, inode) \ +- __ext4_add_entry(handle, dentry, inode, NULL) + + /* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ + #define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) + ++inline int ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int ret = __ext4_add_entry(handle, dentry, inode, NULL); ++ ++ if (ret == -ENOBUFS) ++ ret = 0; ++ return ret; ++} ++ + static void ext4_htree_event_cb(void *target, void *event) + { + u64 *block = (u64 *)target; +@@ -2508,6 +2516,54 @@ out: + return err; + } + ++static unsigned long __ext4_max_dir_size(struct dx_frame *frames, ++ struct dx_frame *frame, struct inode *dir) ++{ ++ unsigned long max_dir_size; ++ ++ if (EXT4_SB(dir->i_sb)->s_max_dir_size_kb) { ++ max_dir_size = EXT4_SB(dir->i_sb)->s_max_dir_size_kb << 10; ++ } else { ++ max_dir_size = EXT4_BLOCK_SIZE(dir->i_sb); ++ while (frame >= frames) { ++ max_dir_size *= dx_get_limit(frame->entries); ++ if (frame == frames) ++ break; ++ frame--; ++ } ++ /* use 75% of max dir size in average */ ++ max_dir_size = max_dir_size / 4 * 3; ++ } ++ return max_dir_size; ++} ++ ++/* ++ * With hash tree growing, it is easy to hit ENOSPC, but it is hard ++ * to predict when it will happen. 
let's give administrators warning ++ * when reaching 3/5 and 2/3 of limit ++ */ ++static inline bool dir_size_in_warning_range(struct dx_frame *frames, ++ struct dx_frame *frame, ++ struct inode *dir) ++{ ++ unsigned long size1, size2; ++ struct super_block *sb = dir->i_sb; ++ ++ if (unlikely(!EXT4_SB(sb)->s_warning_dir_size)) ++ EXT4_SB(sb)->s_warning_dir_size = ++ __ext4_max_dir_size(frames, frame, dir); ++ ++ size1 = EXT4_SB(sb)->s_warning_dir_size / 16 * 10; ++ size1 = size1 & ~(EXT4_BLOCK_SIZE(sb) - 1); ++ size2 = EXT4_SB(sb)->s_warning_dir_size / 16 * 11; ++ size2 = size2 & ~(EXT4_BLOCK_SIZE(sb) - 1); ++ if (in_range(dir->i_size, size1, EXT4_BLOCK_SIZE(sb)) || ++ in_range(dir->i_size, size2, EXT4_BLOCK_SIZE(sb))) ++ return true; ++ ++ return false; ++} ++ + /* + * ext4_add_entry() + * +@@ -2629,6 +2685,7 @@ static int ext4_dx_add_entry(handle_t *h + struct ext4_dir_entry_2 *de; + int restart; + int err; ++ bool ret_warn = false; + + again: + restart = 0; +@@ -2657,6 +2714,11 @@ again: + /* Block full, should compress but for now just split */ + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", + dx_get_count(entries), dx_get_limit(entries))); ++ ++ if (frame - frames + 1 >= ext4_dir_htree_level(sb) || ++ EXT4_SB(sb)->s_warning_dir_size) ++ ret_warn = dir_size_in_warning_range(frames, frame, dir); ++ + /* Need to split index? */ + if (dx_get_count(entries) == dx_get_limit(entries)) { + ext4_lblk_t newblock; +@@ -2819,6 +2881,8 @@ cleanup: + */ + if (restart && err == 0) + goto again; ++ if (err == 0 && ret_warn) ++ err = -ENOBUFS; + return err; + } + +Index: linux-4.15.0/fs/ext4/super.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/super.c ++++ linux-4.15.0/fs/ext4/super.c +@@ -1804,6 +1804,8 @@ static int handle_mount_opt(struct super + sbi->s_li_wait_mult = arg; + } else if (token == Opt_max_dir_size_kb) { + sbi->s_max_dir_size_kb = arg; ++ /* reset s_warning_dir_size and make it re-calculated */ ++ sbi->s_warning_dir_size = 0; + } else if (token == Opt_stripe) { + sbi->s_stripe = arg; + } else if (token == Opt_resuid) { +Index: linux-4.15.0/fs/ext4/sysfs.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/sysfs.c ++++ linux-4.15.0/fs/ext4/sysfs.c +@@ -173,6 +173,7 @@ EXT4_ATTR_OFFSET(inode_readahead_blks, 0 + EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); + EXT4_RW_ATTR_SBI_UI(max_dir_size, s_max_dir_size_kb); + EXT4_RW_ATTR_SBI_UI(max_dir_size_kb, s_max_dir_size_kb); ++EXT4_RW_ATTR_SBI_UI(warning_dir_size, s_warning_dir_size); + EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats); + EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan); + EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); +@@ -204,6 +205,7 @@ static struct attribute *ext4_attrs[] = + ATTR_LIST(inode_goal), + ATTR_LIST(max_dir_size), + ATTR_LIST(max_dir_size_kb), ++ ATTR_LIST(warning_dir_size), + ATTR_LIST(mb_stats), + ATTR_LIST(mb_max_to_scan), + ATTR_LIST(mb_min_to_scan), diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch new file mode 100644 index 0000000..7f0b0e5 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch @@ -0,0 +1,89 @@ +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -2043,6 +2043,74 @@ 
out_frames:
+ return retval;
+ }
+
++/* update ".." for hash-indexed directory, split the item "." if necessary */
++static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
++ struct inode *inode)
++{
++ struct inode *dir = dentry->d_parent->d_inode;
++ struct buffer_head *dir_block;
++ struct ext4_dir_entry_2 *de;
++ int len, journal = 0, err = 0;
++
++ if (IS_ERR(handle))
++ return PTR_ERR(handle);
++
++ if (IS_DIRSYNC(dir))
++ handle->h_sync = 1;
++
++ dir_block = ext4_bread(handle, dir, 0, 0);
++ if (IS_ERR(dir_block)) {
++ err = PTR_ERR(dir_block);
++ goto out;
++ }
++
++ de = (struct ext4_dir_entry_2 *)dir_block->b_data;
++ /* the first item must be "." */
++ assert(de->name_len == 1 && de->name[0] == '.');
++ len = le16_to_cpu(de->rec_len);
++ assert(len >= EXT4_DIR_REC_LEN(1));
++ if (len > EXT4_DIR_REC_LEN(1)) {
++ BUFFER_TRACE(dir_block, "get_write_access");
++ err = ext4_journal_get_write_access(handle, dir_block);
++ if (err)
++ goto out_journal;
++
++ journal = 1;
++ de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1));
++ }
++
++ len -= EXT4_DIR_REC_LEN(1);
++ assert(len == 0 || len >= EXT4_DIR_REC_LEN(2));
++ de = (struct ext4_dir_entry_2 *)
++ ((char *) de + le16_to_cpu(de->rec_len));
++ if (!journal) {
++ BUFFER_TRACE(dir_block, "get_write_access");
++ err = ext4_journal_get_write_access(handle, dir_block);
++ if (err)
++ goto out_journal;
++ }
++
++ de->inode = cpu_to_le32(inode->i_ino);
++ if (len > 0)
++ de->rec_len = cpu_to_le16(len);
++ else
++ assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2));
++ de->name_len = 2;
++ strcpy(de->name, "..");
++ ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++
++out_journal:
++ if (journal) {
++ BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
++ err = ext4_handle_dirty_dirent_node(handle, dir, dir_block);
++ ext4_mark_inode_dirty(handle, dir);
++ }
++ brelse(dir_block);
++
++out:
++ return err;
++}
++
+ /*
+ * ext4_add_entry()
+ *
+@@ -2091,6 +2159,9 @@ static int ext4_add_entry(handle_t *hand
+ }
+
+ if (is_dx(dir)) {
++ if (dentry->d_name.len == 2 &&
++ memcmp(dentry->d_name.name, "..", 2) == 0)
++ return ext4_update_dotdot(handle, dentry, inode);
+ retval = ext4_dx_add_entry(handle, &fname, dir, inode);
+ if (!retval || (retval != ERR_BAD_DX_DIR))
+ goto out;
diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-jcb-optimization.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-jcb-optimization.patch
new file mode 100644
index 0000000..cad5c35
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-jcb-optimization.patch
@@ -0,0 +1,19 @@
+Change list_add_tail to list_add. This benefits ldiskfs in
+tgt_cb_last_committed: thandles with the highest transaction
+numbers are placed at the beginning of the list, so the first
+iterations already see the highest transno, which saves an
+extra call of ptlrpc_commit_replies.
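+
+For illustration, the resulting list ordering (a sketch following the
+reasoning above, with jce_1..jce_N as hypothetical callback entries):
+
+  list_add_tail(): t_private_list = jce_1, jce_2, ..., jce_N
+  list_add():      t_private_list = jce_N, ..., jce_2, jce_1
+
+With the most recently added entry first, the walk over
+t_private_list reaches the highest transno immediately.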
+ +Index: linux-4.15.0/fs/ext4/ext4_jbd2.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4_jbd2.h ++++ linux-4.15.0/fs/ext4/ext4_jbd2.h +@@ -172,7 +172,7 @@ static inline void _ext4_journal_callbac + struct ext4_journal_cb_entry *jce) + { + /* Add the jce to transaction's private list */ +- list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); ++ list_add(&jce->jce_list, &handle->h_transaction->t_private_list); + } + + static inline void ext4_journal_callback_add(handle_t *handle, diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-kill-dx-root.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-kill-dx-root.patch new file mode 100644 index 0000000..826b2d7 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-kill-dx-root.patch @@ -0,0 +1,230 @@ +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -196,22 +196,13 @@ struct dx_entry + * hash version mod 4 should never be 0. Sincerely, the paranoia department. + */ + +-struct dx_root ++struct dx_root_info + { +- struct fake_dirent dot; +- char dot_name[4]; +- struct fake_dirent dotdot; +- char dotdot_name[4]; +- struct dx_root_info +- { +- __le32 reserved_zero; +- u8 hash_version; +- u8 info_length; /* 8 */ +- u8 indirect_levels; +- u8 unused_flags; +- } +- info; +- struct dx_entry entries[0]; ++ __le32 reserved_zero; ++ u8 hash_version; ++ u8 info_length; /* 8 */ ++ u8 indirect_levels; ++ u8 unused_flags; + }; + + struct dx_node +@@ -512,6 +503,16 @@ ext4_next_entry(struct ext4_dir_entry_2 + * Future: use high four bits of block for coalesce-on-delete flags + * Mask them off for now. 
+ */ ++struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de) ++{ ++ /* get dotdot first */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1)); ++ ++ /* dx root info is after dotdot entry */ ++ de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2)); ++ ++ return (struct dx_root_info *)de; ++} + + static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) + { +@@ -736,7 +737,7 @@ dx_probe(struct ext4_filename *fname, st + { + unsigned count, indirect; + struct dx_entry *at, *entries, *p, *q, *m; +- struct dx_root *root; ++ struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); + u32 hash; +@@ -745,18 +746,17 @@ dx_probe(struct ext4_filename *fname, st + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; +- +- root = (struct dx_root *) frame->bh->b_data; +- if (root->info.hash_version != DX_HASH_TEA && +- root->info.hash_version != DX_HASH_HALF_MD4 && +- root->info.hash_version != DX_HASH_LEGACY) { +- ext4_warning_inode(dir, "Unrecognised inode hash code %u for directory " +- "%lu", root->info.hash_version, dir->i_ino); ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frame->bh->b_data); ++ if (info->hash_version != DX_HASH_TEA && ++ info->hash_version != DX_HASH_HALF_MD4 && ++ info->hash_version != DX_HASH_LEGACY) { ++ ldiskfs_warning(dir->i_sb, "Unrecognised inode hash code %d for directory " ++ "#%lu", info->hash_version, dir->i_ino); + goto fail; + } + if (fname) + hinfo = &fname->hinfo; +- hinfo->hash_version = root->info.hash_version; ++ hinfo->hash_version = info->hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -764,13 +764,13 @@ dx_probe(struct ext4_filename *fname, st + ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo); + hash = hinfo->hash; + +- if (root->info.unused_flags & 1) { ++ if (info->unused_flags & 1) { + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", +- root->info.unused_flags); ++ info->unused_flags); + goto fail; + } + +- indirect = root->info.indirect_levels; ++ indirect = info->indirect_levels; + if (indirect >= ext4_dir_htree_level(dir->i_sb)) { + ext4_warning(dir->i_sb, + "Directory (ino: %lu) htree depth %#06x exceed" +@@ -783,14 +783,13 @@ dx_probe(struct ext4_filename *fname, st + goto fail; + } + +- entries = (struct dx_entry *)(((char *)&root->info) + +- root->info.info_length); ++ entries = (struct dx_entry *)(((char *)info) + info->info_length); + + if (dx_get_limit(entries) != dx_root_limit(dir, +- root->info.info_length)) { ++ info->info_length)) { + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), +- dx_root_limit(dir, root->info.info_length)); ++ dx_root_limit(dir, info->info_length)); + goto fail; + } + +@@ -874,7 +873,7 @@ static void dx_release(struct dx_frame * + if (frames[0].bh == NULL) + return; + +- info = &((struct dx_root *)frames[0].bh->b_data)->info; ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data); + for (i = 0; i <= info->indirect_levels; i++) { + if (frames[i].bh == NULL) + break; +@@ -1930,17 +1929,16 @@ static int make_indexed_dir(handle_t *ha + struct inode *inode, struct buffer_head *bh) + { + struct buffer_head *bh2; +- struct dx_root *root; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries; +- struct ext4_dir_entry_2 *de, *de2; ++ 
struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de; + struct ext4_dir_entry_tail *t; + char *data1, *top; + unsigned len; + int retval; + unsigned blocksize; + ext4_lblk_t block; +- struct fake_dirent *fde; ++ struct dx_root_info *dx_info; + int csum_size = 0; + + if (ext4_has_metadata_csum(inode->i_sb)) +@@ -1955,18 +1953,19 @@ static int make_indexed_dir(handle_t *ha + brelse(bh); + return retval; + } +- root = (struct dx_root *) bh->b_data; ++ ++ dot_de = (struct ext4_dir_entry_2 *)bh->b_data; ++ dotdot_de = ext4_next_entry(dot_de, blocksize); + + /* The 0th block becomes the root, move the dirents out */ +- fde = &root->dotdot; +- de = (struct ext4_dir_entry_2 *)((char *)fde + +- ext4_rec_len_from_disk(fde->rec_len, blocksize)); +- if ((char *) de >= (((char *) root) + blocksize)) { ++ de = (struct ext4_dir_entry_2 *)((char *)dotdot_de + ++ ext4_rec_len_from_disk(dotdot_de->rec_len, blocksize)); ++ if ((char *)de >= (((char *)dot_de) + blocksize)) { + EXT4_ERROR_INODE(dir, "invalid rec_len for '..'"); + brelse(bh); + return -EFSCORRUPTED; + } +- len = ((char *) root) + (blocksize - csum_size) - (char *) de; ++ len = ((char *)dot_de) + (blocksize - csum_size) - (char *)de; + + /* Allocate new block for the 0th block's dirents */ + bh2 = ext4_append(handle, dir, &block); +@@ -1992,19 +1991,24 @@ static int make_indexed_dir(handle_t *ha + } + + /* Initialize the root; the dot dirents already exist */ +- de = (struct ext4_dir_entry_2 *) (&root->dotdot); +- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), +- blocksize); +- memset (&root->info, 0, sizeof(root->info)); +- root->info.info_length = sizeof(root->info); +- root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; +- entries = root->entries; ++ dotdot_de->rec_len = ++ ext4_rec_len_to_disk(blocksize - le16_to_cpu(dot_de->rec_len), ++ blocksize); ++ ++ /* initialize hashing info */ ++ dx_info = dx_get_dx_info(dot_de); ++ memset(dx_info, 0, sizeof(*dx_info)); ++ dx_info->info_length = sizeof(*dx_info); ++ dx_info->hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; ++ ++ entries = (void *)dx_info + sizeof(*dx_info); ++ + dx_set_block(entries, 1); + dx_set_count(entries, 1); +- dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); ++ dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info))); + + /* Initialize as for dx_probe */ +- fname->hinfo.hash_version = root->info.hash_version; ++ fname->hinfo.hash_version = dx_info->hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; +@@ -2346,7 +2350,7 @@ again: + goto journal_error; + } + } else { +- struct dx_root *dxroot; ++ struct dx_root_info *info; + memcpy((char *) entries2, (char *) entries, + icount * sizeof(struct dx_entry)); + dx_set_limit(entries2, dx_node_limit(dir)); +@@ -2354,8 +2358,9 @@ again: + /* Set up root */ + dx_set_count(entries, 1); + dx_set_block(entries + 0, newblock); +- dxroot = (struct dx_root *)frames[0].bh->b_data; +- dxroot->info.indirect_levels += 1; ++ info = dx_get_dx_info((struct ext4_dir_entry_2 *) ++ frames[0].bh->b_data); ++ info->indirect_levels = 1; + dxtrace(printk(KERN_DEBUG + "Creating %d level index...\n", + info->indirect_levels)); diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-mballoc-extra-checks.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-mballoc-extra-checks.patch new file mode 100644 index 0000000..b298d35 --- /dev/null +++ 
b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-mballoc-extra-checks.patch @@ -0,0 +1,320 @@ +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -2874,6 +2874,7 @@ struct ext4_group_info { + ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ + ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + struct list_head bb_prealloc_list; ++ unsigned long bb_prealloc_nr; + #ifdef DOUBLE_CHECK + void *bb_bitmap; + #endif +Index: linux-4.15.0/fs/ext4/mballoc.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/mballoc.c ++++ linux-4.15.0/fs/ext4/mballoc.c +@@ -363,7 +363,7 @@ static const char * const ext4_groupinfo + "ext4_groupinfo_64k", "ext4_groupinfo_128k" + }; + +-static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++static int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); + static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +@@ -717,7 +717,7 @@ mb_set_largest_free_order(struct super_b + } + + static noinline_for_stack +-void ext4_mb_generate_buddy(struct super_block *sb, ++int ext4_mb_generate_buddy(struct super_block *sb, + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +@@ -749,19 +749,13 @@ void ext4_mb_generate_buddy(struct super + grp->bb_fragments = fragments; + + if (free != grp->bb_free) { +- ext4_grp_locked_error(sb, group, 0, 0, +- "block bitmap and bg descriptor " +- "inconsistent: %u vs %u free clusters", +- free, grp->bb_free); +- /* +- * If we intend to continue, we consider group descriptor +- * corrupt and update bb_free using bitmap value +- */ +- grp->bb_free = free; +- if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) +- percpu_counter_sub(&sbi->s_freeclusters_counter, +- grp->bb_free); +- set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state); ++ struct ext4_group_desc *gdp; ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ ext4_error(sb, "group %lu: %u blocks in bitmap, %u in bb, " ++ "%u in gd, %lu pa's\n", (long unsigned int)group, ++ free, grp->bb_free, ext4_free_group_clusters(sb, gdp), ++ grp->bb_prealloc_nr); ++ return -EIO; + } + mb_set_largest_free_order(sb, grp); + +@@ -772,6 +766,8 @@ void ext4_mb_generate_buddy(struct super + EXT4_SB(sb)->s_mb_buddies_generated++; + EXT4_SB(sb)->s_mb_generation_time += period; + spin_unlock(&EXT4_SB(sb)->s_bal_lock); ++ ++ return 0; + } + + static void mb_regenerate_buddy(struct ext4_buddy *e4b) +@@ -892,7 +888,7 @@ static int ext4_mb_init_cache(struct pag + } + + first_block = page->index * blocks_per_page; +- for (i = 0; i < blocks_per_page; i++) { ++ for (i = 0; i < blocks_per_page && err == 0; i++) { + group = (first_block + i) >> 1; + if (group >= ngroups) + break; +@@ -936,7 +932,7 @@ static int ext4_mb_init_cache(struct pag + ext4_lock_group(sb, group); + /* init the buddy */ + memset(data, 0xff, blocksize); +- ext4_mb_generate_buddy(sb, data, incore, group); ++ err = ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); + incore = NULL; + } else { +@@ -951,7 +947,7 @@ static int ext4_mb_init_cache(struct pag + memcpy(data, bitmap, blocksize); + + /* mark all preallocated blks used in in-core bitmap */ +- ext4_mb_generate_from_pa(sb, data, group); ++ err = ext4_mb_generate_from_pa(sb, data, group); + 
ext4_mb_generate_from_freelist(sb, data, group); + ext4_unlock_group(sb, group); + +@@ -961,7 +957,8 @@ static int ext4_mb_init_cache(struct pag + incore = data; + } + } +- SetPageUptodate(page); ++ if (likely(err == 0)) ++ SetPageUptodate(page); + + out: + if (bh) { +@@ -2309,9 +2306,11 @@ static void *ext4_mb_seq_groups_next(str + static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) + { + struct super_block *sb = seq->private; ++ struct ext4_group_desc *gdp; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); + int i; + int err, buddy_loaded = 0; ++ int free = 0; + struct ext4_buddy e4b; + struct ext4_group_info *grinfo; + unsigned char blocksize_bits = min_t(unsigned char, +@@ -2324,7 +2323,7 @@ static int ext4_mb_seq_groups_show(struc + + group--; + if (group == 0) +- seq_puts(seq, "#group: free frags first [" ++ seq_puts(seq, "#group: bfree gfree frags first pa [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); + +@@ -2342,13 +2341,19 @@ static int ext4_mb_seq_groups_show(struc + buddy_loaded = 1; + } + ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp != NULL) ++ free = ext4_free_group_clusters(sb, gdp); ++ + memcpy(&sg, ext4_get_group_info(sb, group), i); + + if (buddy_loaded) + ext4_mb_unload_buddy(&e4b); + +- seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, +- sg.info.bb_fragments, sg.info.bb_first_free); ++ seq_printf(seq, "#%-5lu: %-5u %-5u %-5u %-5u %-5lu [", ++ (long unsigned int)group, sg.info.bb_free, free, ++ sg.info.bb_fragments, sg.info.bb_first_free, ++ sg.info.bb_prealloc_nr); + for (i = 0; i <= 13; i++) + seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? + sg.info.bb_counters[i] : 0); +@@ -3656,22 +3661,71 @@ static void ext4_mb_generate_from_freeli + } + + /* ++ * check free blocks in bitmap match free block in group descriptor ++ * do this before taking preallocated blocks into account to be able ++ * to detect on-disk corruptions. The group lock should be hold by the ++ * caller. ++ */ ++int ext4_mb_check_ondisk_bitmap(struct super_block *sb, void *bitmap, ++ struct ext4_group_desc *gdp, int group) ++{ ++ unsigned short max = EXT4_CLUSTERS_PER_GROUP(sb); ++ unsigned short i, first, free = 0; ++ unsigned short free_in_gdp = ext4_free_group_clusters(sb, gdp); ++ ++ if (free_in_gdp == 0 && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) ++ return 0; ++ ++ i = mb_find_next_zero_bit(bitmap, max, 0); ++ ++ while (i < max) { ++ first = i; ++ i = mb_find_next_bit(bitmap, max, i); ++ if (i > max) ++ i = max; ++ free += i - first; ++ if (i < max) ++ i = mb_find_next_zero_bit(bitmap, max, i); ++ } ++ ++ if (free != free_in_gdp) { ++ ext4_error(sb, "on-disk bitmap for group %d" ++ "corrupted: %u blocks free in bitmap, %u - in gd\n", ++ group, free, free_in_gdp); ++ return -EIO; ++ } ++ return 0; ++} ++ ++/* + * the function goes through all preallocation in this group and marks them + * used in in-core bitmap. 
buddy must be generated from this bitmap + * Need to be called with ext4 group lock held + */ + static noinline_for_stack +-void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ++int ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); + struct ext4_prealloc_space *pa; ++ struct ext4_group_desc *gdp; + struct list_head *cur; + ext4_group_t groupnr; + ext4_grpblk_t start; + int preallocated = 0; ++ int skip = 0, count = 0; ++ int err; + int len; + ++ gdp = ext4_get_group_desc(sb, group, NULL); ++ if (gdp == NULL) ++ return -EIO; ++ ++ /* before applying preallocations, check bitmap consistency */ ++ err = ext4_mb_check_ondisk_bitmap(sb, bitmap, gdp, group); ++ if (err) ++ return err; ++ + /* all form of preallocation discards first load group, + * so the only competing code is preallocation use. + * we don't need any locking here +@@ -3687,13 +3741,23 @@ void ext4_mb_generate_from_pa(struct sup + &groupnr, &start); + len = pa->pa_len; + spin_unlock(&pa->pa_lock); +- if (unlikely(len == 0)) ++ if (unlikely(len == 0)) { ++ skip++; + continue; ++ } + BUG_ON(groupnr != group); + ext4_set_bits(bitmap, start, len); + preallocated += len; ++ count++; ++ } ++ if (count + skip != grp->bb_prealloc_nr) { ++ ext4_error(sb, "lost preallocations: " ++ "count %d, bb_prealloc_nr %lu, skip %d\n", ++ count, grp->bb_prealloc_nr, skip); ++ return -EIO; + } + mb_debug(1, "preallocated %u for group %u\n", preallocated, group); ++ return 0; + } + + static void ext4_mb_pa_callback(struct rcu_head *head) +@@ -3757,6 +3821,7 @@ static void ext4_mb_put_pa(struct ext4_a + */ + ext4_lock_group(sb, grp); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, grp)->bb_prealloc_nr--; + ext4_unlock_group(sb, grp); + + spin_lock(pa->pa_obj_lock); +@@ -3851,6 +3916,7 @@ ext4_mb_new_inode_pa(struct ext4_allocat + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + spin_lock(pa->pa_obj_lock); +@@ -3912,6 +3978,7 @@ ext4_mb_new_group_pa(struct ext4_allocat + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + list_add(&pa->pa_group_list, &grp->bb_prealloc_list); ++ grp->bb_prealloc_nr++; + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); + + /* +@@ -4084,6 +4151,8 @@ repeat: + + spin_unlock(&pa->pa_lock); + ++ BUG_ON(grp->bb_prealloc_nr == 0); ++ grp->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + list_add(&pa->u.pa_tmp_list, &list); + } +@@ -4214,7 +4283,7 @@ repeat: + if (err) { + ext4_error(sb, "Error %d loading buddy information for %u", + err, group); +- continue; ++ return; + } + + bitmap_bh = ext4_read_block_bitmap(sb, group); +@@ -4227,6 +4296,8 @@ repeat: + } + + ext4_lock_group(sb, group); ++ BUG_ON(e4b.bd_info->bb_prealloc_nr == 0); ++ e4b.bd_info->bb_prealloc_nr--; + list_del(&pa->pa_group_list); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_unlock_group(sb, group); +@@ -4489,6 +4560,7 @@ ext4_mb_discard_lg_preallocations(struct + } + ext4_lock_group(sb, group); + list_del(&pa->pa_group_list); ++ ext4_get_group_info(sb, group)->bb_prealloc_nr--; + ext4_mb_release_group_pa(&e4b, pa); + ext4_unlock_group(sb, group); + +Index: linux-4.15.0/fs/ext4/mballoc.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/mballoc.h ++++ linux-4.15.0/fs/ext4/mballoc.h +@@ -70,7 +70,7 @@ do { \ + /* + * for which requests use 2^N search 
using buddies + */ +-#define MB_DEFAULT_ORDER2_REQS 2 ++#define MB_DEFAULT_ORDER2_REQS 8 + + /* + * default group prealloc size 512 blocks diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-misc.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-misc.patch new file mode 100644 index 0000000..3ace5eb --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-misc.patch @@ -0,0 +1,183 @@ +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -1587,6 +1587,8 @@ static inline void ext4_clear_state_flag + */ + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + ++#define JOURNAL_START_HAS_3ARGS 1 ++ + /* + * Codes for operating systems + */ +@@ -1801,7 +1803,21 @@ static inline bool ext4_has_unknown_ext# + + EXTN_FEATURE_FUNCS(2) + EXTN_FEATURE_FUNCS(3) +-EXTN_FEATURE_FUNCS(4) ++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_compat & ++ cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & ++ cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0); ++} ++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb) ++{ ++ return ((EXT4_SB(sb)->s_es->s_feature_incompat & ++ cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0); ++} + + static inline bool ext4_has_compat_features(struct super_block *sb) + { +@@ -3103,6 +3119,11 @@ struct ext4_extent; + + extern int ext4_ext_tree_init(handle_t *handle, struct inode *); + extern int ext4_ext_writepage_trans_blocks(struct inode *, int); ++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb, ++ ext4_group_t block_group); ++extern struct buffer_head *ext4_append(handle_t *handle, ++ struct inode *inode, ++ ext4_lblk_t *block); + extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); + extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int flags); +Index: linux-4.15.0/fs/ext4/ialloc.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ialloc.c ++++ linux-4.15.0/fs/ext4/ialloc.c +@@ -156,7 +156,7 @@ static int ext4_validate_inode_bitmap(st + * + * Return buffer_head of bitmap on success or NULL. + */ +-static struct buffer_head * ++struct buffer_head * + ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) + { + struct ext4_group_desc *desc; +@@ -236,6 +236,7 @@ out: + put_bh(bh); + return ERR_PTR(err); + } ++EXPORT_SYMBOL(ext4_read_inode_bitmap); + + /* + * NOTE! 
When we get the inode, we're the only people +Index: linux-4.15.0/fs/ext4/inode.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/inode.c ++++ linux-4.15.0/fs/ext4/inode.c +@@ -6179,3 +6179,18 @@ int ext4_filemap_fault(struct vm_fault * + + return err; + } ++EXPORT_SYMBOL(ext4_map_blocks); ++EXPORT_SYMBOL(ext4_truncate); ++EXPORT_SYMBOL(ext4_iget); ++EXPORT_SYMBOL(ext4_bread); ++EXPORT_SYMBOL(ext4_itable_unused_count); ++EXPORT_SYMBOL(ext4_force_commit); ++EXPORT_SYMBOL(ext4_mark_inode_dirty); ++EXPORT_SYMBOL(ext4_get_group_desc); ++EXPORT_SYMBOL(__ext4_journal_get_write_access); ++EXPORT_SYMBOL(__ext4_journal_start_sb); ++EXPORT_SYMBOL(__ext4_journal_stop); ++EXPORT_SYMBOL(__ext4_handle_dirty_metadata); ++EXPORT_SYMBOL(__ext4_std_error); ++EXPORT_SYMBOL(ext4fs_dirhash); ++EXPORT_SYMBOL(ext4_get_inode_loc); +Index: linux-4.15.0/fs/ext4/mballoc.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/mballoc.c ++++ linux-4.15.0/fs/ext4/mballoc.c +@@ -721,7 +721,6 @@ void ext4_mb_generate_buddy(struct super + void *buddy, void *bitmap, ext4_group_t group) + { + struct ext4_group_info *grp = ext4_get_group_info(sb, group); +- struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); + ext4_grpblk_t i = 0; + ext4_grpblk_t first; +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -48,7 +48,7 @@ + #define NAMEI_RA_BLOCKS 4 + #define NAMEI_RA_SIZE (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS) + +-static struct buffer_head *ext4_append(handle_t *handle, ++struct buffer_head *ext4_append(handle_t *handle, + struct inode *inode, + ext4_lblk_t *block) + { +@@ -158,6 +158,7 @@ static struct buffer_head *__ext4_read_d + } + return bh; + } ++EXPORT_SYMBOL(ext4_append); + + #ifndef assert + #define assert(test) J_ASSERT(test) +@@ -2412,23 +2413,25 @@ EXPORT_SYMBOL(ext4_delete_entry); + * for checking S_ISDIR(inode) (since the INODE_INDEX feature will not be set + * on regular files) and to avoid creating huge/slow non-HTREE directories. + */ +-static void ext4_inc_count(handle_t *handle, struct inode *inode) ++void ext4_inc_count(handle_t *handle, struct inode *inode) + { + inc_nlink(inode); + if (is_dx(inode) && + (inode->i_nlink > EXT4_LINK_MAX || inode->i_nlink == 2)) + set_nlink(inode, 1); + } ++EXPORT_SYMBOL(ext4_inc_count); + + /* + * If a directory had nlink == 1, then we should let it be 1. This indicates + * directory has >EXT4_LINK_MAX subdirs. 
+ */ +-static void ext4_dec_count(handle_t *handle, struct inode *inode) ++void ext4_dec_count(handle_t *handle, struct inode *inode) + { + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) + drop_nlink(inode); + } ++EXPORT_SYMBOL(ext4_dec_count); + + + static int ext4_add_nondir(handle_t *handle, +Index: linux-4.15.0/fs/ext4/super.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/super.c ++++ linux-4.15.0/fs/ext4/super.c +@@ -5860,16 +5860,12 @@ static int __init ext4_init_fs(void) + err = init_inodecache(); + if (err) + goto out1; +- register_as_ext3(); +- register_as_ext2(); + err = register_filesystem(&ext4_fs_type); + if (err) + goto out; + + return 0; + out: +- unregister_as_ext2(); +- unregister_as_ext3(); + destroy_inodecache(); + out1: + ext4_exit_mballoc(); +@@ -5888,8 +5884,6 @@ out5: + static void __exit ext4_exit_fs(void) + { + ext4_destroy_lazyinit_thread(); +- unregister_as_ext2(); +- unregister_as_ext3(); + unregister_filesystem(&ext4_fs_type); + destroy_inodecache(); + ext4_exit_mballoc(); diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-nocmtime.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-nocmtime.patch new file mode 100644 index 0000000..718ee9d --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-nocmtime.patch @@ -0,0 +1,29 @@ +We won't change i_xtime in ldiskfs code path. But also +need keep normal function out of Lustre. So we using +S_NOCMTIME to indicate invoked from Lustre. + +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -655,6 +655,20 @@ enum { + #define EXT4_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ + #define EXT4_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + ++#define current_time(a) ext4_current_time(a) ++static inline struct timespec ext4_current_time(struct inode *inode) ++{ ++ struct timespec now = current_kernel_time(); ++ ++ if (IS_NOCMTIME(inode)) ++ return inode->i_ctime; ++ ++ /* Copy from fs/inode.c */ ++ if (unlikely(!inode->i_sb)) ++ return now; ++ ++ return timespec_trunc(now, inode->i_sb->s_time_gran); ++} + + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) + /* diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-osd-iop-common.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-osd-iop-common.patch new file mode 100644 index 0000000..3c58c30c --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-osd-iop-common.patch @@ -0,0 +1,67 @@ +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -2504,6 +2504,11 @@ extern int ext4_dirent_csum_verify(struc + struct ext4_dir_entry *dirent); + extern int ext4_orphan_add(handle_t *, struct inode *); + extern int ext4_orphan_del(handle_t *, struct inode *); ++extern struct inode *ext4_create_inode(handle_t *handle, ++ struct inode *dir, int mode); ++extern int ext4_delete_entry(handle_t *handle, struct inode * dir, ++ struct ext4_dir_entry_2 *de_del, ++ struct buffer_head *bh); + extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + extern int ext4_search_dir(struct buffer_head *bh, +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ 
-2360,7 +2360,7 @@ int ext4_generic_delete_entry(handle_t *
+ return -ENOENT;
+ }
+
+-static int ext4_delete_entry(handle_t *handle,
++int ext4_delete_entry(handle_t *handle,
+ struct inode *dir,
+ struct ext4_dir_entry_2 *de_del,
+ struct buffer_head *bh)
+@@ -2400,7 +2400,7 @@ out:
+ ext4_std_error(dir->i_sb, err);
+ return err;
+ }
+-
++EXPORT_SYMBOL(ext4_delete_entry);
+ /*
+ * Set directory link count to 1 if nlinks > EXT4_LINK_MAX, or if nlinks == 2
+ * since this indicates that nlinks count was previously 1 to avoid overflowing
+@@ -2447,6 +2447,28 @@ static int ext4_add_nondir(handle_t *han
+ return err;
+ }
+
++ /* Return locked inode, then the caller can modify the inode's states/flags
++ * before others finding it. The caller should unlock the inode by itself. */
++struct inode *ext4_create_inode(handle_t *handle, struct inode *dir, int mode)
++{
++ struct inode *inode;
++
++ inode = ext4_new_inode(handle, dir, mode, NULL, 0, NULL, 0);
++ if (!IS_ERR(inode)) {
++ if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode)) {
++#ifdef CONFIG_LDISKFS_FS_XATTR
++ inode->i_op = &ext4_special_inode_operations;
++#endif
++ } else {
++ inode->i_op = &ext4_file_inode_operations;
++ inode->i_fop = &ext4_file_operations;
++ ext4_set_aops(inode);
++ }
++ }
++ return inode;
++}
++EXPORT_SYMBOL(ext4_create_inode);
++
+ /*
+ * By the time this is called, we already have created
+ * the directory cache entry for the new file, but it
diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-pdirop.patch
new file mode 100644
index 0000000..4f93d37
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-pdirop.patch
@@ -0,0 +1,1931 @@
+Single directory performance is critical for HPC workloads. In a
+typical use case an application creates a separate output file for
+each node and task in a job. As nodes and tasks increase, hundreds
+of thousands of files may be created in a single directory within
+a short window of time.
+Today, both filename lookup and file system modifying operations
+(such as create and unlink) are protected by a single lock for
+an entire ldiskfs directory. The PDO project removes this
+bottleneck by introducing a parallel locking mechanism for entire
+ldiskfs directories. This work enables multiple application
+threads to look up, create and unlink in parallel.
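+
+As a sketch of the locking model (derived from the htree_lock_compat[]
+table in htree_lock.c below; "+" means the two modes can be granted
+on the same node at the same time):
+
+       EX  PW  PR  CW  CR
+  EX   -   -   -   -   -
+  PW   -   -   -   -   +
+  PR   -   -   +   -   +
+  CW   -   -   -   +   +
+  CR   -   +   +   +   +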
+ +This patch contains: + - pdirops support for ldiskfs + - integrate with osd-ldiskfs + +Index: linux-4.15.0/fs/ext4/Makefile +=================================================================== +--- linux-4.15.0.orig/fs/ext4/Makefile ++++ linux-4.15.0/fs/ext4/Makefile +@@ -7,6 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o + + ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ ++ htree_lock.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o +Index: linux-4.15.0/fs/ext4/ext4.h +=================================================================== +--- linux-4.15.0.orig/fs/ext4/ext4.h ++++ linux-4.15.0/fs/ext4/ext4.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -932,6 +933,9 @@ struct ext4_inode_info { + __u32 i_dtime; + ext4_fsblk_t i_file_acl; + ++ /* following fields for parallel directory operations -bzzz */ ++ struct semaphore i_append_sem; ++ + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, +@@ -2114,6 +2118,72 @@ struct dx_hash_info + */ + #define HASH_NB_ALWAYS 1 + ++/* assume name-hash is protected by upper layer */ ++#define EXT4_HTREE_LOCK_HASH 0 ++ ++enum ext4_pdo_lk_types { ++#if EXT4_HTREE_LOCK_HASH ++ EXT4_LK_HASH, ++#endif ++ EXT4_LK_DX, /* index block */ ++ EXT4_LK_DE, /* directory entry block */ ++ EXT4_LK_SPIN, /* spinlock */ ++ EXT4_LK_MAX, ++}; ++ ++/* read-only bit */ ++#define EXT4_LB_RO(b) (1 << (b)) ++/* read + write, high bits for writer */ ++#define EXT4_LB_RW(b) ((1 << (b)) | (1 << (EXT4_LK_MAX + (b)))) ++ ++enum ext4_pdo_lock_bits { ++ /* DX lock bits */ ++ EXT4_LB_DX_RO = EXT4_LB_RO(EXT4_LK_DX), ++ EXT4_LB_DX = EXT4_LB_RW(EXT4_LK_DX), ++ /* DE lock bits */ ++ EXT4_LB_DE_RO = EXT4_LB_RO(EXT4_LK_DE), ++ EXT4_LB_DE = EXT4_LB_RW(EXT4_LK_DE), ++ /* DX spinlock bits */ ++ EXT4_LB_SPIN_RO = EXT4_LB_RO(EXT4_LK_SPIN), ++ EXT4_LB_SPIN = EXT4_LB_RW(EXT4_LK_SPIN), ++ /* accurate searching */ ++ EXT4_LB_EXACT = EXT4_LB_RO(EXT4_LK_MAX << 1), ++}; ++ ++enum ext4_pdo_lock_opc { ++ /* external */ ++ EXT4_HLOCK_READDIR = (EXT4_LB_DE_RO | EXT4_LB_DX_RO), ++ EXT4_HLOCK_LOOKUP = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL = (EXT4_LB_DE | EXT4_LB_SPIN_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_ADD = (EXT4_LB_DE | EXT4_LB_SPIN_RO), ++ ++ /* internal */ ++ EXT4_HLOCK_LOOKUP_SAFE = (EXT4_LB_DE_RO | EXT4_LB_DX_RO | ++ EXT4_LB_EXACT), ++ EXT4_HLOCK_DEL_SAFE = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT), ++ EXT4_HLOCK_SPLIT = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN), ++}; ++ ++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits); ++#define ext4_htree_lock_head_free(lhead) htree_lock_head_free(lhead) ++ ++extern struct htree_lock *ext4_htree_lock_alloc(void); ++#define ext4_htree_lock_free(lck) htree_lock_free(lck) ++ ++extern void ext4_htree_lock(struct htree_lock *lck, ++ struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags); ++#define ext4_htree_unlock(lck) htree_unlock(lck) ++ ++extern struct buffer_head *__ext4_find_entry(struct inode *dir, ++ const struct qstr *d_name, ++ struct ext4_dir_entry_2 **res_dir, ++ int *inlined, struct htree_lock *lck); ++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck); ++ + 
struct ext4_filename { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; +@@ -2421,8 +2491,16 @@ void ext4_insert_dentry(struct inode *in + struct ext4_filename *fname, void *data); + static inline void ext4_update_dx_flag(struct inode *inode) + { ++ /* Disable it for ldiskfs, because going from a DX directory to ++ * a non-DX directory while it is in use will completely break ++ * the htree-locking. ++ * If we really want to support this operation in the future, ++ * we need to exclusively lock the directory at here which will ++ * increase complexity of code */ ++#if 0 + if (!ext4_has_feature_dir_index(inode->i_sb)) + ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); ++#endif + } + static const unsigned char ext4_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +Index: linux-4.15.0/fs/ext4/htree_lock.c +=================================================================== +--- /dev/null ++++ linux-4.15.0/fs/ext4/htree_lock.c +@@ -0,0 +1,880 @@ ++/* ++ * fs/ext4/htree_lock.c ++ * ++ * Copyright (c) 2011, 2012, Intel Corporation. ++ * ++ * Author: Liang Zhen ++ */ ++#include ++#include ++#include ++#include ++ ++enum { ++ HTREE_LOCK_BIT_EX = (1 << HTREE_LOCK_EX), ++ HTREE_LOCK_BIT_PW = (1 << HTREE_LOCK_PW), ++ HTREE_LOCK_BIT_PR = (1 << HTREE_LOCK_PR), ++ HTREE_LOCK_BIT_CW = (1 << HTREE_LOCK_CW), ++ HTREE_LOCK_BIT_CR = (1 << HTREE_LOCK_CR), ++}; ++ ++enum { ++ HTREE_LOCK_COMPAT_EX = 0, ++ HTREE_LOCK_COMPAT_PW = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR, ++ HTREE_LOCK_COMPAT_PR = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR, ++ HTREE_LOCK_COMPAT_CW = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW, ++ HTREE_LOCK_COMPAT_CR = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR | ++ HTREE_LOCK_BIT_PW, ++}; ++ ++static int htree_lock_compat[] = { ++ [HTREE_LOCK_EX] HTREE_LOCK_COMPAT_EX, ++ [HTREE_LOCK_PW] HTREE_LOCK_COMPAT_PW, ++ [HTREE_LOCK_PR] HTREE_LOCK_COMPAT_PR, ++ [HTREE_LOCK_CW] HTREE_LOCK_COMPAT_CW, ++ [HTREE_LOCK_CR] HTREE_LOCK_COMPAT_CR, ++}; ++ ++/* max allowed htree-lock depth. ++ * We only need depth=3 for ext4 although user can have higher value. 
*/ ++#define HTREE_LOCK_DEP_MAX 16 ++ ++#ifdef HTREE_LOCK_DEBUG ++ ++static char *hl_name[] = { ++ [HTREE_LOCK_EX] "EX", ++ [HTREE_LOCK_PW] "PW", ++ [HTREE_LOCK_PR] "PR", ++ [HTREE_LOCK_CW] "CW", ++ [HTREE_LOCK_CR] "CR", ++}; ++ ++/* lock stats */ ++struct htree_lock_node_stats { ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long retried[HTREE_LOCK_MAX]; ++ unsigned long long events; ++}; ++ ++struct htree_lock_stats { ++ struct htree_lock_node_stats nodes[HTREE_LOCK_DEP_MAX]; ++ unsigned long long granted[HTREE_LOCK_MAX]; ++ unsigned long long blocked[HTREE_LOCK_MAX]; ++}; ++ ++static struct htree_lock_stats hl_stats; ++ ++void htree_lock_stat_reset(void) ++{ ++ memset(&hl_stats, 0, sizeof(hl_stats)); ++} ++ ++void htree_lock_stat_print(int depth) ++{ ++ int i; ++ int j; ++ ++ printk(KERN_DEBUG "HTREE LOCK STATS:\n"); ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n", ++ hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]); ++ } ++ for (i = 0; i < depth; i++) { ++ printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i); ++ for (j = 0; j < HTREE_LOCK_MAX; j++) { ++ printk(KERN_DEBUG ++ "[%s]: G [%10llu], B [%10llu], R [%10llu]\n", ++ hl_name[j], hl_stats.nodes[i].granted[j], ++ hl_stats.nodes[i].blocked[j], ++ hl_stats.nodes[i].retried[j]); ++ } ++ } ++} ++ ++#define lk_grant_inc(m) do { hl_stats.granted[m]++; } while (0) ++#define lk_block_inc(m) do { hl_stats.blocked[m]++; } while (0) ++#define ln_grant_inc(d, m) do { hl_stats.nodes[d].granted[m]++; } while (0) ++#define ln_block_inc(d, m) do { hl_stats.nodes[d].blocked[m]++; } while (0) ++#define ln_retry_inc(d, m) do { hl_stats.nodes[d].retried[m]++; } while (0) ++#define ln_event_inc(d) do { hl_stats.nodes[d].events++; } while (0) ++ ++#else /* !DEBUG */ ++ ++void htree_lock_stat_reset(void) {} ++void htree_lock_stat_print(int depth) {} ++ ++#define lk_grant_inc(m) do {} while (0) ++#define lk_block_inc(m) do {} while (0) ++#define ln_grant_inc(d, m) do {} while (0) ++#define ln_block_inc(d, m) do {} while (0) ++#define ln_retry_inc(d, m) do {} while (0) ++#define ln_event_inc(d) do {} while (0) ++ ++#endif /* DEBUG */ ++ ++EXPORT_SYMBOL(htree_lock_stat_reset); ++EXPORT_SYMBOL(htree_lock_stat_print); ++ ++#define HTREE_DEP_ROOT (-1) ++ ++#define htree_spin_lock(lhead, dep) \ ++ bit_spin_lock((dep) + 1, &(lhead)->lh_lock) ++#define htree_spin_unlock(lhead, dep) \ ++ bit_spin_unlock((dep) + 1, &(lhead)->lh_lock) ++ ++#define htree_key_event_ignore(child, ln) \ ++ (!((child)->lc_events & (1 << (ln)->ln_mode))) ++ ++static int ++htree_key_list_empty(struct htree_lock_node *ln) ++{ ++ return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list); ++} ++ ++static void ++htree_key_list_del_init(struct htree_lock_node *ln) ++{ ++ struct htree_lock_node *tmp = NULL; ++ ++ if (!list_empty(&ln->ln_minor_list)) { ++ tmp = list_entry(ln->ln_minor_list.next, ++ struct htree_lock_node, ln_minor_list); ++ list_del_init(&ln->ln_minor_list); ++ } ++ ++ if (list_empty(&ln->ln_major_list)) ++ return; ++ ++ if (tmp == NULL) { /* not on minor key list */ ++ list_del_init(&ln->ln_major_list); ++ } else { ++ BUG_ON(!list_empty(&tmp->ln_major_list)); ++ list_replace_init(&ln->ln_major_list, &tmp->ln_major_list); ++ } ++} ++ ++static void ++htree_key_list_replace_init(struct htree_lock_node *old, ++ struct htree_lock_node *new) ++{ ++ if (!list_empty(&old->ln_major_list)) ++ list_replace_init(&old->ln_major_list, &new->ln_major_list); ++ ++ if 
(!list_empty(&old->ln_minor_list)) ++ list_replace_init(&old->ln_minor_list, &new->ln_minor_list); ++} ++ ++static void ++htree_key_event_enqueue(struct htree_lock_child *child, ++ struct htree_lock_node *ln, int dep, void *event) ++{ ++ struct htree_lock_node *tmp; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ BUG_ON(ln->ln_mode == HTREE_LOCK_NL); ++ if (event == NULL || htree_key_event_ignore(child, ln)) ++ return; ++ ++ /* shouldn't be a very long list */ ++ list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) { ++ if (tmp->ln_mode == HTREE_LOCK_NL) { ++ ln_event_inc(dep); ++ if (child->lc_callback != NULL) ++ child->lc_callback(tmp->ln_ev_target, event); ++ } ++ } ++} ++ ++static int ++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk, ++ unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_child *child = &newlk->lk_head->lh_children[dep]; ++ struct htree_lock_node *newln = &newlk->lk_nodes[dep]; ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ ++ /* NB: ALWAYS called holding lhead::lh_lock(dep) */ ++ /* NB: we only expect PR/PW lock mode at here, only these two modes are ++ * allowed for htree_node_lock(asserted in htree_node_lock_internal), ++ * NL is only used for listener, user can't directly require NL mode */ ++ if ((curln->ln_mode == HTREE_LOCK_NL) || ++ (curln->ln_mode != HTREE_LOCK_PW && ++ newln->ln_mode != HTREE_LOCK_PW)) { ++ /* no conflict, attach it on granted list of @curlk */ ++ if (curln->ln_mode != HTREE_LOCK_NL) { ++ list_add(&newln->ln_granted_list, ++ &curln->ln_granted_list); ++ } else { ++ /* replace key owner */ ++ htree_key_list_replace_init(curln, newln); ++ } ++ ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ htree_key_event_enqueue(child, newln, dep, event); ++ ln_grant_inc(dep, newln->ln_mode); ++ return 1; /* still hold lh_lock */ ++ } ++ ++ if (!wait) { /* can't grant and don't want to wait */ ++ ln_retry_inc(dep, newln->ln_mode); ++ newln->ln_mode = HTREE_LOCK_INVAL; ++ return -1; /* don't wait and just return -1 */ ++ } ++ ++ newlk->lk_task = current; ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ /* conflict, attach it on blocked list of curlk */ ++ list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list); ++ list_add(&newln->ln_alive_list, &curln->ln_alive_list); ++ ln_block_inc(dep, newln->ln_mode); ++ ++ htree_spin_unlock(newlk->lk_head, dep); ++ /* wait to be given the lock */ ++ if (newlk->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt, wake up will set me RUNNING */ ++ if (event == NULL || htree_key_event_ignore(child, newln)) ++ return 0; /* granted without lh_lock */ ++ ++ htree_spin_lock(newlk->lk_head, dep); ++ htree_key_event_enqueue(child, newln, dep, event); ++ return 1; /* still hold lh_lock */ ++} ++ ++/* ++ * get PR/PW access to particular tree-node according to @dep and @key, ++ * it will return -1 if @wait is false and can't immediately grant this lock. ++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get ++ * @event if it's not NULL. 
++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck, ++ htree_lock_mode_t mode, u32 key, unsigned dep, ++ int wait, void *event) ++{ ++ LIST_HEAD(list); ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ u16 major; ++ u16 minor; ++ u8 reverse; ++ u8 ma_bits; ++ u8 mi_bits; ++ ++ BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR); ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ ++ key = hash_long(key, lhead->lh_hbits); ++ ++ mi_bits = lhead->lh_hbits >> 1; ++ ma_bits = lhead->lh_hbits - mi_bits; ++ ++ lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1); ++ lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits; ++ lck->lk_nodes[dep].ln_mode = mode; ++ ++ /* ++ * The major key list is an ordered list, so searches are started ++ * at the end of the list that is numerically closer to major_key, ++ * so at most half of the list will be walked (for well-distributed ++ * keys). The list traversal aborts early if the expected key ++ * location is passed. ++ */ ++ reverse = (major >= (1 << (ma_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp, ++ &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key < major) { ++ /* attach _after_ @tmp */ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ ++ } else { ++ list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list, ++ lk_nodes[dep].ln_major_list) { ++ if (tmp->lk_nodes[dep].ln_major_key == major) { ++ goto search_minor; ++ ++ } else if (tmp->lk_nodes[dep].ln_major_key > major) { ++ /* insert _before_ @tmp */ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &tmp->lk_nodes[dep].ln_major_list); ++ goto out_grant_major; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_major_list, ++ &lhead->lh_children[dep].lc_list); ++ goto out_grant_major; ++ } ++ ++ search_minor: ++ /* ++ * NB: minor_key list doesn't have a "head", @list is just a ++ * temporary stub for helping list searching, make sure it's removed ++ * after searching. ++ * minor_key list is an ordered list too. 
++ */ ++ list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list); ++ ++ reverse = (minor >= (1 << (mi_bits - 1))); ++ ++ if (reverse) { ++ list_for_each_entry_reverse(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) { ++ /* attach _after_ @tmp2 */ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add(&lck->lk_nodes[dep].ln_minor_list, &list); ++ ++ } else { ++ list_for_each_entry(tmp2, &list, ++ lk_nodes[dep].ln_minor_list) { ++ if (tmp2->lk_nodes[dep].ln_minor_key == minor) { ++ goto out_enqueue; ++ ++ } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) { ++ /* insert _before_ @tmp2 */ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, ++ &tmp2->lk_nodes[dep].ln_minor_list); ++ goto out_grant_minor; ++ } ++ } ++ ++ list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list); ++ } ++ ++ out_grant_minor: ++ if (list.next == &lck->lk_nodes[dep].ln_minor_list) { ++ /* new lock @lck is the first one on minor_key list, which ++ * means it has the smallest minor_key and it should ++ * replace @tmp as minor_key owner */ ++ list_replace_init(&tmp->lk_nodes[dep].ln_major_list, ++ &lck->lk_nodes[dep].ln_major_list); ++ } ++ /* remove the temporary head */ ++ list_del(&list); ++ ++ out_grant_major: ++ ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode); ++ return 1; /* granted while holding lh_lock */ ++ ++ out_enqueue: ++ list_del(&list); /* remove temporary head */ ++ return htree_node_lock_enqueue(lck, tmp2, dep, wait, event); ++} ++ ++/* ++ * release the key of @lck at level @dep, and grant any blocked locks. ++ * caller will still listen on @key if @event is not NULL, which means ++ * caller can see an event (via event_cb) while granting any lock with ++ * the same key at level @dep. ++ * NB: ALWAYS called holding lhead::lh_lock ++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL ++ */ ++static void ++htree_node_unlock_internal(struct htree_lock_head *lhead, ++ struct htree_lock *curlk, unsigned dep, void *event) ++{ ++ struct htree_lock_node *curln = &curlk->lk_nodes[dep]; ++ struct htree_lock *grtlk = NULL; ++ struct htree_lock_node *grtln; ++ struct htree_lock *poslk; ++ struct htree_lock *tmplk; ++ ++ if (!htree_node_is_granted(curlk, dep)) ++ return; ++ ++ if (!list_empty(&curln->ln_granted_list)) { ++ /* there is another granted lock */ ++ grtlk = list_entry(curln->ln_granted_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_granted_list); ++ list_del_init(&curln->ln_granted_list); ++ } ++ ++ if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) { ++ /* ++ * @curlk is the only granted lock, so we confirmed: ++ * a) curln is key owner (attached on major/minor_list), ++ * so if there is any blocked lock, it should be attached ++ * on curln->ln_blocked_list ++ * b) we can always grant the first blocked lock ++ */ ++ grtlk = list_entry(curln->ln_blocked_list.next, ++ struct htree_lock, ++ lk_nodes[dep].ln_blocked_list); ++ BUG_ON(grtlk->lk_task == NULL); ++ wake_up_process(grtlk->lk_task); ++ } ++ ++ if (event != NULL && ++ lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) { ++ curln->ln_ev_target = event; ++ curln->ln_mode = HTREE_LOCK_NL; /* listen! 
*/ ++ } else { ++ curln->ln_mode = HTREE_LOCK_INVAL; ++ } ++ ++ if (grtlk == NULL) { /* I must be the only one locking this key */ ++ struct htree_lock_node *tmpln; ++ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (curln->ln_mode == HTREE_LOCK_NL) /* listening */ ++ return; ++ ++ /* not listening */ ++ if (list_empty(&curln->ln_alive_list)) { /* no more listeners */ ++ htree_key_list_del_init(curln); ++ return; ++ } ++ ++ tmpln = list_entry(curln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL); ++ ++ htree_key_list_replace_init(curln, tmpln); ++ list_del_init(&curln->ln_alive_list); ++ ++ return; ++ } ++ ++ /* have a granted lock */ ++ grtln = &grtlk->lk_nodes[dep]; ++ if (!list_empty(&curln->ln_blocked_list)) { ++ /* only key owner can be on both lists */ ++ BUG_ON(htree_key_list_empty(curln)); ++ ++ if (list_empty(&grtln->ln_blocked_list)) { ++ list_add(&grtln->ln_blocked_list, ++ &curln->ln_blocked_list); ++ } ++ list_del_init(&curln->ln_blocked_list); ++ } ++ /* ++ * NB: this is the tricky part: ++ * We have only two modes for child-lock (PR and PW), also, ++ * only the owner of the key (attached on major/minor_list) can be on ++ * both blocked_list and granted_list, so @grtlk must be one ++ * of these two cases: ++ * ++ * a) @grtlk is taken from granted_list, which means we've granted ++ * more than one lock, so @grtlk has to be PR; the first blocked ++ * lock must be PW and we can't grant it at all. ++ * So even if @grtlk is not the owner of the key (empty ++ * blocked_list), we don't care because we can't grant any lock. ++ * b) we just granted a new lock taken from the head of the blocked ++ * list; it should be the first granted lock, and it should ++ * be the first one linked on blocked_list. ++ * ++ * Either way, we can get the correct result by iterating the ++ * blocked_list of @grtlk, and don't have to bother finding out ++ * the owner of the current key. 
++ */ ++ list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list, ++ lk_nodes[dep].ln_blocked_list) { ++ if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW || ++ poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW) ++ break; ++ /* grant all readers */ ++ list_del_init(&poslk->lk_nodes[dep].ln_blocked_list); ++ list_add(&poslk->lk_nodes[dep].ln_granted_list, ++ &grtln->ln_granted_list); ++ ++ BUG_ON(poslk->lk_task == NULL); ++ wake_up_process(poslk->lk_task); ++ } ++ ++ /* if @curln is the owner of this key, replace it with @grtln */ ++ if (!htree_key_list_empty(curln)) ++ htree_key_list_replace_init(curln, grtln); ++ ++ if (curln->ln_mode == HTREE_LOCK_INVAL) ++ list_del_init(&curln->ln_alive_list); ++} ++ ++/* ++ * it's just a wrapper around htree_node_lock_internal; it returns 1 when ++ * granted, and 0 only if @wait is false and the lock can't be granted ++ * immediately ++ */ ++int ++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event); ++ if (rc != 0) ++ htree_spin_unlock(lhead, dep); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_node_lock_try); ++ ++/* it's a wrapper around htree_node_unlock_internal */ ++void ++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ ++ BUG_ON(dep >= lck->lk_depth); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_spin_lock(lhead, dep); ++ htree_node_unlock_internal(lhead, lck, dep, event); ++ htree_spin_unlock(lhead, dep); ++} ++EXPORT_SYMBOL(htree_node_unlock); ++ ++/* stop listening on child-lock level @dep */ ++void ++htree_node_stop_listen(struct htree_lock *lck, unsigned dep) ++{ ++ struct htree_lock_node *ln = &lck->lk_nodes[dep]; ++ struct htree_lock_node *tmp; ++ ++ BUG_ON(htree_node_is_granted(lck, dep)); ++ BUG_ON(!list_empty(&ln->ln_blocked_list)); ++ BUG_ON(!list_empty(&ln->ln_granted_list)); ++ ++ if (!htree_node_is_listening(lck, dep)) ++ return; ++ ++ htree_spin_lock(lck->lk_head, dep); ++ ln->ln_mode = HTREE_LOCK_INVAL; ++ ln->ln_ev_target = NULL; ++ ++ if (htree_key_list_empty(ln)) { /* not owner */ ++ list_del_init(&ln->ln_alive_list); ++ goto out; ++ } ++ ++ /* I'm the owner... */ ++ if (list_empty(&ln->ln_alive_list)) { /* no more listeners */ ++ htree_key_list_del_init(ln); ++ goto out; ++ } ++ ++ tmp = list_entry(ln->ln_alive_list.next, ++ struct htree_lock_node, ln_alive_list); ++ ++ BUG_ON(tmp->ln_mode != HTREE_LOCK_NL); ++ htree_key_list_replace_init(ln, tmp); ++ list_del_init(&ln->ln_alive_list); ++ out: ++ htree_spin_unlock(lck->lk_head, dep); ++} ++EXPORT_SYMBOL(htree_node_stop_listen); ++ ++/* release all child-locks if we have any */ ++static void ++htree_node_release_all(struct htree_lock *lck) ++{ ++ int i; ++ ++ for (i = 0; i < lck->lk_depth; i++) { ++ if (htree_node_is_granted(lck, i)) ++ htree_node_unlock(lck, i, NULL); ++ else if (htree_node_is_listening(lck, i)) ++ htree_node_stop_listen(lck, i); ++ } ++} ++ ++/* ++ * obtain the htree lock; it can block inside if there's a conflict with ++ * any granted or blocked lock and @wait is true. 
++ * NB: ALWAYS called holding lhead::lh_lock ++ */ ++static int ++htree_lock_internal(struct htree_lock *lck, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int granted = 0; ++ int blocked = 0; ++ int i; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ if (lhead->lh_nblocked[i] != 0) ++ blocked |= 1 << i; ++ } ++ if ((htree_lock_compat[lck->lk_mode] & granted) != granted || ++ (htree_lock_compat[lck->lk_mode] & blocked) != blocked) { ++ /* will block current lock even if it just conflicts with any ++ * other blocked lock, so locks like EX won't starve */ ++ if (!wait) ++ return -1; ++ lhead->lh_nblocked[lck->lk_mode]++; ++ lk_block_inc(lck->lk_mode); ++ ++ lck->lk_task = current; ++ list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list); ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ /* wait to be given the lock */ ++ if (lck->lk_task != NULL) ++ schedule(); ++ /* granted, no doubt. wake up will set me RUNNING */ ++ return 0; /* without lh_lock */ ++ } ++ lhead->lh_ngranted[lck->lk_mode]++; ++ lk_grant_inc(lck->lk_mode); ++ return 1; ++} ++ ++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */ ++static void ++htree_unlock_internal(struct htree_lock *lck) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ struct htree_lock *tmp; ++ struct htree_lock *tmp2; ++ int granted = 0; ++ int i; ++ ++ BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0); ++ ++ lhead->lh_ngranted[lck->lk_mode]--; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ ++ for (i = 0; i < HTREE_LOCK_MAX; i++) { ++ if (lhead->lh_ngranted[i] != 0) ++ granted |= 1 << i; ++ } ++ list_for_each_entry_safe(tmp, tmp2, ++ &lhead->lh_blocked_list, lk_blocked_list) { ++ /* conflict with any granted lock? */ ++ if ((htree_lock_compat[tmp->lk_mode] & granted) != granted) ++ break; ++ ++ list_del_init(&tmp->lk_blocked_list); ++ ++ BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0); ++ ++ lhead->lh_nblocked[tmp->lk_mode]--; ++ lhead->lh_ngranted[tmp->lk_mode]++; ++ granted |= 1 << tmp->lk_mode; ++ ++ BUG_ON(tmp->lk_task == NULL); ++ wake_up_process(tmp->lk_task); ++ } ++} ++ ++/* it's a wrapper around htree_lock_internal and an exported interface. ++ * It always returns 1 with the lock granted if @wait is true; it can return ++ * 0 if @wait is false and the locking request can't be granted immediately */ ++int ++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait) ++{ ++ int rc; ++ ++ BUG_ON(lck->lk_depth > lhead->lh_depth); ++ BUG_ON(lck->lk_head != NULL); ++ BUG_ON(lck->lk_task != NULL); ++ ++ lck->lk_head = lhead; ++ lck->lk_mode = mode; ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_lock_try); ++ ++/* it's a wrapper around htree_unlock_internal and an exported interface. 
++ * It releases all htree_node_locks and the htree_lock */ ++void ++htree_unlock(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_head == NULL); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT); ++ lck->lk_head = NULL; ++ lck->lk_task = NULL; ++} ++EXPORT_SYMBOL(htree_unlock); ++ ++/* change lock mode */ ++void ++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode) ++{ ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL); ++ lck->lk_mode = mode; ++} ++EXPORT_SYMBOL(htree_change_mode); ++ ++/* release htree lock, and lock it again with new mode. ++ * This function will first release all htree_node_locks and the htree_lock, ++ * then try to regain the htree_lock with the new @mode. ++ * It always returns 1 with the lock granted if @wait is true; it can return ++ * 0 if @wait is false and the locking request can't be granted immediately */ ++int ++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait) ++{ ++ struct htree_lock_head *lhead = lck->lk_head; ++ int rc; ++ ++ BUG_ON(lhead == NULL); ++ BUG_ON(lck->lk_mode == mode); ++ BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL); ++ ++ htree_node_release_all(lck); ++ ++ htree_spin_lock(lhead, HTREE_DEP_ROOT); ++ htree_unlock_internal(lck); ++ lck->lk_mode = mode; ++ rc = htree_lock_internal(lck, wait); ++ if (rc != 0) ++ htree_spin_unlock(lhead, HTREE_DEP_ROOT); ++ return rc >= 0; ++} ++EXPORT_SYMBOL(htree_change_lock_try); ++ ++/* create a htree_lock head with @depth levels (number of child-locks), ++ * it is a per-resource structure */ ++struct htree_lock_head * ++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv) ++{ ++ struct htree_lock_head *lhead; ++ int i; ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ ++ lhead = kzalloc(offsetof(struct htree_lock_head, ++ lh_children[depth]) + priv, GFP_NOFS); ++ if (lhead == NULL) ++ return NULL; ++ ++ if (hbits < HTREE_HBITS_MIN) ++ lhead->lh_hbits = HTREE_HBITS_MIN; ++ else if (hbits > HTREE_HBITS_MAX) ++ lhead->lh_hbits = HTREE_HBITS_MAX; ++ else ++ lhead->lh_hbits = hbits; ++ ++ lhead->lh_lock = 0; ++ lhead->lh_depth = depth; ++ INIT_LIST_HEAD(&lhead->lh_blocked_list); ++ if (priv > 0) { ++ lhead->lh_private = (void *)lhead + ++ offsetof(struct htree_lock_head, lh_children[depth]); ++ } ++ ++ for (i = 0; i < depth; i++) { ++ INIT_LIST_HEAD(&lhead->lh_children[i].lc_list); ++ lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE; ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(htree_lock_head_alloc); ++ ++/* free the htree_lock head */ ++void ++htree_lock_head_free(struct htree_lock_head *lhead) ++{ ++ int i; ++ ++ BUG_ON(!list_empty(&lhead->lh_blocked_list)); ++ for (i = 0; i < lhead->lh_depth; i++) ++ BUG_ON(!list_empty(&lhead->lh_children[i].lc_list)); ++ kfree(lhead); ++} ++EXPORT_SYMBOL(htree_lock_head_free); ++ ++/* register event callback for @events of child-lock at level @dep */ ++void ++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep, ++ unsigned events, htree_event_cb_t callback) ++{ ++ BUG_ON(lhead->lh_depth <= dep); ++ lhead->lh_children[dep].lc_events = events; ++ lhead->lh_children[dep].lc_callback = callback; ++} ++EXPORT_SYMBOL(htree_lock_event_attach); ++ ++/* allocate a htree_lock, which is a per-thread structure; @pbytes is extra ++ * bytes of private data for the caller */ ++struct 
htree_lock * ++htree_lock_alloc(unsigned depth, unsigned pbytes) ++{ ++ struct htree_lock *lck; ++ int i = offsetof(struct htree_lock, lk_nodes[depth]); ++ ++ if (depth > HTREE_LOCK_DEP_MAX) { ++ printk(KERN_ERR "%d is larger than max htree_lock depth %d\n", ++ depth, HTREE_LOCK_DEP_MAX); ++ return NULL; ++ } ++ lck = kzalloc(i + pbytes, GFP_NOFS); ++ if (lck == NULL) ++ return NULL; ++ ++ if (pbytes != 0) ++ lck->lk_private = (void *)lck + i; ++ lck->lk_mode = HTREE_LOCK_INVAL; ++ lck->lk_depth = depth; ++ INIT_LIST_HEAD(&lck->lk_blocked_list); ++ ++ for (i = 0; i < depth; i++) { ++ struct htree_lock_node *node = &lck->lk_nodes[i]; ++ ++ node->ln_mode = HTREE_LOCK_INVAL; ++ INIT_LIST_HEAD(&node->ln_major_list); ++ INIT_LIST_HEAD(&node->ln_minor_list); ++ INIT_LIST_HEAD(&node->ln_alive_list); ++ INIT_LIST_HEAD(&node->ln_blocked_list); ++ INIT_LIST_HEAD(&node->ln_granted_list); ++ } ++ ++ return lck; ++} ++EXPORT_SYMBOL(htree_lock_alloc); ++ ++/* free htree_lock node */ ++void ++htree_lock_free(struct htree_lock *lck) ++{ ++ BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL); ++ kfree(lck); ++} ++EXPORT_SYMBOL(htree_lock_free); +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -53,6 +53,7 @@ struct buffer_head *ext4_append(handle_t + ext4_lblk_t *block) + { + struct buffer_head *bh; ++ struct ext4_inode_info *ei = EXT4_I(inode); + int err; + + if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && +@@ -60,15 +61,22 @@ struct buffer_head *ext4_append(handle_t + EXT4_SB(inode->i_sb)->s_max_dir_size_kb))) + return ERR_PTR(-ENOSPC); + ++ /* with parallel dir operations all appends ++ * have to be serialized -bzzz */ ++ down(&ei->i_append_sem); ++ + *block = inode->i_size >> inode->i_sb->s_blocksize_bits; + + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); +- if (IS_ERR(bh)) ++ if (IS_ERR(bh)) { ++ up(&ei->i_append_sem); + return bh; ++ } + inode->i_size += inode->i_sb->s_blocksize; + EXT4_I(inode)->i_disksize = inode->i_size; + BUFFER_TRACE(bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, bh); ++ up(&ei->i_append_sem); + if (err) { + brelse(bh); + ext4_std_error(inode->i_sb, err); +@@ -248,7 +256,8 @@ static unsigned dx_node_limit(struct ino + static struct dx_frame *dx_probe(struct ext4_filename *fname, + struct inode *dir, + struct dx_hash_info *hinfo, +- struct dx_frame *frame); ++ struct dx_frame *frame, ++ struct htree_lock *lck); + static void dx_release(struct dx_frame *frames); + static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, +@@ -262,12 +271,13 @@ static void dx_insert_block(struct dx_fr + static int ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash); ++ __u32 *start_hash, struct htree_lock *lck); + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir); ++ struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck); + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode); ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck); + + /* checksumming functions */ + void initialize_dirent_tail(struct ext4_dir_entry_tail *t, +@@ -730,6 +740,227 @@ struct stats dx_show_entries(struct dx_h + } + #endif /* DX_DEBUG */ + ++/* 
private data for htree_lock */ ++struct ext4_dir_lock_data { ++ unsigned ld_flags; /* bits-map for lock types */ ++ unsigned ld_count; /* # entries of the last DX block */ ++ struct dx_entry ld_at_entry; /* copy of leaf dx_entry */ ++ struct dx_entry *ld_at; /* position of leaf dx_entry */ ++}; ++ ++#define ext4_htree_lock_data(l) ((struct ext4_dir_lock_data *)(l)->lk_private) ++#define ext4_find_entry(dir, name, dirent, inline) \ ++ __ext4_find_entry(dir, name, dirent, inline, NULL) ++#define ext4_add_entry(handle, dentry, inode) \ ++ __ext4_add_entry(handle, dentry, inode, NULL) ++ ++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */ ++#define EXT4_HTREE_NODE_CHANGED (0xcafeULL << 32) ++ ++static void ext4_htree_event_cb(void *target, void *event) ++{ ++ u64 *block = (u64 *)target; ++ ++ if (*block == dx_get_block((struct dx_entry *)event)) ++ *block = EXT4_HTREE_NODE_CHANGED; ++} ++ ++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits) ++{ ++ struct htree_lock_head *lhead; ++ ++ lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0); ++ if (lhead != NULL) { ++ htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR, ++ ext4_htree_event_cb); ++ } ++ return lhead; ++} ++EXPORT_SYMBOL(ext4_htree_lock_head_alloc); ++ ++struct htree_lock *ext4_htree_lock_alloc(void) ++{ ++ return htree_lock_alloc(EXT4_LK_MAX, ++ sizeof(struct ext4_dir_lock_data)); ++} ++EXPORT_SYMBOL(ext4_htree_lock_alloc); ++ ++static htree_lock_mode_t ext4_htree_mode(unsigned flags) ++{ ++ switch (flags) { ++ default: /* 0 or unknown flags require EX lock */ ++ return HTREE_LOCK_EX; ++ case EXT4_HLOCK_READDIR: ++ return HTREE_LOCK_PR; ++ case EXT4_HLOCK_LOOKUP: ++ return HTREE_LOCK_CR; ++ case EXT4_HLOCK_DEL: ++ case EXT4_HLOCK_ADD: ++ return HTREE_LOCK_CW; ++ } ++} ++ ++/* return PR for read-only operations, otherwise return EX */ ++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags) ++{ ++ int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE; ++ ++ /* 0 requires EX lock */ ++ return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR; ++} ++ ++static int ext4_htree_safe_locked(struct htree_lock *lck) ++{ ++ int writer; ++ ++ if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX) ++ return 1; ++ ++ writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) == ++ EXT4_LB_DE; ++ if (writer) /* all readers & writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_EX; ++ ++ /* all writers are excluded? */ ++ return lck->lk_mode == HTREE_LOCK_PR || ++ lck->lk_mode == HTREE_LOCK_PW || ++ lck->lk_mode == HTREE_LOCK_EX; ++} ++ ++/* relock htree_lock with EX mode if it's a change operation, otherwise ++ * relock it with PR mode. It's a no-op if PDO is disabled. */ ++static void ext4_htree_safe_relock(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck)) { ++ unsigned flags = ext4_htree_lock_data(lck)->ld_flags; ++ ++ htree_change_lock(lck, ext4_htree_safe_mode(flags)); ++ } ++} ++ ++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead, ++ struct inode *dir, unsigned flags) ++{ ++ htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) : ++ ext4_htree_safe_mode(flags); ++ ++ ext4_htree_lock_data(lck)->ld_flags = flags; ++ htree_lock(lck, lhead, mode); ++ if (!is_dx(dir)) ++ ext4_htree_safe_relock(lck); /* make sure it's safe locked */ ++} ++EXPORT_SYMBOL(ext4_htree_lock); ++ ++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at, ++ unsigned lmask, int wait, void *ev) ++{ ++ u32 key = (at == NULL) ? 
0 : dx_get_block(at); ++ u32 mode; ++ ++ /* NOOP if htree is well protected or caller doesn't require the lock */ ++ if (ext4_htree_safe_locked(lck) || ++ !(ext4_htree_lock_data(lck)->ld_flags & lmask)) ++ return 1; ++ ++ mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ? ++ HTREE_LOCK_PW : HTREE_LOCK_PR; ++ while (1) { ++ if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev)) ++ return 1; ++ if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */ ++ return 0; ++ cpu_relax(); /* spin until granted */ ++ } ++} ++ ++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask) ++{ ++ return ext4_htree_safe_locked(lck) || ++ htree_node_is_granted(lck, ffz(~lmask)); ++} ++ ++static void ext4_htree_node_unlock(struct htree_lock *lck, ++ unsigned lmask, void *buf) ++{ ++ /* NB: it's safe to call multiple times, even if it's not locked */ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_granted(lck, ffz(~lmask))) ++ htree_node_unlock(lck, ffz(~lmask), buf); ++} ++ ++#define ext4_htree_dx_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL) ++#define ext4_htree_dx_lock_try(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL) ++#define ext4_htree_dx_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL) ++#define ext4_htree_dx_locked(lck) \ ++ ext4_htree_node_locked(lck, EXT4_LB_DX) ++ ++static void ext4_htree_dx_need_lock(struct htree_lock *lck) ++{ ++ struct ext4_dir_lock_data *ld; ++ ++ if (ext4_htree_safe_locked(lck)) ++ return; ++ ++ ld = ext4_htree_lock_data(lck); ++ switch (ld->ld_flags) { ++ default: ++ return; ++ case EXT4_HLOCK_LOOKUP: ++ ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE; ++ return; ++ case EXT4_HLOCK_DEL: ++ ld->ld_flags = EXT4_HLOCK_DEL_SAFE; ++ return; ++ case EXT4_HLOCK_ADD: ++ ld->ld_flags = EXT4_HLOCK_SPLIT; ++ return; ++ } ++} ++ ++#define ext4_htree_de_lock(lck, key) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL) ++#define ext4_htree_de_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL) ++ ++#define ext4_htree_spin_lock(lck, key, event) \ ++ ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event) ++#define ext4_htree_spin_unlock(lck) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL) ++#define ext4_htree_spin_unlock_listen(lck, p) \ ++ ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p) ++ ++static void ext4_htree_spin_stop_listen(struct htree_lock *lck) ++{ ++ if (!ext4_htree_safe_locked(lck) && ++ htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN))) ++ htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN)); ++} ++ ++enum { ++ DX_HASH_COL_IGNORE, /* ignore collision while probing frames */ ++ DX_HASH_COL_YES, /* there is collision and it does matter */ ++ DX_HASH_COL_NO, /* there is no collision */ ++}; ++ ++static int dx_probe_hash_collision(struct htree_lock *lck, ++ struct dx_entry *entries, ++ struct dx_entry *at, u32 hash) ++{ ++ if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) { ++ return DX_HASH_COL_IGNORE; /* don't care about collision */ ++ ++ } else if (at == entries + dx_get_count(entries) - 1) { ++ return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */ ++ ++ } else { /* hash collision? */ ++ return ((dx_get_hash(at + 1) & ~1) == hash) ? ++ DX_HASH_COL_YES : DX_HASH_COL_NO; ++ } ++} ++ + /* + * Probe for a directory leaf block to search. 
+ * +@@ -741,10 +972,11 @@ struct stats dx_show_entries(struct dx_h + */ + static struct dx_frame * + dx_probe(struct ext4_filename *fname, struct inode *dir, +- struct dx_hash_info *hinfo, struct dx_frame *frame_in) ++ struct dx_hash_info *hinfo, struct dx_frame *frame_in, ++ struct htree_lock *lck) + { + unsigned count, indirect; +- struct dx_entry *at, *entries, *p, *q, *m; ++ struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL; + struct dx_root_info *info; + struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); +@@ -806,8 +1038,15 @@ dx_probe(struct ext4_filename *fname, st + + dxtrace(printk("Look up %x", hash)); + while (1) { ++ if (indirect == 0) { /* the last index level */ ++ /* NB: ext4_htree_dx_lock() could be a no-op if ++ * DX-lock flag is not set for current operation */ ++ ext4_htree_dx_lock(lck, dx); ++ ext4_htree_spin_lock(lck, dx, NULL); ++ } + count = dx_get_count(entries); +- if (!count || count > dx_get_limit(entries)) { ++ if (count == 0 || count > dx_get_limit(entries)) { ++ ext4_htree_spin_unlock(lck); /* release spin */ + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); +@@ -846,8 +1085,70 @@ dx_probe(struct ext4_filename *fname, st + dx_get_block(at))); + frame->entries = entries; + frame->at = at; +- if (!indirect--) ++ ++ if (indirect == 0) { /* the last index level */ ++ struct ext4_dir_lock_data *ld; ++ u64 myblock; ++ ++ /* By default we only lock DE-block, however, we will ++ * also lock the last level DX-block if: ++ * a) there is hash collision ++ * we will set DX-lock flag (a few lines below) ++ * and redo to lock DX-block ++ * see detail in dx_probe_hash_collision() ++ * b) it's a retry from splitting ++ * we need to lock the last level DX-block so nobody ++ * else can split any leaf blocks under the same ++ * DX-block, see detail in ext4_dx_add_entry() ++ */ ++ if (ext4_htree_dx_locked(lck)) { ++ /* DX-block is locked, just lock DE-block ++ * and return */ ++ ext4_htree_spin_unlock(lck); ++ if (!ext4_htree_safe_locked(lck)) ++ ext4_htree_de_lock(lck, frame->at); ++ return frame; ++ } ++ /* it's pdirop and no DX lock */ ++ if (dx_probe_hash_collision(lck, entries, at, hash) == ++ DX_HASH_COL_YES) { ++ /* found hash collision, set DX-lock flag ++ * and retry to obtain DX-lock */ ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_need_lock(lck); ++ continue; ++ } ++ ld = ext4_htree_lock_data(lck); ++ /* because I don't lock DX, @at can't be trusted ++ * after I release the spinlock, so I have to save it */ ++ ld->ld_at = at; ++ ld->ld_at_entry = *at; ++ ld->ld_count = dx_get_count(entries); ++ ++ frame->at = &ld->ld_at_entry; ++ myblock = dx_get_block(at); ++ ++ /* NB: ordering locking */ ++ ext4_htree_spin_unlock_listen(lck, &myblock); ++ /* other thread can split this DE-block because: ++ * a) I don't have lock for the DE-block yet ++ * b) I released spinlock on DX-block ++ * if it happened I can detect it by listening ++ * splitting event on this DE-block */ ++ ext4_htree_de_lock(lck, frame->at); ++ ext4_htree_spin_stop_listen(lck); ++ ++ if (myblock == EXT4_HTREE_NODE_CHANGED) { ++ /* someone split this DE-block before ++ * I locked it, I need to retry and lock ++ * a valid DE-block */ ++ ext4_htree_de_unlock(lck); ++ continue; ++ } + return frame; ++ } ++ dx = at; ++ indirect--; + frame++; + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(frame->bh)) { +@@ -913,7 +1214,7 @@ static void dx_release(struct dx_frame * + static int 
ext4_htree_next_block(struct inode *dir, __u32 hash, + struct dx_frame *frame, + struct dx_frame *frames, +- __u32 *start_hash) ++ __u32 *start_hash, struct htree_lock *lck) + { + struct dx_frame *p; + struct buffer_head *bh; +@@ -928,12 +1229,22 @@ static int ext4_htree_next_block(struct + * this loop, num_frames indicates the number of interior + * nodes need to be read. + */ ++ ext4_htree_de_unlock(lck); + while (1) { +- if (++(p->at) < p->entries + dx_get_count(p->entries)) +- break; ++ if (num_frames > 0 || ext4_htree_dx_locked(lck)) { ++ /* num_frames > 0 : ++ * DX block ++ * ext4_htree_dx_locked: ++ * frame->at is a reliable pointer returned by dx_probe, ++ * otherwise dx_probe already knew there was no collision */ ++ if (++(p->at) < p->entries + dx_get_count(p->entries)) ++ break; ++ } + if (p == frames) + return 0; + num_frames++; ++ if (num_frames == 1) ++ ext4_htree_dx_unlock(lck); + p--; + } + +@@ -956,6 +1267,13 @@ static int ext4_htree_next_block(struct + * block so no check is necessary + */ + while (num_frames--) { ++ if (num_frames == 0) { ++ /* it's not always necessary, we just don't want to ++ * detect hash collisions again */ ++ ext4_htree_dx_need_lock(lck); ++ ext4_htree_dx_lock(lck, p->at); ++ } ++ + bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX); + if (IS_ERR(bh)) + return PTR_ERR(bh); +@@ -964,6 +1282,7 @@ static int ext4_htree_next_block(struct + p->bh = bh; + p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; + } ++ ext4_htree_de_lock(lck, p->at); + return 1; + } + +@@ -1111,10 +1430,10 @@ int ext4_htree_fill_tree(struct file *di + } + hinfo.hash = start_hash; + hinfo.minor_hash = 0; +- frame = dx_probe(NULL, dir, &hinfo, frames); ++ /* assume it's PR locked */ ++ frame = dx_probe(NULL, dir, &hinfo, frames, NULL); + if (IS_ERR(frame)) + return PTR_ERR(frame); +- + /* Add '.' and '..' from the htree header */ + if (!start_hash && !start_minor_hash) { + de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; +@@ -1154,7 +1473,7 @@ int ext4_htree_fill_tree(struct file *di + count += ret; + hashval = ~0; + ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS, +- frame, frames, &hashval); ++ frame, frames, &hashval, NULL); + *next_hash = hashval; + if (ret < 0) { + err = ret; +@@ -1346,10 +1665,10 @@ static int is_dx_internal_node(struct in + * The returned buffer_head has ->b_count elevated. The caller is expected + * to brelse() it when appropriate. + */ +-static struct buffer_head * ext4_find_entry (struct inode *dir, ++struct buffer_head *__ext4_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, +- int *inlined) ++ int *inlined, struct htree_lock *lck) + { + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; +@@ -1398,7 +1717,7 @@ static struct buffer_head * ext4_find_en + goto restart; + } + if (is_dx(dir)) { +- ret = ext4_dx_find_entry(dir, &fname, res_dir); ++ ret = ext4_dx_find_entry(dir, &fname, res_dir, lck); + /* + * On success, or if the error was file not found, + * return. 
Otherwise, fall back to doing a search the +@@ -1408,6 +1727,7 @@ static struct buffer_head * ext4_find_en + goto cleanup_and_exit; + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); ++ ext4_htree_safe_relock(lck); + } + nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); + if (!nblocks) { +@@ -1495,10 +1815,12 @@ cleanup_and_exit: + ext4_fname_free_filename(&fname); + return ret; + } ++EXPORT_SYMBOL(__ext4_find_entry); + + static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, +- struct ext4_dir_entry_2 **res_dir) ++ struct ext4_dir_entry_2 **res_dir, ++ struct htree_lock *lck) + { + struct super_block * sb = dir->i_sb; + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; +@@ -1509,7 +1831,7 @@ static struct buffer_head * ext4_dx_find + #ifdef CONFIG_EXT4_FS_ENCRYPTION + *res_dir = NULL; + #endif +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; + do { +@@ -1531,7 +1853,7 @@ static struct buffer_head * ext4_dx_find + + /* Check to see if we should continue to search */ + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, +- frames, NULL); ++ frames, NULL, lck); + if (retval < 0) { + ext4_warning_inode(dir, + "error %d reading directory index block", +@@ -1706,8 +2028,9 @@ static struct ext4_dir_entry_2* dx_pack_ + * Returns pointer to de in block into which the new entry will be inserted. + */ + static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, +- struct buffer_head **bh,struct dx_frame *frame, +- struct dx_hash_info *hinfo) ++ struct buffer_head **bh, struct dx_frame *frames, ++ struct dx_frame *frame, struct dx_hash_info *hinfo, ++ struct htree_lock *lck) + { + unsigned blocksize = dir->i_sb->s_blocksize; + unsigned count, continued; +@@ -1769,8 +2092,14 @@ static struct ext4_dir_entry_2 *do_split + hash2, split, count-split)); + + /* Fancy dance to stay within two buffers */ +- de2 = dx_move_dirents(data1, data2, map + split, count - split, +- blocksize); ++ if (hinfo->hash < hash2) { ++ de2 = dx_move_dirents(data1, data2, map + split, ++ count - split, blocksize); ++ } else { ++ /* make sure we will add entry to the same block which ++ * we have already locked */ ++ de2 = dx_move_dirents(data1, data2, map, split, blocksize); ++ } + de = dx_pack_dirents(data1, blocksize); + de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - + (char *) de, +@@ -1791,12 +2120,21 @@ static struct ext4_dir_entry_2 *do_split + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); + +- /* Which block gets the new entry? */ +- if (hinfo->hash >= hash2) { +- swap(*bh, bh2); +- de = de2; ++ ext4_htree_spin_lock(lck, frame > frames ? 
(frame - 1)->at : NULL, ++ frame->at); /* notify block is being split */ ++ if (hinfo->hash < hash2) { ++ dx_insert_block(frame, hash2 + continued, newblock); ++ ++ } else { ++ /* switch block number */ ++ dx_insert_block(frame, hash2 + continued, ++ dx_get_block(frame->at)); ++ dx_set_block(frame->at, newblock); ++ (frame->at)++; + } +- dx_insert_block(frame, hash2 + continued, newblock); ++ ext4_htree_spin_unlock(lck); ++ ext4_htree_dx_unlock(lck); ++ + err = ext4_handle_dirty_dirent_node(handle, dir, bh2); + if (err) + goto journal_error; +@@ -2070,7 +2408,7 @@ static int make_indexed_dir(handle_t *ha + if (retval) + goto out_frames; + +- de = do_split(handle,dir, &bh2, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh2, frames, frame, &fname->hinfo, NULL); + if (IS_ERR(de)) { + retval = PTR_ERR(de); + goto out_frames; +@@ -2180,8 +2518,8 @@ out: + * may not sleep between calling this and putting something into + * the entry, as someone else might have used it while you slept. + */ +-static int ext4_add_entry(handle_t *handle, struct dentry *dentry, +- struct inode *inode) ++int __ext4_add_entry(handle_t *handle, struct dentry *dentry, ++ struct inode *inode, struct htree_lock *lck) + { + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; +@@ -2222,9 +2560,10 @@ static int ext4_add_entry(handle_t *hand + if (dentry->d_name.len == 2 && + memcmp(dentry->d_name.name, "..", 2) == 0) + return ext4_update_dotdot(handle, dentry, inode); +- retval = ext4_dx_add_entry(handle, &fname, dir, inode); ++ retval = ext4_dx_add_entry(handle, &fname, dir, inode, lck); + if (!retval || (retval != ERR_BAD_DX_DIR)) + goto out; ++ ext4_htree_safe_relock(lck); + ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); + dx_fallback++; + ext4_mark_inode_dirty(handle, dir); +@@ -2274,12 +2613,14 @@ out: + ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); + return retval; + } ++EXPORT_SYMBOL(__ext4_add_entry); + + /* + * Returns 0 for success, or a negative error value + */ + static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, +- struct inode *dir, struct inode *inode) ++ struct inode *dir, struct inode *inode, ++ struct htree_lock *lck) + { + struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; + struct dx_entry *entries, *at; +@@ -2291,7 +2632,7 @@ static int ext4_dx_add_entry(handle_t *h + + again: + restart = 0; +- frame = dx_probe(fname, dir, NULL, frames); ++ frame = dx_probe(fname, dir, NULL, frames, lck); + if (IS_ERR(frame)) + return PTR_ERR(frame); + entries = frame->entries; +@@ -2326,6 +2667,11 @@ again: + struct dx_node *node2; + struct buffer_head *bh2; + ++ if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */ ++ ext4_htree_safe_relock(lck); ++ restart = 1; ++ goto cleanup; ++ } + while (frame > frames) { + if (dx_get_count((frame - 1)->entries) < + dx_get_limit((frame - 1)->entries)) { +@@ -2428,8 +2774,32 @@ again: + restart = 1; + goto journal_error; + } ++ } else if (!ext4_htree_dx_locked(lck)) { ++ struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck); ++ ++ /* not well protected, require DX lock */ ++ ext4_htree_dx_need_lock(lck); ++ at = frame > frames ? (frame - 1)->at : NULL; ++ ++ /* NB: no risk of deadlock because it's just a try. ++ * ++ * NB: we check ld_count twice, the first time before ++ * having the DX lock, the second time after holding it. 
++ * ++ * NB: We never free blocks for directory so far, which ++ * means the value returned by dx_get_count() should equal ++ * ld->ld_count if nobody split any DE-block under @at, ++ * and ld->ld_at still points to a valid dx_entry. */ ++ if ((ld->ld_count != dx_get_count(entries)) || ++ !ext4_htree_dx_lock_try(lck, at) || ++ (ld->ld_count != dx_get_count(entries))) { ++ restart = 1; ++ goto cleanup; ++ } ++ /* OK, I've got DX lock and nothing changed */ ++ frame->at = ld->ld_at; + } +- de = do_split(handle, dir, &bh, frame, &fname->hinfo); ++ de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck); + if (IS_ERR(de)) { + err = PTR_ERR(de); + goto cleanup; +@@ -2440,6 +2810,8 @@ again: + journal_error: + ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */ + cleanup: ++ ext4_htree_dx_unlock(lck); ++ ext4_htree_de_unlock(lck); + brelse(bh); + dx_release(frames); + /* @restart is true means htree-path has been changed, we need to +Index: linux-4.15.0/fs/ext4/super.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/super.c ++++ linux-4.15.0/fs/ext4/super.c +@@ -975,6 +975,7 @@ static struct inode *ext4_alloc_inode(st + + ei->vfs_inode.i_version = 1; + spin_lock_init(&ei->i_raw_lock); ++ sema_init(&ei->i_append_sem, 1); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); + ext4_es_init_tree(&ei->i_es_tree); +Index: linux-4.15.0/include/linux/htree_lock.h +=================================================================== +--- /dev/null ++++ linux-4.15.0/include/linux/htree_lock.h +@@ -0,0 +1,187 @@ ++/* ++ * include/linux/htree_lock.h ++ * ++ * Copyright (c) 2011, 2012, Intel Corporation. ++ * ++ * Author: Liang Zhen ++ */ ++ ++/* ++ * htree lock ++ * ++ * htree_lock is an advanced lock, it can support five lock modes (concept is ++ * taken from DLM) and it's a sleeping lock. ++ * ++ * the most common use case is: ++ * - create a htree_lock_head for data ++ * - each thread (contender) creates its own htree_lock ++ * - contender needs to call htree_lock(lock_node, mode) to protect data and ++ * call htree_unlock to release lock ++ * ++ * Also, there is an advanced use-case which is more complex: a user can have ++ * a PW/PR lock on a particular key, which is mostly used while the user is ++ * holding a shared lock on the htree (CW, CR) ++ * ++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR ++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR ++ * ... 
++ * htree_node_unlock(lock_node); unlock the key ++ * ++ * Another tip: we can have N levels of this kind of key; all we need to ++ * do is specify N levels while creating the htree_lock_head, then we can ++ * lock/unlock a specific level by: ++ * htree_node_lock(lock_node, mode1, key1, level1...); ++ * do something; ++ * htree_node_lock(lock_node, mode1, key2, level2...); ++ * do something; ++ * htree_node_unlock(lock_node, level2); ++ * htree_node_unlock(lock_node, level1); ++ * ++ * NB: for multi-level locks, be careful about locking order to avoid deadlock ++ */ ++ ++#ifndef _LINUX_HTREE_LOCK_H ++#define _LINUX_HTREE_LOCK_H ++ ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/sched.h> ++ ++/* ++ * Lock Modes ++ * more details can be found here: ++ * http://en.wikipedia.org/wiki/Distributed_lock_manager ++ */ ++typedef enum { ++ HTREE_LOCK_EX = 0, /* exclusive lock: incompatible with all others */ ++ HTREE_LOCK_PW, /* protected write: allows only CR users */ ++ HTREE_LOCK_PR, /* protected read: allow PR, CR users */ ++ HTREE_LOCK_CW, /* concurrent write: allow CR, CW users */ ++ HTREE_LOCK_CR, /* concurrent read: allow all but EX users */ ++ HTREE_LOCK_MAX, /* number of lock modes */ ++} htree_lock_mode_t; ++ ++#define HTREE_LOCK_NL HTREE_LOCK_MAX ++#define HTREE_LOCK_INVAL 0xdead10c ++ ++enum { ++ HTREE_HBITS_MIN = 2, ++ HTREE_HBITS_DEF = 14, ++ HTREE_HBITS_MAX = 32, ++}; ++ ++enum { ++ HTREE_EVENT_DISABLE = (0), ++ HTREE_EVENT_RD = (1 << HTREE_LOCK_PR), ++ HTREE_EVENT_WR = (1 << HTREE_LOCK_PW), ++ HTREE_EVENT_RDWR = (HTREE_EVENT_RD | HTREE_EVENT_WR), ++}; ++ ++struct htree_lock; ++ ++typedef void (*htree_event_cb_t)(void *target, void *event); ++ ++struct htree_lock_child { ++ struct list_head lc_list; /* granted list */ ++ htree_event_cb_t lc_callback; /* event callback */ ++ unsigned lc_events; /* event types */ ++}; ++ ++struct htree_lock_head { ++ unsigned long lh_lock; /* bits lock */ ++ /* blocked lock list (htree_lock) */ ++ struct list_head lh_blocked_list; ++ /* # key levels */ ++ u16 lh_depth; ++ /* hash bits for key and limit number of locks */ ++ u16 lh_hbits; ++ /* counters for blocked locks */ ++ u16 lh_nblocked[HTREE_LOCK_MAX]; ++ /* counters for granted locks */ ++ u16 lh_ngranted[HTREE_LOCK_MAX]; ++ /* private data */ ++ void *lh_private; ++ /* array of children locks */ ++ struct htree_lock_child lh_children[0]; ++}; ++ ++/* htree_lock_node_t is child-lock for a specific key (ln_value) */ ++struct htree_lock_node { ++ htree_lock_mode_t ln_mode; ++ /* major hash key */ ++ u16 ln_major_key; ++ /* minor hash key */ ++ u16 ln_minor_key; ++ struct list_head ln_major_list; ++ struct list_head ln_minor_list; ++ /* alive list, all locks (granted, blocked, listening) are on it */ ++ struct list_head ln_alive_list; ++ /* blocked list */ ++ struct list_head ln_blocked_list; ++ /* granted list */ ++ struct list_head ln_granted_list; ++ void *ln_ev_target; ++}; ++ ++struct htree_lock { ++ struct task_struct *lk_task; ++ struct htree_lock_head *lk_head; ++ void *lk_private; ++ unsigned lk_depth; ++ htree_lock_mode_t lk_mode; ++ struct list_head lk_blocked_list; ++ struct htree_lock_node lk_nodes[0]; ++}; ++ ++/* create a lock head, which stands for a resource */ ++struct htree_lock_head *htree_lock_head_alloc(unsigned depth, ++ unsigned hbits, unsigned priv); ++/* free a lock head */ ++void htree_lock_head_free(struct htree_lock_head *lhead); ++/* register event callback for child lock at level @depth */ ++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth, ++ unsigned events, 
htree_event_cb_t callback); ++/* create a lock handle, which stands for a thread */ ++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes); ++/* free a lock handle */ ++void htree_lock_free(struct htree_lock *lck); ++/* lock htree; when @wait is false, 0 is returned if the lock can't ++ * be granted immediately */ ++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead, ++ htree_lock_mode_t mode, int wait); ++/* unlock htree */ ++void htree_unlock(struct htree_lock *lck); ++/* unlock and relock htree with @new_mode */ ++int htree_change_lock_try(struct htree_lock *lck, ++ htree_lock_mode_t new_mode, int wait); ++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode); ++/* acquire child lock (key) of htree at level @dep; @event will be sent to all ++ * listeners on this @key when the lock is granted */ ++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, ++ u32 key, unsigned dep, int wait, void *event); ++/* release child lock at level @dep; this lock will listen on its key ++ * if @event isn't NULL, event_cb will be called against @lck while granting ++ * any other lock at level @dep with the same key */ ++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event); ++/* stop listening on child lock at level @dep */ ++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep); ++/* for debug */ ++void htree_lock_stat_print(int depth); ++void htree_lock_stat_reset(void); ++ ++#define htree_lock(lck, lh, mode) htree_lock_try(lck, lh, mode, 1) ++#define htree_change_lock(lck, mode) htree_change_lock_try(lck, mode, 1) ++ ++#define htree_lock_mode(lck) ((lck)->lk_mode) ++ ++#define htree_node_lock(lck, mode, key, dep) \ ++ htree_node_lock_try(lck, mode, key, dep, 1, NULL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_granted(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \ ++ (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL) ++/* this is only safe in thread context of lock owner */ ++#define htree_node_is_listening(lck, dep) \ ++ ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL) ++ ++#endif diff --git a/ldiskfs/kernel_patches/patches/ubuntu18/ext4-remove-truncate-warning.patch b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-remove-truncate-warning.patch new file mode 100644 index 0000000..17bc7e3 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ubuntu18/ext4-remove-truncate-warning.patch @@ -0,0 +1,39 @@ +Invoking ext4_truncate with i_mutex held would cause a deadlock in +Lustre. Since Lustre has its own locks to provide protection we don't +need this check at all. + +Index: linux-4.15.0/fs/ext4/inode.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/inode.c ++++ linux-4.15.0/fs/ext4/inode.c +@@ -4398,8 +4398,6 @@ int ext4_truncate(struct inode *inode) + * or it's a completely new inode. In those cases we might not + * have i_mutex locked because it's not necessary. 
*/ +- if (!(inode->i_state & (I_NEW|I_FREEING))) +- WARN_ON(!inode_is_locked(inode)); + trace_ext4_truncate_enter(inode); + + if (!ext4_can_truncate(inode)) +Index: linux-4.15.0/fs/ext4/namei.c +=================================================================== +--- linux-4.15.0.orig/fs/ext4/namei.c ++++ linux-4.15.0/fs/ext4/namei.c +@@ -3370,8 +3370,6 @@ int ext4_orphan_add(handle_t *handle, st + if (!sbi->s_journal || is_bad_inode(inode)) + return 0; + +- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && +- !inode_is_locked(inode)); + /* + * Exit early if inode already is on orphan list. This is a big speedup + * since we don't have to contend on the global s_orphan_lock. +@@ -3452,8 +3450,6 @@ int ext4_orphan_del(handle_t *handle, st + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) + return 0; + +- WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && +- !inode_is_locked(inode)); + /* Do this quick check before taking global s_orphan_lock. */ + if (list_empty(&ei->i_orphan)) + return 0; diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.15.0-20-ubuntu18.series b/ldiskfs/kernel_patches/series/ldiskfs-4.15.0-20-ubuntu18.series new file mode 100644 index 0000000..481d262 --- /dev/null +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.15.0-20-ubuntu18.series @@ -0,0 +1,22 @@ +sles12sp2/ext4-inode-version.patch +sles12sp2/ext4-lookup-dotdot.patch +sles12sp2/ext4-print-inum-in-htree-warning.patch +sles12sp2/ext4-prealloc.patch +ubuntu18/ext4-osd-iop-common.patch +ubuntu18/ext4-misc.patch +ubuntu18/ext4-mballoc-extra-checks.patch +ubuntu18/ext4-hash-indexed-dir-dotdot-update.patch +ubuntu18/ext4-kill-dx-root.patch +rhel7/ext4-mballoc-pa-free-mismatch.patch +ubuntu18/ext4-data-in-dirent.patch +ubuntu18/ext4-nocmtime.patch +ubuntu18/ext4-pdirop.patch +sles12sp2/ext4-max-dir-size.patch +ubuntu18/ext4-remove-truncate-warning.patch +ubuntu18/ext4-corrupted-inode-block-bitmaps-handling-patches.patch +ubuntu18/ext4-give-warning-with-dir-htree-growing.patch +ubuntu18/ext4-jcb-optimization.patch +ubuntu18/ext4-attach-jinode-in-writepages.patch +ubuntu18/ext4-dont-check-before-replay.patch +rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 909cc7e..8607f57 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -249,16 +249,12 @@ AS_IF([test $ENABLEO2IB = "no"], [ # we know at this point that the found OFED source is good O2IB_SYMVER="" if test $ENABLEO2IB = "withpath" -o "x$OFED" = "xyes" ; then - # OFED default rpm not handle sles10 Modules.symvers name - for name in Module.symvers Modules.symvers; do - if test -f $O2IBPATH/$name; then - O2IB_SYMVER=$name; - break; - fi - done + if test -f $O2IBPATH/Module.symvers; then + O2IB_SYMVER=$O2IBPATH/Module.symvers; + fi if test -n "$O2IB_SYMVER"; then - AC_MSG_NOTICE([adding $O2IBPATH/$O2IB_SYMVER to Symbol Path]) - EXTRA_SYMBOLS="$EXTRA_SYMBOLS $O2IBPATH/$O2IB_SYMVER" + AC_MSG_NOTICE([adding $O2IB_SYMVER to Symbol Path]) + EXTRA_SYMBOLS="$EXTRA_SYMBOLS $O2IB_SYMVER" AC_SUBST(EXTRA_SYMBOLS) else AC_MSG_ERROR([an external source tree was, either specified or detected, for o2iblnd however I could not find a $O2IBPATH/Module.symvers there]) fi diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index ce53272..cb27144 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -2537,6 +2537,24 @@ generic_write_sync_2args, [ ]) # 
LC_GENERIC_WRITE_SYNC_2ARGS # +# LC_FOP_ITERATE_SHARED +# +# Kernel version 4.7 adds iterate_shared method to file_operations +# +AC_DEFUN([LC_FOP_ITERATE_SHARED], [ +LB_CHECK_COMPILE([if 'file_operations' has 'iterate_shared'], +fop_iterate_shared, [ + #include <linux/fs.h> +],[ + struct file_operations fop; + fop.iterate_shared = NULL; +],[ + AC_DEFINE(HAVE_FOP_ITERATE_SHARED, 1, + [file_operations has iterate_shared]) +]) +]) # LC_FOP_ITERATE_SHARED + +# # LC_FOPS_ITERATE_SHARED # # 4.7 commit ae05327a00fd47c34dfe25294b359a3f3fef96e8 diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 27189e0..50a3395 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -1415,4 +1415,10 @@ void osd_trunc_unlock_all(struct list_head *list); void osd_process_truncates(struct list_head *list); void osd_execute_truncate(struct osd_object *obj); +/* + * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE is 1MB. + * This limit is arbitrary, but is reasonable for the xattr API. + */ +#define LDISKFS_XATTR_MAX_LARGE_EA_SIZE (1024 * 1024) + #endif /* _OSD_INTERNAL_H */
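
For reference, a minimal usage sketch of the htree_lock API that ext4-pdirop.patch introduces, following the "most common use case" described in include/linux/htree_lock.h above. The demo_* names and the DEMO_DEPTH value are illustrative only and are not part of the patch:

#include <linux/errno.h>
#include <linux/htree_lock.h>

#define DEMO_DEPTH	2	/* two key levels, e.g. index and leaf blocks */

static struct htree_lock_head *demo_lhead;

static int demo_init(void)
{
	/* one head per shared resource; no private bytes appended */
	demo_lhead = htree_lock_head_alloc(DEMO_DEPTH, HTREE_HBITS_DEF, 0);
	return demo_lhead != NULL ? 0 : -ENOMEM;
}

static void demo_read_key(u32 key)
{
	/* one handle per thread; no private bytes appended */
	struct htree_lock *lck = htree_lock_alloc(DEMO_DEPTH, 0);

	if (lck == NULL)
		return;

	/* shared (CR) lock on the whole tree ... */
	htree_lock(lck, demo_lhead, HTREE_LOCK_CR);
	/* ... then PR on one key at level 0, excluding PW holders only */
	htree_node_lock(lck, HTREE_LOCK_PR, key, 0);

	/* read the data guarded by @key here */

	htree_node_unlock(lck, 0, NULL);	/* NULL: don't stay listening */
	htree_unlock(lck);
	htree_lock_free(lck);
}

The explicit htree_node_unlock() is for clarity only; htree_unlock() releases any remaining child locks itself through htree_node_release_all().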
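
The listener/event mechanism that dx_probe() relies on to detect a DE-block split can also be sketched in isolation. Again the demo_* names are hypothetical; the callback contract follows htree_lock_event_attach() and htree_node_unlock() above, and both sides are assumed to already hold a shared lock on the same htree_lock_head:

static void demo_moved_cb(void *target, void *event)
{
	/* @target was registered by the listener in htree_node_unlock(),
	 * @event is the pointer supplied by the PW locker */
	*(u64 *)target = *(u64 *)event;
}

static void demo_attach_events(struct htree_lock_head *lhead)
{
	/* deliver an event to listeners whenever a PW lock is granted
	 * on a level-0 key */
	htree_lock_event_attach(lhead, 0, HTREE_EVENT_WR, demo_moved_cb);
}

static void demo_reader_side(struct htree_lock *lck, u32 key, u64 *cookie)
{
	/* assumes PR is held on @key at level 0: drop it but keep
	 * listening, so a later PW locker that passes an event pointer
	 * fires demo_moved_cb() with @cookie as target */
	htree_node_unlock(lck, 0, cookie);

	/* ... retake other locks, then test whether *cookie changed ... */

	htree_node_stop_listen(lck, 0);
}

static void demo_writer_side(struct htree_lock *lck, u32 key, u64 *newblock)
{
	/* PW lock with an event pointer: every listener on @key sees it */
	htree_node_lock_try(lck, HTREE_LOCK_PW, key, 0, 1, newblock);

	/* ... move entries to *newblock ... */

	htree_node_unlock(lck, 0, NULL);
}

This mirrors what ext4_htree_spin_unlock_listen() and ext4_htree_event_cb() do in the patch: the reader parks a block number as its cookie, and a writer that splits the block broadcasts the dx_entry it modified so the reader can notice and re-probe.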