Whamcloud - gitweb
LU-9564 build: Add server-build for Ubuntu with Kernel 4.4.0 15/29215/12
authorMartin Schroeder <martin.h.schroeder@intel.com>
Tue, 13 Jun 2017 14:42:57 +0000 (10:42 -0400)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 6 Feb 2018 04:27:19 +0000 (04:27 +0000)
This enables compatibility with the current LTS flavours of Ubuntu.
Do note that you need the Xenial HWE Kernel for Ubuntu 14.04.5, as
that distribution originally used a 3.x series Kernel.

The patches have been developed to apply cleanly to the kernel versions
4.4.0-45.66 to 4.4.0-85.108 from the Ubuntu Xenial (and its Trusty backports).

This change also adjusts the Debian scripting to produce the
ldiskfs modules and the server utilities.

To create the server modules run "./configure" with "--enable-server"
and specify "--enable-ldiskfs" and "--with-zfs/-spl" as appropriate.

The call to "make debs" will then produce the server modules and
utils instead of their client versions.

NOTE: This contains a small hack taken from LU-9995 / #29130

Test-Parameters: trivial
Signed-off-by: Martin Schroeder <martin.h.schroeder@intel.com>
Change-Id: I02cd5e9314367ad4e1f8f3d81712f84441a8bc71
Reviewed-on: https://review.whamcloud.com/29215
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@yahoo.com>
Reviewed-by: Thomas Stibor <t.stibor@gsi.de>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
36 files changed:
autoMakefile.am
config/lustre-build-ldiskfs.m4
config/lustre-build-linux.m4
config/lustre-build-zfs.m4
config/lustre-build.m4
debian/.gitignore
debian/control
debian/control.main
debian/control.modules.in
debian/lustre-client-utils.docs [new file with mode: 0644]
debian/lustre-client-utils.examples [new file with mode: 0644]
debian/lustre-client-utils.install [new file with mode: 0644]
debian/lustre-client-utils.manpages [new file with mode: 0644]
debian/lustre-server-modules.install [new file with mode: 0644]
debian/lustre-server-utils.docs [new file with mode: 0644]
debian/lustre-server-utils.examples [new file with mode: 0644]
debian/lustre-server-utils.install [new file with mode: 0644]
debian/lustre-server-utils.manpages [new file with mode: 0644]
debian/rules
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-data-in-dirent-001.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-data-in-dirent.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-disable-mb-cache-001.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-disable-mb-cache.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-dir-001.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-dir.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-misc.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-pdirop-001.patch [new file with mode: 0644]
ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-pdirop.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-45-ubuntu14+16.series [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-49-ubuntu14+16.series [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-62-ubuntu14+16.series [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-73-ubuntu14+16.series [new file with mode: 0644]
lustre/ChangeLog
lustre/kernel_patches/series/4.4-ubuntu14+16.series [new file with mode: 0644]
lustre/kernel_patches/which_patch

index 0054f72..0311095 100644 (file)
@@ -188,12 +188,33 @@ debs: undef.h
        if [ "$$lversion" != "$$cversion" ]; then \
                echo -e "1i\nlustre ($$lversion-1) unstable; urgency=low\n\n  * Automated changelog entry update\n\n -- Brian J. Murrell <brian@interlinx.bc.ca>  $$(date -R)\n\n.\nwq" | ed debian/changelog; \
        fi; \
-       rm -rf debs
+       rm -rf debs; \
+       if test "x@ENABLE_SERVER@" = "xyes"; then \
+               DEB_BUILD_PROFILES="server"; \
+       else \
+               DEB_BUILD_PROFILES="client"; \
+       fi; \
+       if test "x@ENABLE_LDISKFS@" = "xyes"; then \
+               export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} ldiskfs"; \
+       fi; \
+       if test "x@ENABLE_ZFS@" = "xyes"; then \
+               export DEB_BUILD_PROFILES="$${DEB_BUILD_PROFILES} zfs"; \
+               export SPL_SRC="@SPL@"; \
+               export ZFS_SRC="@ZFS@"; \
+               echo "ZFS support is enabled. Will expect pre-compiled SPL and ZFS sources in:"; \
+               echo "SPL: $${SPL_SRC}"; \
+               echo "ZFS: $${ZFS_SRC}"; \
+       fi; \
+       export KERNEL_CFG="$(LINUX_CONFIG)"; \
+       export KERNEL_SRC="$(LINUX)"; \
+       echo "Enabled Build Profiles: $${DEB_BUILD_PROFILES}"; \
+       echo "Kernel Source: $${KERNEL_SRC}"; \
+       echo "Kernel Config: $${KERNEL_CFG}"; \
        dpkg-buildpackage -us -uc -I.git -I\*.out[0-9]\* -I\*.swp || { \
                rc=$${PIPESTATUS[0]}; \
                [ $${rc} -gt 1 ] && exit $${rc}; \
                exit 0; \
-       }
+       }; \
        export KPKG_DEST_DIR="$$(pwd)/.." && \
        version=$$(sed -ne '1s/^lustre (\(.*\)).*$$/\1/p' debian/changelog) && \
        rm -rf debian/tmp/modules-deb && \
@@ -224,9 +245,10 @@ debs: undef.h
        mkdir -p debs && \
        mv ../linux-patch-lustre_$${VER}_all.deb ../lustre-dev_$${VER}_*.deb \
           ../lustre-source_$${VER}_all.deb ../lustre-tests_$${VER}_*.deb \
-          ../lustre-utils_$${VER}_*.deb ../lustre_$${VER}.dsc \
+          ../lustre-*-utils_$${VER}_*.deb ../lustre_$${VER}.dsc \
           ../lustre_$${VER}_*.changes ../lustre_$${VER}.tar.gz \
-          ../lustre-client-modules-$${KVERS}_$${VER}_*.deb debs/
+          ../lustre-*-modules-$${KVERS}_$${VER}_*.deb \
+          debs/
 
 if USES_DPKG
 EXTRA_DIST += debian/*
index c2d619b..d48df1d 100644 (file)
@@ -54,6 +54,24 @@ AS_IF([test x$RHEL_KERNEL = xyes], [
            [LDISKFS_SERIES="4.4-sles12sp2.series"]
        )], [LDISKFS_SERIES="4.4-sles12sp3.series"],
             [LDISKFS_SERIES="4.4-sles12sp3.series"])
+], [test x$UBUNTU_KERNEL = xyes], [
+       AS_VERSION_COMPARE([$LINUXRELEASE],[4.4.0],
+               [],
+               [
+                       KPLEV=$(echo $LINUXRELEASE | sed -n 's/.*-\([0-9]\+\).*/\1/p')
+                       AS_IF(
+                               [test -z "$KPLEV"], [
+                                       AC_MSG_WARN([Failed to determine Kernel patch level. Assume latest.])
+                                       LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"
+                               ],
+                               [test $KPLEV -ge 73], [LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"],
+                               [test $KPLEV -ge 62], [LDISKFS_SERIES="4.4.0-62-ubuntu14+16.series"],
+                               [test $KPLEV -ge 49], [LDISKFS_SERIES="4.4.0-49-ubuntu14+16.series"],
+                               [LDISKFS_SERIES="4.4.0-45-ubuntu14+16.series"]
+                       )
+               ],
+               [LDISKFS_SERIES="4.4.0-73-ubuntu14+16.series"]
+       )
 ])
 ])
 AS_IF([test -z "$LDISKFS_SERIES"],
@@ -244,6 +262,7 @@ AS_IF([test x$enable_ldiskfs != xno],[
        # set is available for the detected kernel.  For now, we just always
        # set it to "yes".
        AS_IF([test x$enable_ldiskfs = xmaybe], [enable_ldiskfs=yes])
+       AC_SUBST(ENABLE_LDISKFS, yes)
 
        LDISKFS_LINUX_SERIES
        LDISKFS_AC_PATCH_PROGRAM
@@ -260,6 +279,8 @@ AS_IF([test x$enable_ldiskfs != xno],[
        AC_DEFINE(CONFIG_LDISKFS_FS_ENCRYPTION, 1, [enable encryption for ldiskfs])
        AC_SUBST(LDISKFS_SUBDIR, ldiskfs)
        AC_DEFINE(HAVE_LDISKFS_OSD, 1, Enable ldiskfs osd)
+], [
+       AC_SUBST(ENABLE_LDISKFS, no)
 ])
 
 AC_MSG_CHECKING([whether to build ldiskfs])
index 260047c..8895e96 100644 (file)
@@ -73,46 +73,81 @@ AC_SUBST(LINUXRELEASE)
 # get the release version of linux
 #
 AC_DEFUN([LB_LINUX_RELEASE], [
-LB_LINUX_UTSRELEASE
+       LB_LINUX_UTSRELEASE
 
-# check if the kernel is one from RHEL or SUSE
-AC_CACHE_CHECK([for RedHat kernel release number], lb_cv_rhel_kernel_version, [
-lb_cv_rhel_kernel_version=""
-AS_IF([fgrep -q RHEL_RELEASE $LINUX_OBJ/include/$VERSION_HDIR/version.h], [
-       lb_cv_rhel_kernel_version=$(awk '/ RHEL_MAJOR / { print [$]3 }' \
-               $LINUX_OBJ/include/$VERSION_HDIR/version.h)$(awk \
-               '/ RHEL_MINOR / { print [$]3 }' \
-               $LINUX_OBJ/include/$VERSION_HDIR/version.h)
-])
-])
-AS_IF([test -n "$lb_cv_rhel_kernel_version"], [
-       RHEL_KERNEL="yes"
-       RHEL_RELEASE_NO=$lb_cv_rhel_kernel_version
-], [
+       # Define default states
        RHEL_KERNEL="no"
-       LB_CHECK_CONFIG([SUSE_KERNEL], [SUSE_KERNEL="yes"], [SUSE_KERNEL="no"])
-])
-
-AC_MSG_CHECKING([for Linux kernel module package directory])
-AC_ARG_WITH([kmp-moddir],
-       AC_HELP_STRING([--with-kmp-moddir=string],
-               [set the kmod updates or extra directory]),
-       [KMP_MODDIR=$withval
-        IN_KERNEL=''],[
-       AS_IF([test x$RHEL_KERNEL = xyes], [KMP_MODDIR="extra/kernel"],
-             [test x$SUSE_KERNEL = xyes], [KMP_MODDIR="updates/kernel"])
-       IN_KERNEL="${PACKAGE}"])
-AC_MSG_RESULT($KMP_MODDIR)
+       SUSE_KERNEL="no"
+       UBUNTU_KERNEL="no"
+       # And if any of the above kernels has been detected yet
+       KERNEL_FOUND="no"
+
+       # Check for RedHat first (no need to check KERNEL_FOUND
+       AC_CACHE_CHECK([for RedHat kernel release number], lb_cv_rhel_kernel_version, [
+               lb_cv_rhel_kernel_version=""
+               AS_IF([fgrep -q RHEL_RELEASE $LINUX_OBJ/include/$VERSION_HDIR/version.h], [
+                       lb_cv_rhel_kernel_version=$(awk '/ RHEL_MAJOR / { print [$]3 }' \
+                               $LINUX_OBJ/include/$VERSION_HDIR/version.h)$(awk \
+                               '/ RHEL_MINOR / { print [$]3 }' \
+                               $LINUX_OBJ/include/$VERSION_HDIR/version.h)
+               ])
+       ])
+       AS_IF([test -n "$lb_cv_rhel_kernel_version"], [
+               RHEL_KERNEL="yes"
+               KERNEL_FOUND="yes"
+               RHEL_RELEASE_NO=$lb_cv_rhel_kernel_version
+       ])
 
-moduledir="/lib/modules/${LINUXRELEASE}/${KMP_MODDIR}"
+       # Check for SuSE
+       AS_IF([test "x$KERNEL_FOUND" = "xno"], [
+               LB_CHECK_CONFIG([SUSE_KERNEL], [
+                       SUSE_KERNEL="yes"
+                       KERNEL_FOUND="yes"
+               ], [])
+       ])
 
-modulefsdir="${moduledir}/fs/${IN_KERNEL}"
-AC_SUBST(modulefsdir)
+       # Check for Ubuntu
+       AS_IF([test "x$KERNEL_FOUND" = "xno"], [
+               AC_CACHE_CHECK([for Ubuntu kernel signature], lb_cv_ubuntu_kernel_sig, [
+                       lb_cv_ubuntu_kernel_sig="no"
+                       AS_IF([fgrep -q "CONFIG_VERSION_SIGNATURE \"Ubuntu" $LINUX_OBJ/include/generated/autoconf.h], [
+                               lb_cv_ubuntu_kernel_sig="yes"
+                       ])
+               ])
+               AS_IF([test "x$lb_cv_ubuntu_kernel_sig" = "xyes"], [
+                       UBUNTU_KERNEL="yes"
+                       KERNEL_FOUND="yes"
+               ])
+       ])
 
-modulenetdir="${moduledir}/net/${IN_KERNEL}"
-AC_SUBST(modulenetdir)
+       # If still no kernel was found, a warning is issued
+       AS_IF([test "x$KERNEL_FOUND" = "xno"], [
+               AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE nor Ubuntu])
+       ])
 
-AC_SUBST(KMP_MODDIR)
+       AC_MSG_CHECKING([for Linux kernel module package directory])
+       AC_ARG_WITH([kmp-moddir],
+               AC_HELP_STRING([--with-kmp-moddir=string],
+                       [set the kmod updates or extra directory]),
+               [KMP_MODDIR=$withval
+                IN_KERNEL=''],[
+               AS_IF([test x$RHEL_KERNEL = xyes], [KMP_MODDIR="extra/kernel"],
+                         [test x$SUSE_KERNEL = xyes], [KMP_MODDIR="updates/kernel"],
+                         [test x$UBUNTU_KERNEL = xyes], [KMP_MODDIR="updates/kernel"],
+                         [AC_MSG_WARN([Kernel Distro seems to be neither RedHat, SuSE nor Ubuntu])]
+               )
+               IN_KERNEL="${PACKAGE}"])
+       AC_MSG_RESULT($KMP_MODDIR)
+
+       moduledir="/lib/modules/${LINUXRELEASE}/${KMP_MODDIR}"
+
+       modulefsdir="${moduledir}/fs/${IN_KERNEL}"
+       AC_SUBST(modulefsdir)
+
+       modulenetdir="${moduledir}/net/${IN_KERNEL}"
+       AC_SUBST(modulenetdir)
+
+       AC_SUBST(KMP_MODDIR)
 ])
 
 #
index 34f29fc..f036d46 100644 (file)
@@ -657,5 +657,10 @@ your distribution.
                ])
        ])
 
+       AS_IF([test "x$enable_zfs" = xyes], [
+               AC_SUBST(ENABLE_ZFS, yes)
+       ], [
+               AC_SUBST(ENABLE_ZFS, no)
+       ])
        AM_CONDITIONAL(ZFS_ENABLED, [test "x$enable_zfs" = xyes])
 ])
index eb9c8c9..ad772c9 100644 (file)
@@ -357,6 +357,7 @@ AM_CONDITIONAL([USES_DPKG], [test x$uses_dpkg = xyes])
 AM_CONDITIONAL([USE_QUILT], [test x$use_quilt = xyes])
 AM_CONDITIONAL([RHEL], [test x$RHEL_KERNEL = xyes])
 AM_CONDITIONAL([SUSE], [test x$SUSE_KERNEL = xyes])
+AM_CONDITIONAL([UBUNTU], [test x$UBUNTU_KERNEL = xyes])
 
 # Sanity check for PCLMULQDQ instruction availability
 # PCLMULQDQ instruction is a new instruction available beginning with
@@ -438,8 +439,12 @@ AS_IF([test x$enable_ldiskfs = xno -a x$enable_zfs = xno], [
 
 AC_MSG_CHECKING([whether to build Lustre server support])
 AC_MSG_RESULT([$enable_server])
-AS_IF([test x$enable_server = xyes],
-       [AC_DEFINE(HAVE_SERVER_SUPPORT, 1, [support server])])
+AS_IF([test x$enable_server = xyes], [
+       AC_DEFINE(HAVE_SERVER_SUPPORT, 1, [support server])
+       AC_SUBST(ENABLE_SERVER, yes)
+], [
+       AC_SUBST(ENABLE_SERVER, no)
+])
 ]) # LB_CONFIG_SERVERS
 
 #
index a27dd02..6391d78 100644 (file)
@@ -9,4 +9,5 @@
 /lustre-tests.substvars
 /lustre-utils
 /lustre-utils.substvars
+/lustre-*-utils
 /tmp
index 3ba6fe4..30f17d0 100644 (file)
@@ -42,24 +42,43 @@ Description: source for Lustre filesystem client kernel modules
  can be built for kernels 2.6.32+ with the use of module-assistant
  or make-kpkg.
 
-Package: lustre-utils
+Package: lustre-client-utils
 Section: utils
 Architecture: i386 powerpc ppc64el amd64 ia64 arm64
 Priority: optional
 Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: Userspace utilities for the Lustre filesystem
+Description: Userspace utilities for the Lustre filesystem (client)
  Lustre is a scalable, secure, robust, highly-available cluster file system.
  This release is maintained by Intel Corporation and available from
  https://wiki.hpdd.intel.com/
  .
  This package provides a number of userspace utilities for
- maintaining Lustre filesystems.
+ accessing Lustre filesystems from a client. If you need server utils,
+ use lustre-server-utils instead.
+
+Package: lustre-server-utils
+Section: utils
+Architecture: i386 powerpc ppc64el amd64 ia64
+Priority: optional
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Provides: lustre-server-utils, lustre-client-utils
+Conflicts: lustre-client-utils
+Replaces: lustre-client-utils
+Description: Userspace utilities for the Lustre filesystem (server)
+ Lustre is a scalable, secure, robust, highly-available cluster file system.
+ This release is maintained by Intel Corporation and available from
+ https://wiki.hpdd.intel.com/
+ .
+ This package provides a number of userspace utilities for
+ accessing and maintaining Lustre filesystems from a server.
+ If you only need to access the LFS cluster, install lustre-client-utils
+ instead.
 
 Package: lustre-tests
 Section: utils
 Architecture: i386 powerpc ppc64el amd64 ia64 arm64
 Priority: optional
-Depends: lustre-utils (= ${binary:Version})
+Depends: lustre-client-utils (= ${binary:Version})
 Description: Test suite for the Lustre filesystem
  Lustre is a scalable, secure, robust, highly-available cluster file system.
  This release is maintained by Intel Corporation and available from
@@ -71,7 +90,7 @@ Package: lustre-dev
 Section: libdevel
 Priority: optional
 Architecture: i386 powerpc ppc64el amd64 ia64 arm64
-Depends: lustre-utils (= ${binary:Version})
+Depends: lustre-client-utils (= ${binary:Version})
 Description: Development files for the Lustre filesystem
  Lustre is a scalable, secure, robust, highly-available cluster file system.
  This release is maintained by Intel Corporation and available from
index 3ba6fe4..30f17d0 100644 (file)
@@ -42,24 +42,43 @@ Description: source for Lustre filesystem client kernel modules
  can be built for kernels 2.6.32+ with the use of module-assistant
  or make-kpkg.
 
-Package: lustre-utils
+Package: lustre-client-utils
 Section: utils
 Architecture: i386 powerpc ppc64el amd64 ia64 arm64
 Priority: optional
 Depends: ${shlibs:Depends}, ${misc:Depends}
-Description: Userspace utilities for the Lustre filesystem
+Description: Userspace utilities for the Lustre filesystem (client)
  Lustre is a scalable, secure, robust, highly-available cluster file system.
  This release is maintained by Intel Corporation and available from
  https://wiki.hpdd.intel.com/
  .
  This package provides a number of userspace utilities for
- maintaining Lustre filesystems.
+ accessing Lustre filesystems from a client. If you need server utils,
+ use lustre-server-utils instead.
+
+Package: lustre-server-utils
+Section: utils
+Architecture: i386 powerpc ppc64el amd64 ia64
+Priority: optional
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Provides: lustre-server-utils, lustre-client-utils
+Conflicts: lustre-client-utils
+Replaces: lustre-client-utils
+Description: Userspace utilities for the Lustre filesystem (server)
+ Lustre is a scalable, secure, robust, highly-available cluster file system.
+ This release is maintained by Intel Corporation and available from
+ https://wiki.hpdd.intel.com/
+ .
+ This package provides a number of userspace utilities for
+ accessing and maintaining Lustre filesystems from a server.
+ If you only need to access the LFS cluster, install lustre-client-utils
+ instead.
 
 Package: lustre-tests
 Section: utils
 Architecture: i386 powerpc ppc64el amd64 ia64 arm64
 Priority: optional
-Depends: lustre-utils (= ${binary:Version})
+Depends: lustre-client-utils (= ${binary:Version})
 Description: Test suite for the Lustre filesystem
  Lustre is a scalable, secure, robust, highly-available cluster file system.
  This release is maintained by Intel Corporation and available from
@@ -71,7 +90,7 @@ Package: lustre-dev
 Section: libdevel
 Priority: optional
 Architecture: i386 powerpc ppc64el amd64 ia64 arm64
-Depends: lustre-utils (= ${binary:Version})
+Depends: lustre-client-utils (= ${binary:Version})
 Description: Development files for the Lustre filesystem
  Lustre is a scalable, secure, robust, highly-available cluster file system.
  This release is maintained by Intel Corporation and available from
index 84a7012..8b24a54 100644 (file)
@@ -4,7 +4,7 @@ Priority: optional
 Maintainer: Brian J. Murrell <brian.murrell@intel.com>
 Uploaders: Brian J. Murrell <brian.murrell@intel.com>
 Standards-Version: 3.7.2
-Build-Depends: debhelper (>= 5.0.0), bzip2
+Build-Depends: debhelper (>= 7.0.0), bzip2
 
 Package: lustre-client-modules-_KVERS_
 Architecture: any
@@ -16,5 +16,24 @@ Description: Lustre Linux kernel module (kernel _KVERS_)
  .
  These modules are compiled for the _KVERS_ linux kernel.
 
-
+Package: lustre-server-modules-_KVERS_
+Architecture: any
+Recommends: linux-image, lustre-utils
+Provides: lustre-server-modules, lustre-client-modules
+Conflicts: lustre-client-modules-_KVERS_
+Replaces: lustre-client-modules-_KVERS_
+Description: Lustre Server Linux kernel module (kernel _KVERS_)
+ This package contains the loadable kernel modules for the
+ ldiskfs filesystem as used by the Lustre cluster filesystem servers.
+ .
+ Do note, that you will only need this, when you're running the
+ node as a Lustre server. The package might also contain the ldiskfs
+ module for use as a backing file system, in case it was enabled.
+ In that case, you should also run a lustre-patched Kernel
+ to achieve optimum performance. If you plan to run this with ZFS
+ enabled, no kernel patches are needed at all.
+ .
+ These server modules provide and supersede the client modules, if they
+ are compiled against same Kernel version.
+ .
+ These modules are compiled for the _KVERS_ linux kernel.
\ No newline at end of file
diff --git a/debian/lustre-client-utils.docs b/debian/lustre-client-utils.docs
new file mode 100644 (file)
index 0000000..7fa4581
--- /dev/null
@@ -0,0 +1,2 @@
+lustre/BUGS
+README
diff --git a/debian/lustre-client-utils.examples b/debian/lustre-client-utils.examples
new file mode 100644 (file)
index 0000000..b16e58d
--- /dev/null
@@ -0,0 +1,2 @@
+lustre/tests/llmountcleanup.sh
+lustre/tests/llmount.sh
diff --git a/debian/lustre-client-utils.install b/debian/lustre-client-utils.install
new file mode 100644 (file)
index 0000000..4eac081
--- /dev/null
@@ -0,0 +1,6 @@
+debian/tmp/sbin/*                      sbin
+debian/tmp/usr/sbin/*                  usr/sbin
+debian/tmp/usr/bin/*                   usr/bin
+debian/tmp/usr/lib/*.a                 usr/lib
+debian/tmp/usr/lib/*.la                        usr/lib
+debian/tmp/usr/lib/*.so*               usr/lib
diff --git a/debian/lustre-client-utils.manpages b/debian/lustre-client-utils.manpages
new file mode 100644 (file)
index 0000000..7f0e0c2
--- /dev/null
@@ -0,0 +1 @@
+debian/tmp/usr/share/man/*/*
diff --git a/debian/lustre-server-modules.install b/debian/lustre-server-modules.install
new file mode 100644 (file)
index 0000000..ce66ef7
--- /dev/null
@@ -0,0 +1 @@
+debian/tmp/lib         /
diff --git a/debian/lustre-server-utils.docs b/debian/lustre-server-utils.docs
new file mode 100644 (file)
index 0000000..7fa4581
--- /dev/null
@@ -0,0 +1,2 @@
+lustre/BUGS
+README
diff --git a/debian/lustre-server-utils.examples b/debian/lustre-server-utils.examples
new file mode 100644 (file)
index 0000000..b16e58d
--- /dev/null
@@ -0,0 +1,2 @@
+lustre/tests/llmountcleanup.sh
+lustre/tests/llmount.sh
diff --git a/debian/lustre-server-utils.install b/debian/lustre-server-utils.install
new file mode 100644 (file)
index 0000000..adb5d39
--- /dev/null
@@ -0,0 +1,7 @@
+debian/tmp/sbin/*                      sbin
+debian/tmp/usr/sbin/*                  usr/sbin
+debian/tmp/usr/bin/*                   usr/bin
+debian/tmp/usr/lib/*.a                 usr/lib
+debian/tmp/usr/lib/*.la                        usr/lib
+debian/tmp/usr/lib/*.so*               usr/lib
+debian/tmp/usr/lib/lustre/*.so*                usr/lib/lustre
diff --git a/debian/lustre-server-utils.manpages b/debian/lustre-server-utils.manpages
new file mode 100644 (file)
index 0000000..7f0e0c2
--- /dev/null
@@ -0,0 +1 @@
+debian/tmp/usr/share/man/*/*
index c71560b..b32e447 100755 (executable)
@@ -13,6 +13,7 @@
 DEB_BUILD_GNU_SYSTEM = $(shell dpkg-architecture -qDEB_BUILD_GNU_SYSTEM)
 DEB_BUILD_GNU_CPU = $(shell dpkg-architecture -qDEB_BUILD_GNU_CPU)
 
+
 # whether to use module-assistant to build the kernel modules or not
 USE_MA = true
 
@@ -32,7 +33,11 @@ endif
 export DH_COMPAT=7
 
 # Module-assistant stuff
-PACKAGE=lustre-client-modules
+ifneq (,$(findstring server,$(DEB_BUILD_PROFILES)))
+       PACKAGE=lustre-server-modules
+else
+       PACKAGE=lustre-client-modules
+endif
 MA_DIR ?= /usr/share/modass
 -include $(MA_DIR)/include/generic.make
 -include $(MA_DIR)/include/common-rules.make
@@ -56,13 +61,22 @@ KSRC?=$(LINUX_OBJ)
 KSRC_TREE?=$(LINUX)
 IB_OPTIONS?=""
 
-# Packages
+# Packages provided for both client and server builds
 PATCH_PKG=linux-patch-lustre
-UTILS_PKG=lustre-utils
 TESTS_PKG=lustre-tests
 DEV_PKG=lustre-dev
 SOURCE_PKG=lustre-source
-MODS_PKG=lustre-client-modules
+SOURCE_PKG=lustre-source
+
+# Packages that are only built for server OR client builds
+# The difference is that server build contain more modules & utils
+ifneq (,$(findstring server,$(DEB_BUILD_PROFILES)))
+       UTILS_PKG=lustre-server-utils
+       MODS_PKG=lustre-server-modules
+else
+       UTILS_PKG=lustre-client-utils
+       MODS_PKG=lustre-client-modules
+endif
 
 #Build dir
 #BUILDDIR=debian/build
@@ -105,6 +119,18 @@ autogen-stamp: patch-stamp
        fi; \
        touch $@
 
+# This section configures the lustre-utilities packages.
+#
+# Both client and server builds are quite similar, as the utilities build
+# must create the modules too, to produce the "osd_<fstype>.so" libraries.
+#
+# The main difference is that the utilities build does not archive the
+# modules, whereas the modules build omits the utilities, to create two
+# neatly separated debian files.
+#
+# Note: KERNEL_SRC, KERNEL_CFG, ZFS_SRC and SPL_SRC need to be set from the
+# outside. This is done by "make debs". As such, invoking "debuild" or such
+# direct debian build tools will lead to a client-only build.
 configure: configure-stamp
 configure-stamp: autogen-stamp debian/control.main debian/control.modules.in
        dh_testdir
@@ -114,13 +140,33 @@ configure-stamp: autogen-stamp debian/control.main debian/control.modules.in
                mkdir -p $(BUILDDIR)/build $(BUILDDIR)/lustre/contrib; \
                cp build/Makefile $(BUILDDIR)/build/; \
        fi
+       # Determine flags that are different between server/client module builds
+       echo "Enabled Build Profiles: $${DEB_BUILD_PROFILES}"
+       if echo "$${DEB_BUILD_PROFILES}" | grep -q "server"; then \
+               export EXTRAFLAGS="--enable-server"; \
+       else \
+               export EXTRAFLAGS="--disable-server --disable-modules"; \
+       fi; \
+       if echo "$${DEB_BUILD_PROFILES}" | grep -q "zfs"; then \
+               export EXTRAFLAGS="$${EXTRAFLAGS} --with-zfs=$${ZFS_SRC} --with-spl=$${SPL_SRC}"; \
+       else \
+               export EXTRAFLAGS="$${EXTRAFLAGS} --without-zfs" ; \
+       fi; \
+       if echo "$${DEB_BUILD_PROFILES}" | grep -q "ldiskfs"; then \
+               export EXTRAFLAGS="$${EXTRAFLAGS} \
+                       --enable-ldiskfs --enable-quilt"; \
+       else \
+               export EXTRAFLAGS="$${EXTRAFLAGS} --disable-ldiskfs"; \
+       fi; \
+       echo "Final value of EXTRAFLAGS: $${EXTRAFLAGS}"; \
        ( cd $(BUILDDIR) && \
-        $(SRCDIR)/configure --disable-dependency-tracking \
-                  --disable-modules \
-                  --disable-snmp \
-                  --disable-client \
-                  --enable-quota \
-                  --disable-server )
+               $(SRCDIR)/configure --disable-dependency-tracking \
+                       --with-linux=$${KERNEL_SRC} \
+                       --with-linux-config=$${KERNEL_CFG} \
+                       --disable-snmp \
+                       --enable-quota \
+                       $${EXTRAFLAGS} \
+       ); \
        touch $@
 
 build-arch build-indep: build
@@ -130,7 +176,7 @@ build-stamp: patch-stamp configure-stamp
        dh_testdir
        $(MAKE) -C $(BUILDDIR) $(PMAKEARGS)
        $(MAKE) -C $(BUILDDIR) DESTDIR=$(TOP_DIR)/debian/tmp install
-       # jump our lustre-client-modules into the control file if not using m-a
+       # jump our lustre-[client|server]-modules into the control file if not using m-a
        if ! $(USE_MA); then \
                (cat debian/control.main; sed -e '1,/^$$/d' -e "s/_KVERS_/$(KVER)/g" < debian/control.modules.in) > debian/control; \
                for file in debian/*_KVERS_*; do \
@@ -271,6 +317,7 @@ binary-$(SOURCE_PKG): build-stamp
 # if only we could use m-a for this, but this stupid "compliant.list"
 # thing fouls that up
 binary-kern-mods:
+       # Build client or server modules
        mkdir -p debian/m-a_root/usr_src/modules
        ln -s ../../../../ debian/m-a_root/usr_src/modules/lustre
        m-a -t -u debian/m-a_root/ -d -v -k $(KSRC) build lustre
@@ -279,7 +326,7 @@ binary-kern-mods:
 # an alternative (to module-assistant) method of building the kernel modules
 binary-$(MODS_PKG): build-stamp
        if ! $(USE_MA); then \
-               cp debian/lustre-client-modules.install debian/lustre-client-modules-$(KVER).install ; \
+               cp debian/$(MODS_PKG).install debian/$(MODS_PKG)-$(KVER).install ; \
                dh_testdir; \
                dh_testroot; \
                dh_installdirs -p $(MODS_PKG)-$(KVER); \
@@ -296,6 +343,7 @@ binary-$(MODS_PKG): build-stamp
                dh_builddeb -p $(MODS_PKG)-$(KVER); \
        fi
 
+
 ###
 ### For module-assistant
 ###
@@ -333,19 +381,39 @@ kdist_config: prep-deb-files patch-stamp
        #-$(MAKE) -C $(KSRC) prepare scripts
        # touch files to same date, to avoid auto*
        find . -type f -print0 | xargs -0 touch -r COPYING \;
-       # Doesn't seem possible to only build modules...
+       # Determine flags that are different between server/client module builds
+       # Note: It doesn't seem possible to *only* build modules.
+       echo "Enabled Build Profiles: $${DEB_BUILD_PROFILES}"
+       if echo "$${DEB_BUILD_PROFILES}" | grep -q "server"; then \
+               export EXTRAFLAGS="--enable-server"; \
+       else \
+               export EXTRAFLAGS="--disable-server"; \
+       fi; \
+       if echo "$${DEB_BUILD_PROFILES}" | grep -q "zfs"; then \
+               export EXTRAFLAGS="$${EXTRAFLAGS} --with-zfs=$${ZFS_SRC} --with-spl=$${SPL_SRC}"; \
+       else \
+               export EXTRAFLAGS="$${EXTRAFLAGS} --without-zfs" ; \
+       fi; \
+       if echo "$${DEB_BUILD_PROFILES}" | grep -q "ldiskfs"; then \
+               export EXTRAFLAGS="$${EXTRAFLAGS} \
+                       --enable-ldiskfs --enable-quilt"; \
+       else \
+               export EXTRAFLAGS="$${EXTRAFLAGS} \
+                       --disable-ldiskfs --disable-quilt"; \
+       fi; \
+       echo "Final value of EXTRAFLAGS: $${EXTRAFLAGS}"; \
        ./configure --with-linux=$(KSRC_TREE) \
-                   --with-linux-obj=$(KSRC) \
-                   --disable-server \
-                   --disable-quilt  \
-                   --disable-dependency-tracking \
-                   --disable-doc  \
-                   --disable-iokit \
-                   --disable-snmp \
-                   --disable-tests \
-                   --enable-quota \
-                   --with-kmp-moddir=updates \
-                   $(IB_OPTIONS)
+               --with-linux-obj=$(KSRC) \
+               --disable-dependency-tracking \
+               --disable-doc  \
+               --disable-iokit \
+               --disable-snmp \
+               --disable-tests \
+               --enable-quota \
+               --with-kmp-moddir=updates \
+               $${EXTRAFLAGS} \
+               $(IB_OPTIONS)
+
 
 kdist_configure: kdist_config
 
@@ -376,7 +444,7 @@ clean:
        dpatch deapply-all -v
        -$(MAKE) distclean
        rm -rf  debian/substvars debian/*.bak debian/*~ *~  *-stamp debian/$(PATCH_PKG)
-       ls -d debian/lustre-client-modules-* | grep -v _KVERS_ | xargs rm -f || true
+       ls -d debian/lustre-*-modules-* | grep -v _KVERS_ | xargs rm -f || true
        # only remove this if the clean was not called from kdist_clean
        if [ "$$MA_SOURCE_PKG" = "" ]; then \
                rm -rf  debian/m-a_root; \
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-data-in-dirent-001.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-data-in-dirent-001.patch
new file mode 100644 (file)
index 0000000..e014bb0
--- /dev/null
@@ -0,0 +1,789 @@
+diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
+index 1d1bca7..df2a96d 100644
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -67,11 +67,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
+       const int rlen = ext4_rec_len_from_disk(de->rec_len,
+                                               dir->i_sb->s_blocksize);
+-      if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
++      if (unlikely(rlen < __EXT4_DIR_REC_LEN(1)))
+               error_msg = "rec_len is smaller than minimal";
+       else if (unlikely(rlen % 4 != 0))
+               error_msg = "rec_len % 4 != 0";
+-      else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
++      else if (unlikely(rlen < EXT4_DIR_REC_LEN(de)))
+               error_msg = "rec_len is too small for name_len";
+       else if (unlikely(((char *) de - buf) + rlen > size))
+               error_msg = "directory entry across range";
+@@ -205,7 +205,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
+                                * failure will be detected in the
+                                * dirent test below. */
+                               if (ext4_rec_len_from_disk(de->rec_len,
+-                                      sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
++                                  sb->s_blocksize) < __EXT4_DIR_REC_LEN(1))
+                                       break;
+                               i += ext4_rec_len_from_disk(de->rec_len,
+                                                           sb->s_blocksize);
+@@ -424,12 +424,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+       struct fname *fname, *new_fn;
+       struct dir_private_info *info;
+       int len;
++      int extra_data = 0;
+       info = dir_file->private_data;
+       p = &info->root.rb_node;
+       /* Create and allocate the fname structure */
+-      len = sizeof(struct fname) + ent_name->len + 1;
++      if (dirent->file_type & EXT4_DIRENT_LUFID)
++              extra_data = ext4_get_dirent_data_len(dirent);
++
++      len = sizeof(struct fname) + ent_name->len + extra_data + 1;
++
+       new_fn = kzalloc(len, GFP_KERNEL);
+       if (!new_fn)
+               return -ENOMEM;
+@@ -438,7 +443,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+       new_fn->inode = le32_to_cpu(dirent->inode);
+       new_fn->name_len = ent_name->len;
+       new_fn->file_type = dirent->file_type;
+-      memcpy(new_fn->name, ent_name->name, ent_name->len);
++      memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data);
+       new_fn->name[ent_name->len] = 0;
+       while (*p) {
+@@ -621,7 +626,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
+               if (ext4_check_dir_entry(dir, NULL, de, bh,
+                                        buf, buf_size, offset))
+                       return -EFSCORRUPTED;
+-              nlen = EXT4_DIR_REC_LEN(de->name_len);
++              nlen = EXT4_DIR_REC_LEN(de);
+               rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+               de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+               offset += rlen;
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 613538c..10a2a86 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1069,6 +1069,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_POSIX_ACL          0x08000 /* POSIX Access Control Lists */
+ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC   0x10000 /* No auto delalloc mapping */
+ #define EXT4_MOUNT_BARRIER            0x20000 /* Use block barriers */
++#define EXT4_MOUNT_DIRDATA            0x40000 /* Data in directory entries*/
+ #define EXT4_MOUNT_QUOTA              0x80000 /* Some quota option set */
+ #define EXT4_MOUNT_USRQUOTA           0x100000 /* "old" user quota */
+ #define EXT4_MOUNT_GRPQUOTA           0x200000 /* "old" group quota */
+@@ -1781,6 +1782,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,             ENCRYPT)
+                                        EXT4_FEATURE_INCOMPAT_64BIT| \
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+                                        EXT4_FEATURE_INCOMPAT_MMP | \
++                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
+                                        EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
+                                        EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+@@ -1937,6 +1939,43 @@ struct ext4_dir_entry_tail {
+ #define EXT4_FT_SYMLINK               7
+ #define EXT4_FT_MAX           8
++#define EXT4_FT_MASK          0xf
++
++#if EXT4_FT_MAX > EXT4_FT_MASK
++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
++#endif
++
++/*
++ * d_type has 4 unused bits, so it can hold four types data. these different
++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be
++ * stored, in flag order, after file-name in ext4 dirent.
++*/
++/*
++ * this flag is added to d_type if ext4 dirent has extra data after
++ * filename. this data length is variable and length is stored in first byte
++ * of data. data start after filename NUL byte.
++ * This is used by Lustre FS.
++  */
++#define EXT4_DIRENT_LUFID             0x10
++
++#define EXT4_LUFID_MAGIC    0xAD200907UL
++struct ext4_dentry_param {
++      __u32  edp_magic;       /* EXT4_LUFID_MAGIC */
++      char   edp_len;         /* size of edp_data in bytes */
++      char   edp_data[0];     /* packed array of data */
++} __packed;
++
++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb,
++                                                struct ext4_dentry_param *p)
++
++{
++      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA))
++              return NULL;
++      if (p && p->edp_magic == EXT4_LUFID_MAGIC)
++              return &p->edp_len;
++      else
++              return NULL;
++}
+ #define EXT4_FT_DIR_CSUM      0xDE
+@@ -1947,8 +1986,11 @@ struct ext4_dir_entry_tail {
+  */
+ #define EXT4_DIR_PAD                  4
+ #define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
+-#define EXT4_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT4_DIR_ROUND) & \
++#define __EXT4_DIR_REC_LEN(name_len)  (((name_len) + 8 + EXT4_DIR_ROUND) & \
+                                        ~EXT4_DIR_ROUND)
++#define EXT4_DIR_REC_LEN(de)          (__EXT4_DIR_REC_LEN((de)->name_len +\
++                                      ext4_get_dirent_data_len(de)))
++
+ #define EXT4_MAX_REC_LEN              ((1<<16)-1)
+ /*
+@@ -2407,12 +2449,12 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                            struct buffer_head *bh,
+                            void *buf, int buf_size,
+                            struct ext4_filename *fname,
+-                           struct ext4_dir_entry_2 **dest_de);
++                           struct ext4_dir_entry_2 **dest_de, int *dlen);
+ int ext4_insert_dentry(struct inode *dir,
+                      struct inode *inode,
+                      struct ext4_dir_entry_2 *de,
+                      int buf_size,
+-                     struct ext4_filename *fname);
++                     struct ext4_filename *fname, void *data);
+ static inline void ext4_update_dx_flag(struct inode *inode)
+ {
+       if (!ext4_has_feature_dir_index(inode->i_sb))
+@@ -2424,10 +2466,17 @@ static unsigned char ext4_filetype_table[] = {
+ static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
+ {
+-      if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
++      int fl_index = filetype & EXT4_FT_MASK;
++
++      if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX)
+               return DT_UNKNOWN;
+-      return ext4_filetype_table[filetype];
++      if (!test_opt(sb, DIRDATA))
++              return ext4_filetype_table[fl_index];
++
++      return (ext4_filetype_table[fl_index]) |
++              (filetype & EXT4_DIRENT_LUFID);
++
+ }
+ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
+                            void *buf, int buf_size);
+@@ -2575,6 +2624,8 @@ extern struct inode *ext4_create_inode(handle_t *handle,
+ extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
+                            struct ext4_dir_entry_2 *de_del,
+                            struct buffer_head *bh);
++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
++                             struct inode *inode, const void *, const void *);
+ extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+                               __u32 start_minor_hash, __u32 *next_hash);
+ extern int ext4_search_dir(struct buffer_head *bh,
+@@ -3292,6 +3343,28 @@ extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+ extern int ext4_resize_begin(struct super_block *sb);
+ extern void ext4_resize_end(struct super_block *sb);
++/*
++ * Compute the total directory entry data length.
++ * This includes the filename and an implicit NUL terminator (always present),
++ * and optional extensions.  Each extension has a bit set in the high 4 bits of
++ * de->file_type, and the extension length is the first byte in each entry.
++ */
++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de)
++{
++      char *len = de->name + de->name_len + 1 /* NUL terminator */;
++      int dlen = 0;
++      __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
++
++      while (extra_data_flags) {
++              if (extra_data_flags & 1) {
++                      dlen += *len + (dlen == 0);
++                      len += *len;
++              }
++              extra_data_flags >>= 1;
++      }
++      return dlen;
++}
++
+ #endif        /* __KERNEL__ */
+ #define EFSBADCRC     EBADMSG         /* Bad CRC detected */
+diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
+index d884989..53e8a88 100644
+--- a/fs/ext4/inline.c
++++ b/fs/ext4/inline.c
+@@ -1005,7 +1005,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
+       struct ext4_dir_entry_2 *de;
+       err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+-                              inline_size, fname, &de);
++                              inline_size, fname, &de, NULL);
+       if (err)
+               return err;
+@@ -1013,7 +1013,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
+       err = ext4_journal_get_write_access(handle, iloc->bh);
+       if (err)
+               return err;
+-      ext4_insert_dentry(dir, inode, de, inline_size, fname);
++      ext4_insert_dentry(dir, inode, de, inline_size, fname, NULL);
+       ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+@@ -1083,7 +1083,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+       int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+       int new_size = get_max_inline_xattr_value_size(dir, iloc);
+-      if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
++      if (new_size - old_size <= __EXT4_DIR_REC_LEN(1))
+               return -ENOSPC;
+       ret = ext4_update_inline_data(handle, dir,
+@@ -1366,7 +1366,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
+                       fake.name_len = 1;
+                       strcpy(fake.name, ".");
+                       fake.rec_len = ext4_rec_len_to_disk(
+-                                              EXT4_DIR_REC_LEN(fake.name_len),
++                                              EXT4_DIR_REC_LEN(&fake),
+                                               inline_size);
+                       ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+                       de = &fake;
+@@ -1376,7 +1376,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
+                       fake.name_len = 2;
+                       strcpy(fake.name, "..");
+                       fake.rec_len = ext4_rec_len_to_disk(
+-                                              EXT4_DIR_REC_LEN(fake.name_len),
++                                              EXT4_DIR_REC_LEN(&fake),
+                                               inline_size);
+                       ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+                       de = &fake;
+@@ -1474,8 +1474,8 @@ int ext4_read_inline_dir(struct file *file,
+        * So we will use extra_offset and extra_size to indicate them
+        * during the inline dir iteration.
+        */
+-      dotdot_offset = EXT4_DIR_REC_LEN(1);
+-      dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
++      dotdot_offset = __EXT4_DIR_REC_LEN(1);
++      dotdot_size = dotdot_offset + __EXT4_DIR_REC_LEN(2);
+       extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+       extra_size = extra_offset + inline_size;
+@@ -1510,7 +1510,7 @@ int ext4_read_inline_dir(struct file *file,
+                        * failure will be detected in the
+                        * dirent test below. */
+                       if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+-                              < EXT4_DIR_REC_LEN(1))
++                              < __EXT4_DIR_REC_LEN(1))
+                               break;
+                       i += ext4_rec_len_from_disk(de->rec_len,
+                                                   extra_size);
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index e90dd58..11bc299 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -241,7 +241,8 @@ static unsigned dx_get_count(struct dx_entry *entries);
+ static unsigned dx_get_limit(struct dx_entry *entries);
+ static void dx_set_count(struct dx_entry *entries, unsigned value);
+ static void dx_set_limit(struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
++static inline unsigned dx_root_limit(struct inode *dir,
++              struct ext4_dir_entry_2 *dot_de, unsigned infosize);
+ static unsigned dx_node_limit(struct inode *dir);
+ static struct dx_frame *dx_probe(struct ext4_filename *fname,
+                                struct inode *dir,
+@@ -504,11 +505,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
+  */
+ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de)
+ {
++      BUG_ON(de->name_len != 1);
+       /* get dotdot first */
+-      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1));
++      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
+       /* dx root info is after dotdot entry */
+-      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2));
++      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
+       return (struct dx_root_info *)de;
+ }
+@@ -553,10 +555,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
++static inline unsigned dx_root_limit(struct inode *dir,
++              struct ext4_dir_entry_2 *dot_de, unsigned infosize)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+-              EXT4_DIR_REC_LEN(2) - infosize;
++      struct ext4_dir_entry_2 *dotdot_de;
++      unsigned entry_space;
++
++      BUG_ON(dot_de->name_len != 1);
++      dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize);
++      entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(dot_de) -
++                       EXT4_DIR_REC_LEN(dotdot_de) - infosize;
+       if (ext4_has_metadata_csum(dir->i_sb))
+               entry_space -= sizeof(struct dx_tail);
+@@ -565,7 +573,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+ static inline unsigned dx_node_limit(struct inode *dir)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
++      unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0);
+       if (ext4_has_metadata_csum(dir->i_sb))
+               entry_space -= sizeof(struct dx_tail);
+@@ -674,7 +682,7 @@ static struct stats dx_show_leaf(struct inode *dir,
+                                      (unsigned) ((char *) de - base));
+ #endif
+                       }
+-                      space += EXT4_DIR_REC_LEN(de->name_len);
++                      space += EXT4_DIR_REC_LEN(de);
+                       names++;
+               }
+               de = ext4_next_entry(de, size);
+@@ -775,11 +783,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       entries = (struct dx_entry *)(((char *)info) + info->info_length);
+-      if (dx_get_limit(entries) != dx_root_limit(dir,
+-                                                 info->info_length)) {
++      if (dx_get_limit(entries) !=
++          dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data,
++                        info->info_length)) {
+               ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
+                                  dx_get_limit(entries),
+-                                 dx_root_limit(dir, info->info_length));
++                                 dx_root_limit(dir,
++                                        (struct ext4_dir_entry_2 *)frame->bh->b_data,
++                                        info->info_length));
+               goto fail;
+       }
+@@ -963,7 +974,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
+       de = (struct ext4_dir_entry_2 *) bh->b_data;
+       top = (struct ext4_dir_entry_2 *) ((char *) de +
+                                          dir->i_sb->s_blocksize -
+-                                         EXT4_DIR_REC_LEN(0));
++                                         __EXT4_DIR_REC_LEN(0));
+ #ifdef CONFIG_EXT4_FS_ENCRYPTION
+       /* Check if the directory is encrypted */
+       if (ext4_encrypted_inode(dir)) {
+@@ -1665,7 +1676,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+       while (count--) {
+               struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+                                               (from + (map->offs<<2));
+-              rec_len = EXT4_DIR_REC_LEN(de->name_len);
++              rec_len = EXT4_DIR_REC_LEN(de);
+               memcpy (to, de, rec_len);
+               ((struct ext4_dir_entry_2 *) to)->rec_len =
+                               ext4_rec_len_to_disk(rec_len, blocksize);
+@@ -1689,7 +1700,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+       while ((char*)de < base + blocksize) {
+               next = ext4_next_entry(de, blocksize);
+               if (de->inode && de->name_len) {
+-                      rec_len = EXT4_DIR_REC_LEN(de->name_len);
++                      rec_len = EXT4_DIR_REC_LEN(de);
+                       if (de > to)
+                               memmove(to, de, rec_len);
+                       to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+@@ -1820,15 +1831,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                     struct buffer_head *bh,
+                     void *buf, int buf_size,
+                     struct ext4_filename *fname,
+-                    struct ext4_dir_entry_2 **dest_de)
++                    struct ext4_dir_entry_2 **dest_de, int *dlen)
+ {
+       struct ext4_dir_entry_2 *de;
+-      unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
++      unsigned short reclen = __EXT4_DIR_REC_LEN(fname_len(fname)) +
++                                                (dlen ? *dlen : 0);
+       int nlen, rlen;
+       unsigned int offset = 0;
+       char *top;
+       int res;
++      dlen ? *dlen = 0 : 0; /* default set to 0 */
+       de = (struct ext4_dir_entry_2 *)buf;
+       top = buf + buf_size - reclen;
+       while ((char *) de <= top) {
+@@ -1845,10 +1858,26 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                       res = -EEXIST;
+                       goto return_result;
+               }
+-              nlen = EXT4_DIR_REC_LEN(de->name_len);
++              nlen = EXT4_DIR_REC_LEN(de);
+               rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+               if ((de->inode ? rlen - nlen : rlen) >= reclen)
+                       break;
++              /* Then for dotdot entries, check for the smaller space
++               * required for just the entry, no FID */
++              if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) {
++                      if ((de->inode ? rlen - nlen : rlen) >=
++                          __EXT4_DIR_REC_LEN(fname_len(fname))) {
++                              /* set dlen=1 to indicate not
++                               * enough space store fid */
++                              dlen ? *dlen = 1 : 0;
++                              break;
++                      }
++                      /* The new ".." entry must be written over the
++                       * previous ".." entry, which is the first
++                       * entry traversed by this scan. If it doesn't
++                       * fit, something is badly wrong, so -EIO. */
++                      return -EIO;
++              }
+               de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+               offset += rlen;
+       }
+@@ -1867,12 +1896,12 @@ int ext4_insert_dentry(struct inode *dir,
+                      struct inode *inode,
+                      struct ext4_dir_entry_2 *de,
+                      int buf_size,
+-                     struct ext4_filename *fname)
++                     struct ext4_filename *fname, void *data)
+ {
+       int nlen, rlen;
+-      nlen = EXT4_DIR_REC_LEN(de->name_len);
++      nlen = EXT4_DIR_REC_LEN(de);
+       rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+       if (de->inode) {
+               struct ext4_dir_entry_2 *de1 =
+@@ -1886,6 +1915,11 @@ int ext4_insert_dentry(struct inode *dir,
+       ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+       de->name_len = fname_len(fname);
+       memcpy(de->name, fname_name(fname), fname_len(fname));
++      if (data) {
++              de->name[fname_len(fname)] = 0;
++              memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
+       return 0;
+ }
+@@ -1900,18 +1934,23 @@ int ext4_insert_dentry(struct inode *dir,
+ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+                            struct inode *dir,
+                            struct inode *inode, struct ext4_dir_entry_2 *de,
+-                           struct buffer_head *bh)
++                           struct buffer_head *bh, struct dentry *dentry)
+ {
+       unsigned int    blocksize = dir->i_sb->s_blocksize;
+       int             csum_size = 0;
+-      int             err;
++      int             err, dlen = 0;
++      unsigned char   *data;
++      data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *)
++                                              dentry->d_fsdata);
+       if (ext4_has_metadata_csum(inode->i_sb))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+       if (!de) {
++              if (data)
++                      dlen = (*data) + 1;
+               err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+-                                      blocksize - csum_size, fname, &de);
++                                      blocksize - csum_size, fname, &de, &dlen);
+               if (err)
+                       return err;
+       }
+@@ -1924,7 +1963,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+       /* By now the buffer is marked for journaling. Due to crypto operations,
+        * the following function call may fail */
+-      err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
++      /* If writing the short form of "dotdot", don't add the data section */
++      if (dlen == 1)
++              data = NULL;
++      err = ext4_insert_dentry(dir, inode, de, blocksize, fname, data);
+       if (err < 0)
+               return err;
+@@ -2036,7 +2078,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+       dx_set_block(entries, 1);
+       dx_set_count(entries, 1);
+-      dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
++      dx_set_limit(entries, dx_root_limit(dir,
++                                       dot_de, sizeof(*dx_info)));
+       /* Initialize as for dx_probe */
+       fname->hinfo.hash_version = dx_info->hash_version;
+@@ -2064,7 +2107,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+               goto out_frames;
+       }
+-      retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2);
++      retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh2, dentry);
+ out_frames:
+       /*
+        * Even if the block split failed, we have to properly write
+@@ -2086,6 +2129,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
+       struct buffer_head *dir_block;
+       struct ext4_dir_entry_2 *de;
+       int len, journal = 0, err = 0;
++      int dlen = 0;
++      char *data;
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -2103,19 +2148,24 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
+       /* the first item must be "." */
+       assert(de->name_len == 1 && de->name[0] == '.');
+       len = le16_to_cpu(de->rec_len);
+-      assert(len >= EXT4_DIR_REC_LEN(1));
+-      if (len > EXT4_DIR_REC_LEN(1)) {
++      assert(len >= __EXT4_DIR_REC_LEN(1));
++      if (len > __EXT4_DIR_REC_LEN(1)) {
+               BUFFER_TRACE(dir_block, "get_write_access");
+               err = ext4_journal_get_write_access(handle, dir_block);
+               if (err)
+                       goto out_journal;
+               journal = 1;
+-              de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1));
++              de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
+       }
+-      len -= EXT4_DIR_REC_LEN(1);
+-      assert(len == 0 || len >= EXT4_DIR_REC_LEN(2));
++      len -= EXT4_DIR_REC_LEN(de);
++      data = ext4_dentry_get_data(dir->i_sb,
++                      (struct ext4_dentry_param *)dentry->d_fsdata);
++      if (data)
++              dlen = *data + 1;
++      assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen));
++
+       de = (struct ext4_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       if (!journal) {
+@@ -2129,10 +2179,15 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
+       if (len > 0)
+               de->rec_len = cpu_to_le16(len);
+       else
+-              assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2));
++              assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2));
+       de->name_len = 2;
+       strcpy(de->name, "..");
+-      ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++      if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) {
++              de->name[2] = 0;
++              memcpy(&de->name[2 + 1], data, *data);
++              ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
+ out_journal:
+       if (journal) {
+@@ -2214,7 +2269,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+                       goto out;
+               }
+               retval = add_dirent_to_buf(handle, &fname, dir, inode,
+-                                         NULL, bh);
++                                         NULL, bh, dentry);
+               if (retval != -ENOSPC)
+                       goto out;
+@@ -2242,7 +2297,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+               initialize_dirent_tail(t, blocksize);
+       }
+-      retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
++      retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh, dentry);
+ out:
+       ext4_fname_free_filename(&fname);
+       brelse(bh);
+@@ -2282,7 +2337,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+       if (err)
+               goto journal_error;
+-      err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
++      err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh, dentry);
+       if (err != -ENOSPC)
+               goto cleanup;
+@@ -2386,7 +2441,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+               err = PTR_ERR(de);
+               goto cleanup;
+       }
+-      err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
++      err = add_dirent_to_buf(handle, fname, dir, inode, de, bh, dentry);
+       goto cleanup;
+ journal_error:
+@@ -2661,37 +2716,70 @@ err_unlock_inode:
+       return err;
+ }
++struct tp_block {
++      struct inode *inode;
++      void *data1;
++      void *data2;
++};
++
+ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+                         struct ext4_dir_entry_2 *de,
+                         int blocksize, int csum_size,
+                         unsigned int parent_ino, int dotdot_real_len)
+ {
++      void *data1 = NULL, *data2 = NULL;
++      int dot_reclen = 0;
++
++      if (dotdot_real_len == 10) {
++              struct tp_block *tpb = (struct tp_block *)inode;
++              data1 = tpb->data1;
++              data2 = tpb->data2;
++              inode = tpb->inode;
++              dotdot_real_len = 0;
++      }
+       de->inode = cpu_to_le32(inode->i_ino);
+       de->name_len = 1;
+-      de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+-                                         blocksize);
+       strcpy(de->name, ".");
+       ext4_set_de_type(inode->i_sb, de, S_IFDIR);
++      /* get packed fid data*/
++      data1 = ext4_dentry_get_data(inode->i_sb,
++                              (struct ext4_dentry_param *) data1);
++      if (data1) {
++              de->name[1] = 0;
++              memcpy(&de->name[2], data1, *(char *) data1);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
++      de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
++      dot_reclen = cpu_to_le16(de->rec_len);
+       de = ext4_next_entry(de, blocksize);
+       de->inode = cpu_to_le32(parent_ino);
+       de->name_len = 2;
++      strcpy(de->name, "..");
++      ext4_set_de_type(inode->i_sb, de, S_IFDIR);
++      data2 = ext4_dentry_get_data(inode->i_sb,
++                      (struct ext4_dentry_param *) data2);
++      if (data2) {
++              de->name[2] = 0;
++              memcpy(&de->name[3], data2, *(char *) data2);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
+       if (!dotdot_real_len)
+               de->rec_len = ext4_rec_len_to_disk(blocksize -
+-                                      (csum_size + EXT4_DIR_REC_LEN(1)),
++                                      (csum_size + dot_reclen),
+                                       blocksize);
+       else
+               de->rec_len = ext4_rec_len_to_disk(
+-                              EXT4_DIR_REC_LEN(de->name_len), blocksize);
+-      strcpy(de->name, "..");
+-      ext4_set_de_type(inode->i_sb, de, S_IFDIR);
++                              EXT4_DIR_REC_LEN(de), blocksize);
+       return ext4_next_entry(de, blocksize);
+ }
+ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+-                           struct inode *inode)
++                           struct inode *inode,
++                           const void *data1, const void *data2)
+ {
++      struct tp_block param;
+       struct buffer_head *dir_block = NULL;
+       struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
+@@ -2716,7 +2804,11 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+       if (IS_ERR(dir_block))
+               return PTR_ERR(dir_block);
+       de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+-      ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
++      param.inode = inode;
++      param.data1 = (void *)data1;
++      param.data2 = (void *)data2;
++      ext4_init_dot_dotdot((struct inode *)(&param), de, blocksize,
++                           csum_size, dir->i_ino, 10);
+       set_nlink(inode, 2);
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+@@ -2733,6 +2825,29 @@ out:
+       return err;
+ }
++/* Initialize @inode as a subdirectory of @dir, and add the
++ * "." and ".." entries into the first directory block. */
++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
++                      struct inode *inode,
++                      const void *data1, const void *data2)
++{
++      int rc;
++
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      if (IS_DIRSYNC(dir))
++              ext4_handle_sync(handle);
++
++      inode->i_op = &ext4_dir_inode_operations;
++      inode->i_fop = &ext4_dir_operations;
++      rc = ext4_init_new_dir(handle, dir, inode, data1, data2);
++      if (!rc)
++              rc = ext4_mark_inode_dirty(handle, inode);
++      return rc;
++}
++EXPORT_SYMBOL(ext4_add_dot_dotdot);
++
+ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ {
+       handle_t *handle;
+@@ -2759,7 +2874,7 @@ retry:
+       inode->i_op = &ext4_dir_inode_operations;
+       inode->i_fop = &ext4_dir_operations;
+-      err = ext4_init_new_dir(handle, dir, inode);
++      err = ext4_init_new_dir(handle, dir, inode, NULL, NULL);
+       if (err)
+               goto out_clear_inode;
+       err = ext4_mark_inode_dirty(handle, inode);
+@@ -2811,7 +2926,7 @@ int ext4_empty_dir(struct inode *inode)
+       }
+       sb = inode->i_sb;
+-      if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
++      if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2)) {
+               EXT4_ERROR_INODE(inode, "invalid size");
+               return 1;
+       }
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 7715539..f48b36e 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1159,7 +1159,7 @@ enum {
+       Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+       Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+       Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+-      Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
++      Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata,
+       Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
+       Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+       Opt_lazytime, Opt_nolazytime,
+@@ -1230,6 +1230,7 @@ static const match_table_t tokens = {
+       {Opt_lazytime, "lazytime"},
+       {Opt_nolazytime, "nolazytime"},
+       {Opt_nodelalloc, "nodelalloc"},
++      {Opt_dirdata, "dirdata"},
+       {Opt_removed, "mblk_io_submit"},
+       {Opt_removed, "nomblk_io_submit"},
+       {Opt_block_validity, "block_validity"},
+@@ -1444,6 +1445,7 @@ static const struct mount_opts {
+       {Opt_usrjquota, 0, MOPT_Q},
+       {Opt_grpjquota, 0, MOPT_Q},
+       {Opt_offusrjquota, 0, MOPT_Q},
++      {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET},
+       {Opt_offgrpjquota, 0, MOPT_Q},
+       {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+       {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-data-in-dirent.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-data-in-dirent.patch
new file mode 100644 (file)
index 0000000..a04f784
--- /dev/null
@@ -0,0 +1,789 @@
+diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
+index 1d1bca7..df2a96d 100644
+--- a/fs/ext4/dir.c
++++ b/fs/ext4/dir.c
+@@ -67,11 +67,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
+       const int rlen = ext4_rec_len_from_disk(de->rec_len,
+                                               dir->i_sb->s_blocksize);
+-      if (unlikely(rlen < EXT4_DIR_REC_LEN(1)))
++      if (unlikely(rlen < __EXT4_DIR_REC_LEN(1)))
+               error_msg = "rec_len is smaller than minimal";
+       else if (unlikely(rlen % 4 != 0))
+               error_msg = "rec_len % 4 != 0";
+-      else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
++      else if (unlikely(rlen < EXT4_DIR_REC_LEN(de)))
+               error_msg = "rec_len is too small for name_len";
+       else if (unlikely(((char *) de - buf) + rlen > size))
+               error_msg = "directory entry across range";
+@@ -205,7 +205,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
+                                * failure will be detected in the
+                                * dirent test below. */
+                               if (ext4_rec_len_from_disk(de->rec_len,
+-                                      sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
++                                  sb->s_blocksize) < __EXT4_DIR_REC_LEN(1))
+                                       break;
+                               i += ext4_rec_len_from_disk(de->rec_len,
+                                                           sb->s_blocksize);
+@@ -424,12 +424,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+       struct fname *fname, *new_fn;
+       struct dir_private_info *info;
+       int len;
++      int extra_data = 0;
+       info = dir_file->private_data;
+       p = &info->root.rb_node;
+       /* Create and allocate the fname structure */
+-      len = sizeof(struct fname) + ent_name->len + 1;
++      if (dirent->file_type & EXT4_DIRENT_LUFID)
++              extra_data = ext4_get_dirent_data_len(dirent);
++
++      len = sizeof(struct fname) + ent_name->len + extra_data + 1;
++
+       new_fn = kzalloc(len, GFP_KERNEL);
+       if (!new_fn)
+               return -ENOMEM;
+@@ -438,7 +443,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
+       new_fn->inode = le32_to_cpu(dirent->inode);
+       new_fn->name_len = ent_name->len;
+       new_fn->file_type = dirent->file_type;
+-      memcpy(new_fn->name, ent_name->name, ent_name->len);
++      memcpy(new_fn->name, ent_name->name, ent_name->len + extra_data);
+       new_fn->name[ent_name->len] = 0;
+       while (*p) {
+@@ -621,7 +626,7 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
+               if (ext4_check_dir_entry(dir, NULL, de, bh,
+                                        buf, buf_size, offset))
+                       return -EFSCORRUPTED;
+-              nlen = EXT4_DIR_REC_LEN(de->name_len);
++              nlen = EXT4_DIR_REC_LEN(de);
+               rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+               de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+               offset += rlen;
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 613538c..10a2a86 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1069,6 +1069,7 @@ struct ext4_inode_info {
+ #define EXT4_MOUNT_POSIX_ACL          0x08000 /* POSIX Access Control Lists */
+ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC   0x10000 /* No auto delalloc mapping */
+ #define EXT4_MOUNT_BARRIER            0x20000 /* Use block barriers */
++#define EXT4_MOUNT_DIRDATA            0x40000 /* Data in directory entries*/
+ #define EXT4_MOUNT_QUOTA              0x80000 /* Some quota option set */
+ #define EXT4_MOUNT_USRQUOTA           0x100000 /* "old" user quota */
+ #define EXT4_MOUNT_GRPQUOTA           0x200000 /* "old" group quota */
+@@ -1781,6 +1782,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,             ENCRYPT)
+                                        EXT4_FEATURE_INCOMPAT_64BIT| \
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
+                                        EXT4_FEATURE_INCOMPAT_MMP | \
++                                       EXT4_FEATURE_INCOMPAT_DIRDATA| \
+                                        EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
+                                        EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+@@ -1937,6 +1939,43 @@ struct ext4_dir_entry_tail {
+ #define EXT4_FT_SYMLINK               7
+ #define EXT4_FT_MAX           8
++#define EXT4_FT_MASK          0xf
++
++#if EXT4_FT_MAX > EXT4_FT_MASK
++#error "conflicting EXT4_FT_MAX and EXT4_FT_MASK"
++#endif
++
++/*
++ * d_type has 4 unused bits, so it can hold four types data. these different
++ * type of data (e.g. lustre data, high 32 bits of 64-bit inode number) can be
++ * stored, in flag order, after file-name in ext4 dirent.
++*/
++/*
++ * this flag is added to d_type if ext4 dirent has extra data after
++ * filename. this data length is variable and length is stored in first byte
++ * of data. data start after filename NUL byte.
++ * This is used by Lustre FS.
++  */
++#define EXT4_DIRENT_LUFID             0x10
++
++#define EXT4_LUFID_MAGIC    0xAD200907UL
++struct ext4_dentry_param {
++      __u32  edp_magic;       /* EXT4_LUFID_MAGIC */
++      char   edp_len;         /* size of edp_data in bytes */
++      char   edp_data[0];     /* packed array of data */
++} __packed;
++
++static inline unsigned char *ext4_dentry_get_data(struct super_block *sb,
++                                                struct ext4_dentry_param *p)
++
++{
++      if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_DIRDATA))
++              return NULL;
++      if (p && p->edp_magic == EXT4_LUFID_MAGIC)
++              return &p->edp_len;
++      else
++              return NULL;
++}
+ #define EXT4_FT_DIR_CSUM      0xDE
+@@ -1947,8 +1986,11 @@ struct ext4_dir_entry_tail {
+  */
+ #define EXT4_DIR_PAD                  4
+ #define EXT4_DIR_ROUND                        (EXT4_DIR_PAD - 1)
+-#define EXT4_DIR_REC_LEN(name_len)    (((name_len) + 8 + EXT4_DIR_ROUND) & \
++#define __EXT4_DIR_REC_LEN(name_len)  (((name_len) + 8 + EXT4_DIR_ROUND) & \
+                                        ~EXT4_DIR_ROUND)
++#define EXT4_DIR_REC_LEN(de)          (__EXT4_DIR_REC_LEN((de)->name_len +\
++                                      ext4_get_dirent_data_len(de)))
++
+ #define EXT4_MAX_REC_LEN              ((1<<16)-1)
+ /*
+@@ -2407,12 +2449,12 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                            struct buffer_head *bh,
+                            void *buf, int buf_size,
+                            struct ext4_filename *fname,
+-                           struct ext4_dir_entry_2 **dest_de);
++                           struct ext4_dir_entry_2 **dest_de, int *dlen);
+ int ext4_insert_dentry(struct inode *dir,
+                      struct inode *inode,
+                      struct ext4_dir_entry_2 *de,
+                      int buf_size,
+-                     struct ext4_filename *fname);
++                     struct ext4_filename *fname, void *data);
+ static inline void ext4_update_dx_flag(struct inode *inode)
+ {
+       if (!ext4_has_feature_dir_index(inode->i_sb))
+@@ -2424,10 +2466,17 @@ static unsigned char ext4_filetype_table[] = {
+ static inline  unsigned char get_dtype(struct super_block *sb, int filetype)
+ {
+-      if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
++      int fl_index = filetype & EXT4_FT_MASK;
++
++      if (!ext4_has_feature_filetype(sb) || fl_index >= EXT4_FT_MAX)
+               return DT_UNKNOWN;
+-      return ext4_filetype_table[filetype];
++      if (!test_opt(sb, DIRDATA))
++              return ext4_filetype_table[fl_index];
++
++      return (ext4_filetype_table[fl_index]) |
++              (filetype & EXT4_DIRENT_LUFID);
++
+ }
+ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
+                            void *buf, int buf_size);
+@@ -2575,6 +2624,8 @@ extern struct inode *ext4_create_inode(handle_t *handle,
+ extern int ext4_delete_entry(handle_t *handle, struct inode * dir,
+                            struct ext4_dir_entry_2 *de_del,
+                            struct buffer_head *bh);
++extern int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
++                             struct inode *inode, const void *, const void *);
+ extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+                               __u32 start_minor_hash, __u32 *next_hash);
+ extern int ext4_search_dir(struct buffer_head *bh,
+@@ -3292,6 +3343,28 @@ extern struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
+ extern int ext4_resize_begin(struct super_block *sb);
+ extern void ext4_resize_end(struct super_block *sb);
++/*
++ * Compute the total directory entry data length.
++ * This includes the filename and an implicit NUL terminator (always present),
++ * and optional extensions.  Each extension has a bit set in the high 4 bits of
++ * de->file_type, and the extension length is the first byte in each entry.
++ */
++static inline int ext4_get_dirent_data_len(struct ext4_dir_entry_2 *de)
++{
++      char *len = de->name + de->name_len + 1 /* NUL terminator */;
++      int dlen = 0;
++      __u8 extra_data_flags = (de->file_type & ~EXT4_FT_MASK) >> 4;
++
++      while (extra_data_flags) {
++              if (extra_data_flags & 1) {
++                      dlen += *len + (dlen == 0);
++                      len += *len;
++              }
++              extra_data_flags >>= 1;
++      }
++      return dlen;
++}
++
+ #endif        /* __KERNEL__ */
+ #define EFSBADCRC     EBADMSG         /* Bad CRC detected */
+diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
+index d884989..53e8a88 100644
+--- a/fs/ext4/inline.c
++++ b/fs/ext4/inline.c
+@@ -1005,7 +1005,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
+       struct ext4_dir_entry_2 *de;
+       err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start,
+-                              inline_size, fname, &de);
++                              inline_size, fname, &de, NULL);
+       if (err)
+               return err;
+@@ -1013,7 +1013,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle,
+       err = ext4_journal_get_write_access(handle, iloc->bh);
+       if (err)
+               return err;
+-      ext4_insert_dentry(dir, inode, de, inline_size, fname);
++      ext4_insert_dentry(dir, inode, de, inline_size, fname, NULL);
+       ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size);
+@@ -1083,7 +1083,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir,
+       int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE;
+       int new_size = get_max_inline_xattr_value_size(dir, iloc);
+-      if (new_size - old_size <= EXT4_DIR_REC_LEN(1))
++      if (new_size - old_size <= __EXT4_DIR_REC_LEN(1))
+               return -ENOSPC;
+       ret = ext4_update_inline_data(handle, dir,
+@@ -1366,7 +1366,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
+                       fake.name_len = 1;
+                       strcpy(fake.name, ".");
+                       fake.rec_len = ext4_rec_len_to_disk(
+-                                              EXT4_DIR_REC_LEN(fake.name_len),
++                                              EXT4_DIR_REC_LEN(&fake),
+                                               inline_size);
+                       ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+                       de = &fake;
+@@ -1376,7 +1376,7 @@ int htree_inlinedir_to_tree(struct file *dir_file,
+                       fake.name_len = 2;
+                       strcpy(fake.name, "..");
+                       fake.rec_len = ext4_rec_len_to_disk(
+-                                              EXT4_DIR_REC_LEN(fake.name_len),
++                                              EXT4_DIR_REC_LEN(&fake),
+                                               inline_size);
+                       ext4_set_de_type(inode->i_sb, &fake, S_IFDIR);
+                       de = &fake;
+@@ -1474,8 +1474,8 @@ int ext4_read_inline_dir(struct file *file,
+        * So we will use extra_offset and extra_size to indicate them
+        * during the inline dir iteration.
+        */
+-      dotdot_offset = EXT4_DIR_REC_LEN(1);
+-      dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2);
++      dotdot_offset = __EXT4_DIR_REC_LEN(1);
++      dotdot_size = dotdot_offset + __EXT4_DIR_REC_LEN(2);
+       extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
+       extra_size = extra_offset + inline_size;
+@@ -1510,7 +1510,7 @@ int ext4_read_inline_dir(struct file *file,
+                        * failure will be detected in the
+                        * dirent test below. */
+                       if (ext4_rec_len_from_disk(de->rec_len, extra_size)
+-                              < EXT4_DIR_REC_LEN(1))
++                              < __EXT4_DIR_REC_LEN(1))
+                               break;
+                       i += ext4_rec_len_from_disk(de->rec_len,
+                                                   extra_size);
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 73d73fb..f6465b6 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -241,7 +241,8 @@ static unsigned dx_get_count(struct dx_entry *entries);
+ static unsigned dx_get_limit(struct dx_entry *entries);
+ static void dx_set_count(struct dx_entry *entries, unsigned value);
+ static void dx_set_limit(struct dx_entry *entries, unsigned value);
+-static unsigned dx_root_limit(struct inode *dir, unsigned infosize);
++static inline unsigned dx_root_limit(struct inode *dir,
++              struct ext4_dir_entry_2 *dot_de, unsigned infosize);
+ static unsigned dx_node_limit(struct inode *dir);
+ static struct dx_frame *dx_probe(struct ext4_filename *fname,
+                                struct inode *dir,
+@@ -504,11 +505,12 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
+  */
+ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de)
+ {
++      BUG_ON(de->name_len != 1);
+       /* get dotdot first */
+-      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(1));
++      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
+       /* dx root info is after dotdot entry */
+-      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(2));
++      de = (struct ext4_dir_entry_2 *)((char *)de + EXT4_DIR_REC_LEN(de));
+       return (struct dx_root_info *)de;
+ }
+@@ -553,10 +555,16 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value)
+       ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value);
+ }
+-static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
++static inline unsigned dx_root_limit(struct inode *dir,
++              struct ext4_dir_entry_2 *dot_de, unsigned infosize)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
+-              EXT4_DIR_REC_LEN(2) - infosize;
++      struct ext4_dir_entry_2 *dotdot_de;
++      unsigned entry_space;
++
++      BUG_ON(dot_de->name_len != 1);
++      dotdot_de = ext4_next_entry(dot_de, dir->i_sb->s_blocksize);
++      entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(dot_de) -
++                       EXT4_DIR_REC_LEN(dotdot_de) - infosize;
+       if (ext4_has_metadata_csum(dir->i_sb))
+               entry_space -= sizeof(struct dx_tail);
+@@ -565,7 +573,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
+ static inline unsigned dx_node_limit(struct inode *dir)
+ {
+-      unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
++      unsigned entry_space = dir->i_sb->s_blocksize - __EXT4_DIR_REC_LEN(0);
+       if (ext4_has_metadata_csum(dir->i_sb))
+               entry_space -= sizeof(struct dx_tail);
+@@ -674,7 +682,7 @@ static struct stats dx_show_leaf(struct inode *dir,
+                                      (unsigned) ((char *) de - base));
+ #endif
+                       }
+-                      space += EXT4_DIR_REC_LEN(de->name_len);
++                      space += EXT4_DIR_REC_LEN(de);
+                       names++;
+               }
+               de = ext4_next_entry(de, size);
+@@ -775,11 +783,14 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       entries = (struct dx_entry *)(((char *)info) + info->info_length);
+-      if (dx_get_limit(entries) != dx_root_limit(dir,
+-                                                 info->info_length)) {
++      if (dx_get_limit(entries) !=
++          dx_root_limit(dir, (struct ext4_dir_entry_2 *)frame->bh->b_data,
++                        info->info_length)) {
+               ext4_warning_inode(dir, "dx entry: limit %u != root limit %u",
+                                  dx_get_limit(entries),
+-                                 dx_root_limit(dir, info->info_length));
++                                 dx_root_limit(dir,
++                                        (struct ext4_dir_entry_2 *)frame->bh->b_data,
++                                        info->info_length));
+               goto fail;
+       }
+@@ -963,7 +974,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
+       de = (struct ext4_dir_entry_2 *) bh->b_data;
+       top = (struct ext4_dir_entry_2 *) ((char *) de +
+                                          dir->i_sb->s_blocksize -
+-                                         EXT4_DIR_REC_LEN(0));
++                                         __EXT4_DIR_REC_LEN(0));
+ #ifdef CONFIG_EXT4_FS_ENCRYPTION
+       /* Check if the directory is encrypted */
+       if (ext4_encrypted_inode(dir)) {
+@@ -1665,7 +1676,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+       while (count--) {
+               struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
+                                               (from + (map->offs<<2));
+-              rec_len = EXT4_DIR_REC_LEN(de->name_len);
++              rec_len = EXT4_DIR_REC_LEN(de);
+               memcpy (to, de, rec_len);
+               ((struct ext4_dir_entry_2 *) to)->rec_len =
+                               ext4_rec_len_to_disk(rec_len, blocksize);
+@@ -1689,7 +1700,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+       while ((char*)de < base + blocksize) {
+               next = ext4_next_entry(de, blocksize);
+               if (de->inode && de->name_len) {
+-                      rec_len = EXT4_DIR_REC_LEN(de->name_len);
++                      rec_len = EXT4_DIR_REC_LEN(de);
+                       if (de > to)
+                               memmove(to, de, rec_len);
+                       to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
+@@ -1820,15 +1831,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                     struct buffer_head *bh,
+                     void *buf, int buf_size,
+                     struct ext4_filename *fname,
+-                    struct ext4_dir_entry_2 **dest_de)
++                    struct ext4_dir_entry_2 **dest_de, int *dlen)
+ {
+       struct ext4_dir_entry_2 *de;
+-      unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname));
++      unsigned short reclen = __EXT4_DIR_REC_LEN(fname_len(fname)) +
++                                                (dlen ? *dlen : 0);
+       int nlen, rlen;
+       unsigned int offset = 0;
+       char *top;
+       int res;
++      dlen ? *dlen = 0 : 0; /* default set to 0 */
+       de = (struct ext4_dir_entry_2 *)buf;
+       top = buf + buf_size - reclen;
+       while ((char *) de <= top) {
+@@ -1845,10 +1858,26 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
+                       res = -EEXIST;
+                       goto return_result;
+               }
+-              nlen = EXT4_DIR_REC_LEN(de->name_len);
++              nlen = EXT4_DIR_REC_LEN(de);
+               rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+               if ((de->inode ? rlen - nlen : rlen) >= reclen)
+                       break;
++              /* Then for dotdot entries, check for the smaller space
++               * required for just the entry, no FID */
++              if (fname_len(fname) == 2 && memcmp(fname_name(fname), "..", 2) == 0) {
++                      if ((de->inode ? rlen - nlen : rlen) >=
++                          __EXT4_DIR_REC_LEN(fname_len(fname))) {
++                              /* set dlen=1 to indicate not
++                               * enough space store fid */
++                              dlen ? *dlen = 1 : 0;
++                              break;
++                      }
++                      /* The new ".." entry must be written over the
++                       * previous ".." entry, which is the first
++                       * entry traversed by this scan. If it doesn't
++                       * fit, something is badly wrong, so -EIO. */
++                      return -EIO;
++              }
+               de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
+               offset += rlen;
+       }
+@@ -1867,12 +1896,12 @@ int ext4_insert_dentry(struct inode *dir,
+                      struct inode *inode,
+                      struct ext4_dir_entry_2 *de,
+                      int buf_size,
+-                     struct ext4_filename *fname)
++                     struct ext4_filename *fname, void *data)
+ {
+       int nlen, rlen;
+-      nlen = EXT4_DIR_REC_LEN(de->name_len);
++      nlen = EXT4_DIR_REC_LEN(de);
+       rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
+       if (de->inode) {
+               struct ext4_dir_entry_2 *de1 =
+@@ -1886,6 +1915,11 @@ int ext4_insert_dentry(struct inode *dir,
+       ext4_set_de_type(inode->i_sb, de, inode->i_mode);
+       de->name_len = fname_len(fname);
+       memcpy(de->name, fname_name(fname), fname_len(fname));
++      if (data) {
++              de->name[fname_len(fname)] = 0;
++              memcpy(&de->name[fname_len(fname) + 1], data, *(char *)data);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
+       return 0;
+ }
+@@ -1900,18 +1934,23 @@ int ext4_insert_dentry(struct inode *dir,
+ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+                            struct inode *dir,
+                            struct inode *inode, struct ext4_dir_entry_2 *de,
+-                           struct buffer_head *bh)
++                           struct buffer_head *bh, struct dentry *dentry)
+ {
+       unsigned int    blocksize = dir->i_sb->s_blocksize;
+       int             csum_size = 0;
+-      int             err;
++      int             err, dlen = 0;
++      unsigned char   *data;
++      data = ext4_dentry_get_data(inode->i_sb, (struct ext4_dentry_param *)
++                                              dentry->d_fsdata);
+       if (ext4_has_metadata_csum(inode->i_sb))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+       if (!de) {
++              if (data)
++                      dlen = (*data) + 1;
+               err = ext4_find_dest_de(dir, inode, bh, bh->b_data,
+-                                      blocksize - csum_size, fname, &de);
++                                      blocksize - csum_size, fname, &de, &dlen);
+               if (err)
+                       return err;
+       }
+@@ -1924,7 +1963,10 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
+       /* By now the buffer is marked for journaling. Due to crypto operations,
+        * the following function call may fail */
+-      err = ext4_insert_dentry(dir, inode, de, blocksize, fname);
++      /* If writing the short form of "dotdot", don't add the data section */
++      if (dlen == 1)
++              data = NULL;
++      err = ext4_insert_dentry(dir, inode, de, blocksize, fname, data);
+       if (err < 0)
+               return err;
+@@ -2036,7 +2078,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+       dx_set_block(entries, 1);
+       dx_set_count(entries, 1);
+-      dx_set_limit(entries, dx_root_limit(dir, sizeof(*dx_info)));
++      dx_set_limit(entries, dx_root_limit(dir,
++                                       dot_de, sizeof(*dx_info)));
+       /* Initialize as for dx_probe */
+       fname->hinfo.hash_version = dx_info->hash_version;
+@@ -2066,7 +2109,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+       }
+       dx_release(frames);
+-      retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
++      retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh, dentry);
+       brelse(bh);
+       return retval;
+ out_frames:
+@@ -2088,6 +2131,8 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
+       struct buffer_head *dir_block;
+       struct ext4_dir_entry_2 *de;
+       int len, journal = 0, err = 0;
++      int dlen = 0;
++      char *data;
+       if (IS_ERR(handle))
+               return PTR_ERR(handle);
+@@ -2105,19 +2150,24 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
+       /* the first item must be "." */
+       assert(de->name_len == 1 && de->name[0] == '.');
+       len = le16_to_cpu(de->rec_len);
+-      assert(len >= EXT4_DIR_REC_LEN(1));
+-      if (len > EXT4_DIR_REC_LEN(1)) {
++      assert(len >= __EXT4_DIR_REC_LEN(1));
++      if (len > __EXT4_DIR_REC_LEN(1)) {
+               BUFFER_TRACE(dir_block, "get_write_access");
+               err = ext4_journal_get_write_access(handle, dir_block);
+               if (err)
+                       goto out_journal;
+               journal = 1;
+-              de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(1));
++              de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
+       }
+-      len -= EXT4_DIR_REC_LEN(1);
+-      assert(len == 0 || len >= EXT4_DIR_REC_LEN(2));
++      len -= EXT4_DIR_REC_LEN(de);
++      data = ext4_dentry_get_data(dir->i_sb,
++                      (struct ext4_dentry_param *)dentry->d_fsdata);
++      if (data)
++              dlen = *data + 1;
++      assert(len == 0 || len >= __EXT4_DIR_REC_LEN(2 + dlen));
++
+       de = (struct ext4_dir_entry_2 *)
+                       ((char *) de + le16_to_cpu(de->rec_len));
+       if (!journal) {
+@@ -2131,10 +2181,15 @@ static int ext4_update_dotdot(handle_t *handle, struct dentry *dentry,
+       if (len > 0)
+               de->rec_len = cpu_to_le16(len);
+       else
+-              assert(le16_to_cpu(de->rec_len) >= EXT4_DIR_REC_LEN(2));
++              assert(le16_to_cpu(de->rec_len) >= __EXT4_DIR_REC_LEN(2));
+       de->name_len = 2;
+       strcpy(de->name, "..");
+-      ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++      if (data != NULL && ext4_get_dirent_data_len(de) >= dlen) {
++              de->name[2] = 0;
++              memcpy(&de->name[2 + 1], data, *data);
++              ext4_set_de_type(dir->i_sb, de, S_IFDIR);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
+ out_journal:
+       if (journal) {
+@@ -2216,7 +2271,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+                       goto out;
+               }
+               retval = add_dirent_to_buf(handle, &fname, dir, inode,
+-                                         NULL, bh);
++                                         NULL, bh, dentry);
+               if (retval != -ENOSPC)
+                       goto out;
+@@ -2244,7 +2299,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+               initialize_dirent_tail(t, blocksize);
+       }
+-      retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh);
++      retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh, dentry);
+ out:
+       ext4_fname_free_filename(&fname);
+       brelse(bh);
+@@ -2284,7 +2339,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+       if (err)
+               goto journal_error;
+-      err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh);
++      err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh, dentry);
+       if (err != -ENOSPC)
+               goto cleanup;
+@@ -2388,7 +2443,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+               err = PTR_ERR(de);
+               goto cleanup;
+       }
+-      err = add_dirent_to_buf(handle, fname, dir, inode, de, bh);
++      err = add_dirent_to_buf(handle, fname, dir, inode, de, bh, dentry);
+       goto cleanup;
+ journal_error:
+@@ -2663,37 +2718,70 @@ err_unlock_inode:
+       return err;
+ }
++struct tp_block {
++      struct inode *inode;
++      void *data1;
++      void *data2;
++};
++
+ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
+                         struct ext4_dir_entry_2 *de,
+                         int blocksize, int csum_size,
+                         unsigned int parent_ino, int dotdot_real_len)
+ {
++      void *data1 = NULL, *data2 = NULL;
++      int dot_reclen = 0;
++
++      if (dotdot_real_len == 10) {
++              struct tp_block *tpb = (struct tp_block *)inode;
++              data1 = tpb->data1;
++              data2 = tpb->data2;
++              inode = tpb->inode;
++              dotdot_real_len = 0;
++      }
+       de->inode = cpu_to_le32(inode->i_ino);
+       de->name_len = 1;
+-      de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+-                                         blocksize);
+       strcpy(de->name, ".");
+       ext4_set_de_type(inode->i_sb, de, S_IFDIR);
++      /* get packed fid data*/
++      data1 = ext4_dentry_get_data(inode->i_sb,
++                              (struct ext4_dentry_param *) data1);
++      if (data1) {
++              de->name[1] = 0;
++              memcpy(&de->name[2], data1, *(char *) data1);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
++      de->rec_len = cpu_to_le16(EXT4_DIR_REC_LEN(de));
++      dot_reclen = cpu_to_le16(de->rec_len);
+       de = ext4_next_entry(de, blocksize);
+       de->inode = cpu_to_le32(parent_ino);
+       de->name_len = 2;
++      strcpy(de->name, "..");
++      ext4_set_de_type(inode->i_sb, de, S_IFDIR);
++      data2 = ext4_dentry_get_data(inode->i_sb,
++                      (struct ext4_dentry_param *) data2);
++      if (data2) {
++              de->name[2] = 0;
++              memcpy(&de->name[3], data2, *(char *) data2);
++              de->file_type |= EXT4_DIRENT_LUFID;
++      }
+       if (!dotdot_real_len)
+               de->rec_len = ext4_rec_len_to_disk(blocksize -
+-                                      (csum_size + EXT4_DIR_REC_LEN(1)),
++                                      (csum_size + dot_reclen),
+                                       blocksize);
+       else
+               de->rec_len = ext4_rec_len_to_disk(
+-                              EXT4_DIR_REC_LEN(de->name_len), blocksize);
+-      strcpy(de->name, "..");
+-      ext4_set_de_type(inode->i_sb, de, S_IFDIR);
++                              EXT4_DIR_REC_LEN(de), blocksize);
+       return ext4_next_entry(de, blocksize);
+ }
+ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+-                           struct inode *inode)
++                           struct inode *inode,
++                           const void *data1, const void *data2)
+ {
++      struct tp_block param;
+       struct buffer_head *dir_block = NULL;
+       struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
+@@ -2718,7 +2806,11 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir,
+       if (IS_ERR(dir_block))
+               return PTR_ERR(dir_block);
+       de = (struct ext4_dir_entry_2 *)dir_block->b_data;
+-      ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0);
++      param.inode = inode;
++      param.data1 = (void *)data1;
++      param.data2 = (void *)data2;
++      ext4_init_dot_dotdot((struct inode *)(&param), de, blocksize,
++                           csum_size, dir->i_ino, 10);
+       set_nlink(inode, 2);
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+@@ -2735,6 +2827,29 @@ out:
+       return err;
+ }
++/* Initialize @inode as a subdirectory of @dir, and add the
++ * "." and ".." entries into the first directory block. */
++int ext4_add_dot_dotdot(handle_t *handle, struct inode *dir,
++                      struct inode *inode,
++                      const void *data1, const void *data2)
++{
++      int rc;
++
++      if (IS_ERR(handle))
++              return PTR_ERR(handle);
++
++      if (IS_DIRSYNC(dir))
++              ext4_handle_sync(handle);
++
++      inode->i_op = &ext4_dir_inode_operations;
++      inode->i_fop = &ext4_dir_operations;
++      rc = ext4_init_new_dir(handle, dir, inode, data1, data2);
++      if (!rc)
++              rc = ext4_mark_inode_dirty(handle, inode);
++      return rc;
++}
++EXPORT_SYMBOL(ext4_add_dot_dotdot);
++
+ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+ {
+       handle_t *handle;
+@@ -2761,7 +2876,7 @@ retry:
+       inode->i_op = &ext4_dir_inode_operations;
+       inode->i_fop = &ext4_dir_operations;
+-      err = ext4_init_new_dir(handle, dir, inode);
++      err = ext4_init_new_dir(handle, dir, inode, NULL, NULL);
+       if (err)
+               goto out_clear_inode;
+       err = ext4_mark_inode_dirty(handle, inode);
+@@ -2813,7 +2928,7 @@ int ext4_empty_dir(struct inode *inode)
+       }
+       sb = inode->i_sb;
+-      if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
++      if (inode->i_size < __EXT4_DIR_REC_LEN(1) + __EXT4_DIR_REC_LEN(2)) {
+               EXT4_ERROR_INODE(inode, "invalid size");
+               return 1;
+       }
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 7715539..f48b36e 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1159,7 +1159,7 @@ enum {
+       Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption,
+       Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
+       Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
+-      Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
++      Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, Opt_dirdata,
+       Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
+       Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+       Opt_lazytime, Opt_nolazytime,
+@@ -1230,6 +1230,7 @@ static const match_table_t tokens = {
+       {Opt_lazytime, "lazytime"},
+       {Opt_nolazytime, "nolazytime"},
+       {Opt_nodelalloc, "nodelalloc"},
++      {Opt_dirdata, "dirdata"},
+       {Opt_removed, "mblk_io_submit"},
+       {Opt_removed, "nomblk_io_submit"},
+       {Opt_block_validity, "block_validity"},
+@@ -1444,6 +1445,7 @@ static const struct mount_opts {
+       {Opt_usrjquota, 0, MOPT_Q},
+       {Opt_grpjquota, 0, MOPT_Q},
+       {Opt_offusrjquota, 0, MOPT_Q},
++      {Opt_dirdata, EXT4_MOUNT_DIRDATA, MOPT_SET},
+       {Opt_offgrpjquota, 0, MOPT_Q},
+       {Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+       {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-disable-mb-cache-001.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-disable-mb-cache-001.patch
new file mode 100644 (file)
index 0000000..758fd04
--- /dev/null
@@ -0,0 +1,131 @@
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index f256696..d7a3413 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1047,6 +1047,7 @@ struct ext4_inode_info {
+ /*
+  * Mount flags set via mount options or defaults
+  */
++#define EXT4_MOUNT_NO_MBCACHE         0x00001 /* Disable mbcache */
+ #define EXT4_MOUNT_GRPID              0x00004 /* Create files with directory's group */
+ #define EXT4_MOUNT_DEBUG              0x00008 /* Some debugging messages */
+ #define EXT4_MOUNT_ERRORS_CONT                0x00010 /* Continue on errors */
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index e286670..97e5e32 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1168,6 +1168,7 @@ enum {
+       Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+       Opt_inode_readahead_blks, Opt_journal_ioprio,
+       Opt_dioread_nolock, Opt_dioread_lock,
++      Opt_no_mbcache,
+       Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+       Opt_max_dir_size_kb, Opt_nojournal_checksum,
+ };
+@@ -1247,6 +1248,7 @@ static const match_table_t tokens = {
+       {Opt_discard, "discard"},
+       {Opt_nodiscard, "nodiscard"},
+       {Opt_init_itable, "init_itable=%u"},
++      {Opt_no_mbcache, "no_mbcache"},
+       {Opt_init_itable, "init_itable"},
+       {Opt_noinit_itable, "noinit_itable"},
+       {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+@@ -1410,6 +1412,7 @@ static const struct mount_opts {
+       {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+       {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+       {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
++      {Opt_no_mbcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+       {Opt_commit, 0, MOPT_GTE0},
+       {Opt_max_batch_time, 0, MOPT_GTE0},
+       {Opt_min_batch_time, 0, MOPT_GTE0},
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 22ba197..d7e225e 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -80,7 +80,7 @@
+ # define ea_bdebug(bh, fmt, ...)      no_printk(fmt, ##__VA_ARGS__)
+ #endif
+-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
++static void _ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
+ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+                                                struct ext4_xattr_header *,
+                                                struct mb_cache_entry **);
+@@ -397,7 +397,8 @@ bad_block:
+               error = -EFSCORRUPTED;
+               goto cleanup;
+       }
+-      ext4_xattr_cache_insert(ext4_mb_cache, bh);
++      if (!test_opt(inode->i_sb, NO_MBCACHE))
++              _ext4_xattr_cache_insert(ext4_mb_cache, bh);
+       entry = BFIRST(bh);
+       error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1,
+                                     inode);
+@@ -561,7 +562,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+               error = -EFSCORRUPTED;
+               goto cleanup;
+       }
+-      ext4_xattr_cache_insert(ext4_mb_cache, bh);
++      if (!test_opt(inode->i_sb, NO_MBCACHE))
++              _ext4_xattr_cache_insert(ext4_mb_cache, bh);
+       error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
+ cleanup:
+@@ -669,7 +671,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+       lock_buffer(bh);
+       if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+               ea_bdebug(bh, "refcount now=0; freeing");
+-              if (ce)
++              if (ce && !test_opt(inode->i_sb, NO_MBCACHE))
+                       mb_cache_entry_free(ce);
+               get_bh(bh);
+               unlock_buffer(bh);
+@@ -1086,7 +1088,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+               lock_buffer(bs->bh);
+               if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+-                      if (ce) {
++                      if (ce && !test_opt(inode->i_sb, NO_MBCACHE)) {
+                               mb_cache_entry_free(ce);
+                               ce = NULL;
+                       }
+@@ -1096,8 +1098,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+                               if (!IS_LAST_ENTRY(s->first))
+                                       ext4_xattr_rehash(header(s->base),
+                                                         s->here);
+-                              ext4_xattr_cache_insert(ext4_mb_cache,
+-                                      bs->bh);
++                              if (!test_opt(inode->i_sb, NO_MBCACHE))
++                                      _ext4_xattr_cache_insert(ext4_mb_cache,
++                                                              bs->bh);
+                       }
+                       ext4_xattr_block_csum_set(inode, bs->bh);
+                       unlock_buffer(bs->bh);
+@@ -1231,7 +1234,8 @@ getblk_failed:
+                       ext4_xattr_block_csum_set(inode, new_bh);
+                       set_buffer_uptodate(new_bh);
+                       unlock_buffer(new_bh);
+-                      ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
++                      if (!test_opt(inode->i_sb, NO_MBCACHE))
++                              _ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+                       error = ext4_handle_dirty_metadata(handle, inode,
+                                                          new_bh);
+                       if (error)
+@@ -2024,7 +2028,7 @@ ext4_xattr_put_super(struct super_block *sb)
+  * Returns 0, or a negative error number on failure.
+  */
+ static void
+-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
++_ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
+ {
+       __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+       struct mb_cache_entry *ce;
+@@ -2103,6 +2107,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
+       struct mb_cache_entry *ce;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
++      if (test_opt(inode->i_sb, NO_MBCACHE))
++              return NULL;
+       if (!header->h_hash)
+               return NULL;  /* never share */
+       ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-disable-mb-cache.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-disable-mb-cache.patch
new file mode 100644 (file)
index 0000000..bdae7ff
--- /dev/null
@@ -0,0 +1,131 @@
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 217fdcc..9abdbde 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1046,6 +1046,7 @@ struct ext4_inode_info {
+ /*
+  * Mount flags set via mount options or defaults
+  */
++#define EXT4_MOUNT_NO_MBCACHE         0x00001 /* Disable mbcache */
+ #define EXT4_MOUNT_GRPID              0x00004 /* Create files with directory's group */
+ #define EXT4_MOUNT_DEBUG              0x00008 /* Some debugging messages */
+ #define EXT4_MOUNT_ERRORS_CONT                0x00010 /* Continue on errors */
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index f48b36e..02fe65b 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -1166,6 +1166,7 @@ enum {
+       Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
+       Opt_inode_readahead_blks, Opt_journal_ioprio,
+       Opt_dioread_nolock, Opt_dioread_lock,
++      Opt_no_mbcache,
+       Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
+       Opt_max_dir_size_kb, Opt_nojournal_checksum,
+ };
+@@ -1245,6 +1246,7 @@ static const match_table_t tokens = {
+       {Opt_discard, "discard"},
+       {Opt_nodiscard, "nodiscard"},
+       {Opt_init_itable, "init_itable=%u"},
++      {Opt_no_mbcache, "no_mbcache"},
+       {Opt_init_itable, "init_itable"},
+       {Opt_noinit_itable, "noinit_itable"},
+       {Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+@@ -1408,6 +1410,7 @@ static const struct mount_opts {
+       {Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+       {Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+       {Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
++      {Opt_no_mbcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET},
+       {Opt_commit, 0, MOPT_GTE0},
+       {Opt_max_batch_time, 0, MOPT_GTE0},
+       {Opt_min_batch_time, 0, MOPT_GTE0},
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index b69145b..0e0557e 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -80,7 +80,7 @@
+ # define ea_bdebug(bh, fmt, ...)      no_printk(fmt, ##__VA_ARGS__)
+ #endif
+-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
++static void _ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
+ static struct buffer_head *ext4_xattr_cache_find(struct inode *,
+                                                struct ext4_xattr_header *,
+                                                struct mb_cache_entry **);
+@@ -402,7 +402,8 @@ bad_block:
+               error = -EFSCORRUPTED;
+               goto cleanup;
+       }
+-      ext4_xattr_cache_insert(ext4_mb_cache, bh);
++      if (!test_opt(inode->i_sb, NO_MBCACHE))
++              _ext4_xattr_cache_insert(ext4_mb_cache, bh);
+       entry = BFIRST(bh);
+       error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1,
+                                     inode);
+@@ -566,7 +567,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+               error = -EFSCORRUPTED;
+               goto cleanup;
+       }
+-      ext4_xattr_cache_insert(ext4_mb_cache, bh);
++      if (!test_opt(inode->i_sb, NO_MBCACHE))
++              _ext4_xattr_cache_insert(ext4_mb_cache, bh);
+       error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
+ cleanup:
+@@ -674,7 +676,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
+       lock_buffer(bh);
+       if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
+               ea_bdebug(bh, "refcount now=0; freeing");
+-              if (ce)
++              if (ce && !test_opt(inode->i_sb, NO_MBCACHE))
+                       mb_cache_entry_free(ce);
+               get_bh(bh);
+               unlock_buffer(bh);
+@@ -1091,7 +1093,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+               lock_buffer(bs->bh);
+               if (header(s->base)->h_refcount == cpu_to_le32(1)) {
+-                      if (ce) {
++                      if (ce && !test_opt(inode->i_sb, NO_MBCACHE)) {
+                               mb_cache_entry_free(ce);
+                               ce = NULL;
+                       }
+@@ -1101,8 +1103,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+                               if (!IS_LAST_ENTRY(s->first))
+                                       ext4_xattr_rehash(header(s->base),
+                                                         s->here);
+-                              ext4_xattr_cache_insert(ext4_mb_cache,
+-                                      bs->bh);
++                              if (!test_opt(inode->i_sb, NO_MBCACHE))
++                                      _ext4_xattr_cache_insert(ext4_mb_cache,
++                                                              bs->bh);
+                       }
+                       unlock_buffer(bs->bh);
+                       if (error == -EFSCORRUPTED)
+@@ -1233,7 +1236,8 @@ getblk_failed:
+                       memcpy(new_bh->b_data, s->base, new_bh->b_size);
+                       set_buffer_uptodate(new_bh);
+                       unlock_buffer(new_bh);
+-                      ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
++                      if (!test_opt(inode->i_sb, NO_MBCACHE))
++                              _ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
+                       error = ext4_handle_dirty_xattr_block(handle,
+                                                             inode, new_bh);
+                       if (error)
+@@ -2026,7 +2030,7 @@ ext4_xattr_put_super(struct super_block *sb)
+  * Returns 0, or a negative error number on failure.
+  */
+ static void
+-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
++_ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
+ {
+       __u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
+       struct mb_cache_entry *ce;
+@@ -2105,6 +2109,8 @@ ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
+       struct mb_cache_entry *ce;
+       struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
++      if (test_opt(inode->i_sb, NO_MBCACHE))
++              return NULL;
+       if (!header->h_hash)
+               return NULL;  /* never share */
+       ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-dir-001.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-dir-001.patch
new file mode 100644 (file)
index 0000000..a156fe2
--- /dev/null
@@ -0,0 +1,330 @@
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 7db2188..0242856 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1789,6 +1789,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,             ENCRYPT)
+                                        EXT4_FEATURE_INCOMPAT_MMP | \
+                                        EXT4_FEATURE_INCOMPAT_DIRDATA| \
+                                        EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
++                                       EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+                                        EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -2262,6 +2263,9 @@ struct mmpd_data {
+ # define NORET_TYPE   /**/
+ # define ATTRIB_NORET __attribute__((noreturn))
+ # define NORET_AND    noreturn,
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL      3
+ struct ext4_xattr_ino_array {
+       unsigned int xia_count;         /* # of used item in the array */
+@@ -2883,13 +2887,16 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
+       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++                              struct ext4_inode *raw_inode)
+ {
+-      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) ||
++          (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) &&
++          S_ISDIR(le16_to_cpu(raw_inode->i_mode))))
+               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                       le32_to_cpu(raw_inode->i_size_lo);
+-      else
+-              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++      return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
+ }
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 08c0cba..44e3ad4 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4305,7 +4305,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+       if (ext4_has_feature_64bit(sb))
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-      inode->i_size = ext4_isize(raw_inode);
++      inode->i_size = ext4_isize(sb, raw_inode);
+       if ((size = i_size_read(inode)) < 0) {
+               EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
+               ret = -EFSCORRUPTED;
+@@ -4627,7 +4627,7 @@ static int ext4_do_update_inode(handle_t *handle,
+               raw_inode->i_file_acl_high =
+                       cpu_to_le16(ei->i_file_acl >> 32);
+       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+-      if (ei->i_disksize != ext4_isize(raw_inode)) {
++      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+               ext4_isize_set(raw_inode, ei->i_disksize);
+               need_datasync = 1;
+       }
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 11bc299..2543b8f 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -517,7 +517,14 @@ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de)
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return le32_to_cpu(entry->block) & 0x0fffffff;
++}
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -746,6 +753,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
+       u32 hash;
++      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+       frame->bh = ext4_read_dirblock(dir, 0, INDEX);
+       if (IS_ERR(frame->bh))
+               return (struct dx_frame *) frame->bh;
+@@ -775,9 +783,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       }
+       indirect = info->indirect_levels;
+-      if (indirect > 1) {
+-              ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
+-                                 info->indirect_levels);
++      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++              ext4_warning_inode(dir, "htree depth: %#06x exceed max depth %u",
++                                 indirect, ext4_dir_htree_level(dir->i_sb));
++              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++                      ext4_warning(dir->i_sb, "Enable large directory "
++                                              "feature to access it");
++              }
+               goto fail;
+       }
+@@ -867,12 +879,20 @@ fail:
+ static void dx_release(struct dx_frame *frames)
+ {
++      int i;
++      struct dx_root_info *info;
++
+       if (frames[0].bh == NULL)
+               return;
+-      if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      for (i = 0, info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
++           i <= info->indirect_levels;
++           i++) {
++              if (frames[i].bh == NULL)
++                      break;
++              brelse(frames[i].bh);
++              frames[i].bh = NULL;
++      }
+ }
+ /*
+@@ -1055,7 +1075,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ {
+       struct dx_hash_info hinfo;
+       struct ext4_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct inode *dir;
+       ext4_lblk_t block;
+       int count = 0;
+@@ -1514,7 +1534,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+                       struct ext4_dir_entry_2 **res_dir)
+ {
+       struct super_block * sb = dir->i_sb;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       const struct qstr *d_name = fname->usr_fname;
+       struct buffer_head *bh;
+       ext4_lblk_t block;
+@@ -2002,7 +2022,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+ {
+       struct inode    *dir = d_inode(dentry->d_parent);
+       struct buffer_head *bh2;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries;
+       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+       struct ext4_dir_entry_tail *t;
+@@ -2312,14 +2332,17 @@ out:
+ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                            struct dentry *dentry, struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+       struct buffer_head *bh;
+       struct inode *dir = d_inode(dentry->d_parent);
+       struct super_block *sb = dir->i_sb;
+       struct ext4_dir_entry_2 *de;
++      int restart;
+       int err;
++again:
++      restart = 0;
+       frame = dx_probe(fname, dir, NULL, frames);
+       if (IS_ERR(frame))
+               return PTR_ERR(frame);
+@@ -2332,33 +2355,48 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+               goto cleanup;
+       }
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext4_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+       err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh, dentry);
+       if (err != -ENOSPC)
+               goto cleanup;
++      err = 0;
+       /* Block full, should compress but for now just split */
+       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* Need to split index? */
+       if (dx_get_count(entries) == dx_get_limit(entries)) {
+               ext4_lblk_t newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
++              int levels = frame - frames + 1;
++              unsigned icount;
++              int add_level = 1;
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext4_warning_inode(dir, "Directory index full!");
++              while (frame > frames) {
++                      if (dx_get_count((frame - 1)->entries) <
++                          dx_get_limit((frame - 1)->entries)) {
++                              add_level = 0;
++                              break;
++                      }
++                      frame--; /* split higher index block */
++                      at = frame->at;
++                      entries = frame->entries;
++                      restart = 1;
++              }
++              if (add_level && levels == ext4_dir_htree_level(sb)) {
++                      ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u",
++                                       dir->i_ino, current->comm, levels,
++                                       ext4_dir_htree_level(sb));
++                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++                              ext4_warning(sb, "Large directory feature is"
++                                               "not enabled on this "
++                                               "filesystem");
++                      }
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
++              icount = dx_get_count(entries);
+               bh2 = ext4_append(handle, dir, &newblock);
+               if (IS_ERR(bh2)) {
+                       err = PTR_ERR(bh2);
+@@ -2373,7 +2411,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+               err = ext4_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
++              if (!add_level) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
+                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -2381,7 +2419,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+                       err = ext4_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++                                                          (frame - 1)->bh);
+                       if (err)
+                               goto journal_error;
+@@ -2397,19 +2435,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                       }
+-                      dx_insert_block(frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index("node", frames[1].entries));
++                      dx_insert_block(frame - 1, hash2, newblock);
++                      dxtrace(dx_show_index("node", frame->entries));
+                       dxtrace(dx_show_index("node",
+-                             ((struct dx_node *) bh2->b_data)->entries));
++                             ((struct dx_node *)bh2->b_data)->entries));
+                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
++                      ext4_handle_dirty_dirent_node(handle, dir,
++                                                    (frame - 1)->bh);
++                      if (restart) {
++                              ext4_handle_dirty_dirent_node(handle, dir,
++                                                            frame->bh);
++                              goto cleanup;
++                      }
+               } else {
+                       struct dx_root_info *info;
+-                      dxtrace(printk(KERN_DEBUG
+-                                     "Creating second level index...\n"));
+-                      memcpy((char *) entries2, (char *) entries,
++
++                      memcpy((char *)entries2, (char *)entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+@@ -2418,21 +2462,14 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                       dx_set_block(entries + 0, newblock);
+                       info = dx_get_dx_info((struct ext4_dir_entry_2 *)
+                                             frames[0].bh->b_data);
+-                      info->indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext4_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
+-              }
+-              err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+-              if (err) {
+-                      ext4_std_error(inode->i_sb, err);
++                      info->indirect_levels += 1;
++                      dxtrace(printk(KERN_DEBUG
++                                     "Creating %d level index...\n",
++                                     info->indirect_levels));
++                      ext4_handle_dirty_dirent_node(handle, dir, frame->bh);
++                      ext4_handle_dirty_dirent_node(handle, dir, bh2);
++                      brelse(bh2);
++                      restart = 1;
+                       goto cleanup;
+               }
+       }
+@@ -2449,6 +2486,10 @@ journal_error:
+ cleanup:
+       brelse(bh);
+       dx_release(frames);
++      /* @restart is true means htree-path has been changed, we need to
++       * repeat dx_probe() to find out valid htree-path */
++      if (restart && err == 0)
++              goto again;
+       return err;
+ }
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-dir.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-dir.patch
new file mode 100644 (file)
index 0000000..59ef2ed
--- /dev/null
@@ -0,0 +1,330 @@
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 5b6ec8f..2d22f1a 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1788,6 +1788,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,             ENCRYPT)
+                                        EXT4_FEATURE_INCOMPAT_MMP | \
+                                        EXT4_FEATURE_INCOMPAT_DIRDATA| \
+                                        EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
++                                       EXT4_FEATURE_INCOMPAT_LARGEDIR | \
+                                        EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+                                        EXT4_FEATURE_INCOMPAT_CSUM_SEED)
+ #define EXT4_FEATURE_RO_COMPAT_SUPP   (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
+@@ -2261,6 +2262,9 @@ struct mmpd_data {
+ # define NORET_TYPE   /**/
+ # define ATTRIB_NORET __attribute__((noreturn))
+ # define NORET_AND    noreturn,
++/* htree levels for ext4 */
++#define EXT4_HTREE_LEVEL_COMPAT 2
++#define EXT4_HTREE_LEVEL      3
+ struct ext4_xattr_ino_array {
+       unsigned int xia_count;         /* # of used item in the array */
+@@ -2882,13 +2886,16 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
+       es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
+ }
+-static inline loff_t ext4_isize(struct ext4_inode *raw_inode)
++static inline loff_t ext4_isize(struct super_block *sb,
++                              struct ext4_inode *raw_inode)
+ {
+-      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)))
++      if (S_ISREG(le16_to_cpu(raw_inode->i_mode)) ||
++          (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) &&
++          S_ISDIR(le16_to_cpu(raw_inode->i_mode))))
+               return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
+                       le32_to_cpu(raw_inode->i_size_lo);
+-      else
+-              return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
++
++      return (loff_t)le32_to_cpu(raw_inode->i_size_lo);
+ }
+ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 3034ceb..7c24ae1 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -4304,7 +4304,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
+       if (ext4_has_feature_64bit(sb))
+               ei->i_file_acl |=
+                       ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
+-      inode->i_size = ext4_isize(raw_inode);
++      inode->i_size = ext4_isize(sb, raw_inode);
+       ei->i_disksize = inode->i_size;
+ #ifdef CONFIG_QUOTA
+       ei->i_reserved_quota = 0;
+@@ -4621,7 +4621,7 @@ static int ext4_do_update_inode(handle_t *handle,
+               raw_inode->i_file_acl_high =
+                       cpu_to_le16(ei->i_file_acl >> 32);
+       raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
+-      if (ei->i_disksize != ext4_isize(raw_inode)) {
++      if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
+               ext4_isize_set(raw_inode, ei->i_disksize);
+               need_datasync = 1;
+       }
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index f6465b6..3f70bca 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -517,7 +517,14 @@ struct dx_root_info *dx_get_dx_info(struct ext4_dir_entry_2 *de)
+ static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
+ {
+-      return le32_to_cpu(entry->block) & 0x00ffffff;
++      return le32_to_cpu(entry->block) & 0x0fffffff;
++}
++
++static inline int
++ext4_dir_htree_level(struct super_block *sb)
++{
++      return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_LARGEDIR) ?
++              EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT;
+ }
+ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
+@@ -746,6 +753,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
+       u32 hash;
++      memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
+       frame->bh = ext4_read_dirblock(dir, 0, INDEX);
+       if (IS_ERR(frame->bh))
+               return (struct dx_frame *) frame->bh;
+@@ -775,9 +783,13 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       }
+       indirect = info->indirect_levels;
+-      if (indirect > 1) {
+-              ext4_warning_inode(dir, "Unimplemented hash depth: %#06x",
+-                                 info->indirect_levels);
++      if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
++              ext4_warning_inode(dir, "htree depth: %#06x exceed max depth %u",
++                                 indirect, ext4_dir_htree_level(dir->i_sb));
++              if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
++                      ext4_warning(dir->i_sb, "Enable large directory "
++                                              "feature to access it");
++              }
+               goto fail;
+       }
+@@ -867,12 +879,20 @@ fail:
+ static void dx_release(struct dx_frame *frames)
+ {
++      int i;
++      struct dx_root_info *info;
++
+       if (frames[0].bh == NULL)
+               return;
+-      if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels)
+-              brelse(frames[1].bh);
+-      brelse(frames[0].bh);
++      for (i = 0, info = dx_get_dx_info((struct ext4_dir_entry_2 *)frames[0].bh->b_data);
++           i <= info->indirect_levels;
++           i++) {
++              if (frames[i].bh == NULL)
++                      break;
++              brelse(frames[i].bh);
++              frames[i].bh = NULL;
++      }
+ }
+ /*
+@@ -1055,7 +1075,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+ {
+       struct dx_hash_info hinfo;
+       struct ext4_dir_entry_2 *de;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct inode *dir;
+       ext4_lblk_t block;
+       int count = 0;
+@@ -1514,7 +1534,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+                       struct ext4_dir_entry_2 **res_dir)
+ {
+       struct super_block * sb = dir->i_sb;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       const struct qstr *d_name = fname->usr_fname;
+       struct buffer_head *bh;
+       ext4_lblk_t block;
+@@ -2002,7 +2022,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+ {
+       struct inode    *dir = d_inode(dentry->d_parent);
+       struct buffer_head *bh2;
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries;
+       struct ext4_dir_entry_2 *de, *de2, *dot_de, *dotdot_de;
+       struct ext4_dir_entry_tail *t;
+@@ -2314,14 +2334,17 @@ out:
+ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                            struct dentry *dentry, struct inode *inode)
+ {
+-      struct dx_frame frames[2], *frame;
++      struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+       struct buffer_head *bh;
+       struct inode *dir = d_inode(dentry->d_parent);
+       struct super_block *sb = dir->i_sb;
+       struct ext4_dir_entry_2 *de;
++      int restart;
+       int err;
++again:
++      restart = 0;
+       frame = dx_probe(fname, dir, NULL, frames);
+       if (IS_ERR(frame))
+               return PTR_ERR(frame);
+@@ -2334,33 +2357,48 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+               goto cleanup;
+       }
+-      BUFFER_TRACE(bh, "get_write_access");
+-      err = ext4_journal_get_write_access(handle, bh);
+-      if (err)
+-              goto journal_error;
+-
+       err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh, dentry);
+       if (err != -ENOSPC)
+               goto cleanup;
++      err = 0;
+       /* Block full, should compress but for now just split */
+       dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
+                      dx_get_count(entries), dx_get_limit(entries)));
+       /* Need to split index? */
+       if (dx_get_count(entries) == dx_get_limit(entries)) {
+               ext4_lblk_t newblock;
+-              unsigned icount = dx_get_count(entries);
+-              int levels = frame - frames;
++              int levels = frame - frames + 1;
++              unsigned icount;
++              int add_level = 1;
+               struct dx_entry *entries2;
+               struct dx_node *node2;
+               struct buffer_head *bh2;
+-              if (levels && (dx_get_count(frames->entries) ==
+-                             dx_get_limit(frames->entries))) {
+-                      ext4_warning_inode(dir, "Directory index full!");
++              while (frame > frames) {
++                      if (dx_get_count((frame - 1)->entries) <
++                          dx_get_limit((frame - 1)->entries)) {
++                              add_level = 0;
++                              break;
++                      }
++                      frame--; /* split higher index block */
++                      at = frame->at;
++                      entries = frame->entries;
++                      restart = 1;
++              }
++              if (add_level && levels == ext4_dir_htree_level(sb)) {
++                      ext4_warning(sb, "inode %lu: comm %s: index %u: reach max htree level %u",
++                                       dir->i_ino, current->comm, levels,
++                                       ext4_dir_htree_level(sb));
++                      if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
++                              ext4_warning(sb, "Large directory feature is"
++                                               "not enabled on this "
++                                               "filesystem");
++                      }
+                       err = -ENOSPC;
+                       goto cleanup;
+               }
++              icount = dx_get_count(entries);
+               bh2 = ext4_append(handle, dir, &newblock);
+               if (IS_ERR(bh2)) {
+                       err = PTR_ERR(bh2);
+@@ -2375,7 +2413,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+               err = ext4_journal_get_write_access(handle, frame->bh);
+               if (err)
+                       goto journal_error;
+-              if (levels) {
++              if (!add_level) {
+                       unsigned icount1 = icount/2, icount2 = icount - icount1;
+                       unsigned hash2 = dx_get_hash(entries + icount1);
+                       dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
+@@ -2383,7 +2421,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                       BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
+                       err = ext4_journal_get_write_access(handle,
+-                                                           frames[0].bh);
++                                                          (frame - 1)->bh);
+                       if (err)
+                               goto journal_error;
+@@ -2399,19 +2437,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                               frame->entries = entries = entries2;
+                               swap(frame->bh, bh2);
+                       }
+-                      dx_insert_block(frames + 0, hash2, newblock);
+-                      dxtrace(dx_show_index("node", frames[1].entries));
++                      dx_insert_block(frame - 1, hash2, newblock);
++                      dxtrace(dx_show_index("node", frame->entries));
+                       dxtrace(dx_show_index("node",
+-                             ((struct dx_node *) bh2->b_data)->entries));
++                             ((struct dx_node *)bh2->b_data)->entries));
+                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
+                       if (err)
+                               goto journal_error;
+                       brelse (bh2);
++                      ext4_handle_dirty_dirent_node(handle, dir,
++                                                    (frame - 1)->bh);
++                      if (restart) {
++                              ext4_handle_dirty_dirent_node(handle, dir,
++                                                            frame->bh);
++                              goto cleanup;
++                      }
+               } else {
+                       struct dx_root_info *info;
+-                      dxtrace(printk(KERN_DEBUG
+-                                     "Creating second level index...\n"));
+-                      memcpy((char *) entries2, (char *) entries,
++
++                      memcpy((char *)entries2, (char *)entries,
+                              icount * sizeof(struct dx_entry));
+                       dx_set_limit(entries2, dx_node_limit(dir));
+@@ -2420,21 +2464,14 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+                       dx_set_block(entries + 0, newblock);
+                       info = dx_get_dx_info((struct ext4_dir_entry_2 *)
+                                             frames[0].bh->b_data);
+-                      info->indirect_levels = 1;
+-
+-                      /* Add new access path frame */
+-                      frame = frames + 1;
+-                      frame->at = at = at - entries + entries2;
+-                      frame->entries = entries = entries2;
+-                      frame->bh = bh2;
+-                      err = ext4_journal_get_write_access(handle,
+-                                                           frame->bh);
+-                      if (err)
+-                              goto journal_error;
+-              }
+-              err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
+-              if (err) {
+-                      ext4_std_error(inode->i_sb, err);
++                      info->indirect_levels += 1;
++                      dxtrace(printk(KERN_DEBUG
++                                     "Creating %d level index...\n",
++                                     info->indirect_levels));
++                      ext4_handle_dirty_dirent_node(handle, dir, frame->bh);
++                      ext4_handle_dirty_dirent_node(handle, dir, bh2);
++                      brelse(bh2);
++                      restart = 1;
+                       goto cleanup;
+               }
+       }
+@@ -2451,6 +2488,10 @@ journal_error:
+ cleanup:
+       brelse(bh);
+       dx_release(frames);
++      /* @restart is true means htree-path has been changed, we need to
++       * repeat dx_probe() to find out valid htree-path */
++      if (restart && err == 0)
++              goto again;
+       return err;
+ }
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-large-eas.patch
new file mode 100644 (file)
index 0000000..1b848b9
--- /dev/null
@@ -0,0 +1,1081 @@
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 10a2a86..217fdcc 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1781,6 +1781,7 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,             ENCRYPT)
+                                        EXT4_FEATURE_INCOMPAT_EXTENTS| \
+                                        EXT4_FEATURE_INCOMPAT_64BIT| \
+                                        EXT4_FEATURE_INCOMPAT_FLEX_BG| \
++                                       EXT4_FEATURE_INCOMPAT_EA_INODE| \
+                                        EXT4_FEATURE_INCOMPAT_MMP | \
+                                        EXT4_FEATURE_INCOMPAT_DIRDATA| \
+                                        EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
+@@ -2241,6 +2242,12 @@ struct mmpd_data {
+ #define EXT4_MMP_MAX_CHECK_INTERVAL   300UL
+ /*
++ * Maximum size of xattr attributes for FEATURE_INCOMPAT_EA_INODE 1Mb
++ * This limit is arbitrary, but is reasonable for the xattr API.
++ */
++#define EXT4_XATTR_MAX_LARGE_EA_SIZE    (1024 * 1024)
++
++/*
+  * Function prototypes
+  */
+@@ -2252,6 +2259,10 @@ struct mmpd_data {
+ # define ATTRIB_NORET __attribute__((noreturn))
+ # define NORET_AND    noreturn,
++struct ext4_xattr_ino_array {
++      unsigned int xia_count;         /* # of used item in the array */
++      unsigned int xia_inodes[0];
++};
+ /* bitmap.c */
+ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars);
+ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+@@ -2582,6 +2593,7 @@ extern void ext4_set_inode_flags(struct inode *);
+ extern void ext4_get_inode_flags(struct ext4_inode_info *);
+ extern int ext4_alloc_da_blocks(struct inode *inode);
+ extern void ext4_set_aops(struct inode *inode);
++extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int chunk);
+ extern int ext4_writepage_trans_blocks(struct inode *);
+ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+ extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
+index 0cccda3..43ca376 100644
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -293,7 +293,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
+        * as writing the quota to disk may need the lock as well.
+        */
+       dquot_initialize(inode);
+-      ext4_xattr_delete_inode(handle, inode);
+       dquot_free_inode(inode);
+       dquot_drop(inode);
+diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
+index 53e8a88..5112c5a 100644
+--- a/fs/ext4/inline.c
++++ b/fs/ext4/inline.c
+@@ -61,7 +61,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
+       /* Compute min_offs. */
+       for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+-              if (!entry->e_value_block && entry->e_value_size) {
++              if (!entry->e_value_inum && entry->e_value_size) {
+                       size_t offs = le16_to_cpu(entry->e_value_offs);
+                       if (offs < min_offs)
+                               min_offs = offs;
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 04c5f63..3034ceb 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -139,8 +139,6 @@ static void ext4_invalidatepage(struct page *page, unsigned int offset,
+                               unsigned int length);
+ static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+ static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
+-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+-                                int pextents);
+ /*
+  * Test whether an inode is a fast symlink.
+@@ -189,6 +187,8 @@ void ext4_evict_inode(struct inode *inode)
+ {
+       handle_t *handle;
+       int err;
++      int extra_credits = 3;
++      struct ext4_xattr_ino_array *lea_ino_array = NULL;
+       trace_ext4_evict_inode(inode);
+@@ -241,8 +241,8 @@ void ext4_evict_inode(struct inode *inode)
+        * protection against it
+        */
+       sb_start_intwrite(inode->i_sb);
+-      handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
+-                                  ext4_blocks_for_truncate(inode)+3);
++
++      handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, extra_credits);
+       if (IS_ERR(handle)) {
+               ext4_std_error(inode->i_sb, PTR_ERR(handle));
+               /*
+@@ -254,9 +254,36 @@ void ext4_evict_inode(struct inode *inode)
+               sb_end_intwrite(inode->i_sb);
+               goto no_delete;
+       }
+-
+       if (IS_SYNC(inode))
+               ext4_handle_sync(handle);
++
++      /*
++       * Delete xattr inode before deleting the main inode.
++       */
++      err = ext4_xattr_delete_inode(handle, inode, &lea_ino_array);
++      if (err) {
++              ext4_warning(inode->i_sb,
++                           "couldn't delete inode's xattr (err %d)", err);
++              goto stop_handle;
++      }
++
++      if (!IS_NOQUOTA(inode))
++              extra_credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
++
++      if (!ext4_handle_has_enough_credits(handle,
++                      ext4_blocks_for_truncate(inode) + extra_credits)) {
++              err = ext4_journal_extend(handle,
++                      ext4_blocks_for_truncate(inode) + extra_credits);
++              if (err > 0)
++                      err = ext4_journal_restart(handle,
++                      ext4_blocks_for_truncate(inode) + extra_credits);
++              if (err != 0) {
++                      ext4_warning(inode->i_sb,
++                                   "couldn't extend journal (err %d)", err);
++                      goto stop_handle;
++              }
++      }
++
+       inode->i_size = 0;
+       err = ext4_mark_inode_dirty(handle, inode);
+       if (err) {
+@@ -273,10 +300,10 @@ void ext4_evict_inode(struct inode *inode)
+        * enough credits left in the handle to remove the inode from
+        * the orphan list and set the dtime field.
+        */
+-      if (!ext4_handle_has_enough_credits(handle, 3)) {
+-              err = ext4_journal_extend(handle, 3);
++      if (!ext4_handle_has_enough_credits(handle, extra_credits)) {
++              err = ext4_journal_extend(handle, extra_credits);
+               if (err > 0)
+-                      err = ext4_journal_restart(handle, 3);
++                      err = ext4_journal_restart(handle, extra_credits);
+               if (err != 0) {
+                       ext4_warning(inode->i_sb,
+                                    "couldn't extend journal (err %d)", err);
+@@ -311,8 +338,12 @@ void ext4_evict_inode(struct inode *inode)
+               ext4_clear_inode(inode);
+       else
+               ext4_free_inode(handle, inode);
++
+       ext4_journal_stop(handle);
+       sb_end_intwrite(inode->i_sb);
++
++      if (lea_ino_array != NULL)
++              ext4_xattr_inode_array_free(inode, lea_ino_array);
+       return;
+ no_delete:
+       ext4_clear_inode(inode);        /* We must guarantee clearing of inode... */
+@@ -5008,7 +5039,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
+  *
+  * Also account for superblock, inode, quota and xattr blocks
+  */
+-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
++int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+                                 int pextents)
+ {
+       ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
+index 263002f..b69145b 100644
+--- a/fs/ext4/xattr.c
++++ b/fs/ext4/xattr.c
+@@ -202,6 +202,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
+       while (!IS_LAST_ENTRY(entry)) {
+               if (entry->e_value_size != 0 &&
++                  entry->e_value_inum == 0 &&
+                   (value_start + le16_to_cpu(entry->e_value_offs) <
+                    (void *)e + sizeof(__u32) ||
+                    value_start + le16_to_cpu(entry->e_value_offs) +
+@@ -234,19 +235,26 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
+ }
+ static inline int
+-ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
++ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size,
++                     struct inode *inode)
+ {
+       size_t value_size = le32_to_cpu(entry->e_value_size);
+-      if (entry->e_value_block != 0 || value_size > size ||
++      if (!entry->e_value_inum &&
+           le16_to_cpu(entry->e_value_offs) + value_size > size)
+               return -EFSCORRUPTED;
++      if (entry->e_value_inum &&
++          (le32_to_cpu(entry->e_value_inum) < EXT4_FIRST_INO(inode->i_sb) ||
++           le32_to_cpu(entry->e_value_inum) >
++           le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_inodes_count)))
++              return -EFSCORRUPTED;
+       return 0;
+ }
+ static int
+ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
+-                    const char *name, size_t size, int sorted)
++                    const char *name, size_t size, int sorted,
++                    struct inode *inode)
+ {
+       struct ext4_xattr_entry *entry;
+       size_t name_len;
+@@ -266,11 +274,104 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
+                       break;
+       }
+       *pentry = entry;
+-      if (!cmp && ext4_xattr_check_entry(entry, size))
++      if (!cmp && ext4_xattr_check_entry(entry, size, inode))
+               return -EFSCORRUPTED;
+       return cmp ? -ENODATA : 0;
+ }
++/*
++ * Read the EA value from an inode.
++ */
++static int
++ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t *size)
++{
++      unsigned long block = 0;
++      struct buffer_head *bh = NULL;
++      int blocksize;
++      size_t csize, ret_size = 0;
++
++      if (*size == 0)
++              return 0;
++
++      blocksize = ea_inode->i_sb->s_blocksize;
++
++      while (ret_size < *size) {
++              csize = (*size - ret_size) > blocksize ? blocksize :
++                                                      *size - ret_size;
++              bh = ext4_bread(NULL, ea_inode, block, 0);
++              if (IS_ERR(bh)) {
++                      *size = ret_size;
++                      return PTR_ERR(bh);
++              }
++              memcpy(buf, bh->b_data, csize);
++              brelse(bh);
++
++              buf += csize;
++              block += 1;
++              ret_size += csize;
++      }
++
++      *size = ret_size;
++
++      return 0;
++}
++
++struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, int *err)
++{
++      struct inode *ea_inode = NULL;
++
++      ea_inode = ext4_iget(parent->i_sb, ea_ino);
++      if (IS_ERR(ea_inode) || is_bad_inode(ea_inode)) {
++              int rc = IS_ERR(ea_inode) ? PTR_ERR(ea_inode) : 0;
++              ext4_error(parent->i_sb, "error while reading EA inode %lu "
++                         "/ %d %d", ea_ino, rc, is_bad_inode(ea_inode));
++              *err = rc != 0 ? rc : -EIO;
++              return NULL;
++      }
++
++      if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != parent->i_ino ||
++          ea_inode->i_generation != parent->i_generation) {
++              ext4_error(parent->i_sb, "Backpointer from EA inode %lu "
++                         "to parent invalid.", ea_ino);
++              *err = -EINVAL;
++              goto error;
++      }
++
++      if (!(EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL)) {
++              ext4_error(parent->i_sb, "EA inode %lu does not have "
++                         "EXT4_EA_INODE_FL flag set.\n", ea_ino);
++              *err = -EINVAL;
++              goto error;
++      }
++
++      *err = 0;
++      return ea_inode;
++
++error:
++      iput(ea_inode);
++      return NULL;
++}
++
++/*
++ * Read the value from the EA inode.
++ */
++static int
++ext4_xattr_inode_get(struct inode *inode, unsigned long ea_ino, void *buffer,
++                   size_t *size)
++{
++      struct inode *ea_inode = NULL;
++      int err;
++
++      ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
++      if (err)
++              return err;
++
++      err = ext4_xattr_inode_read(ea_inode, buffer, size);
++      iput(ea_inode);
++
++      return err;
++}
++
+ static int
+ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
+                    void *buffer, size_t buffer_size)
+@@ -303,7 +404,8 @@ bad_block:
+       }
+       ext4_xattr_cache_insert(ext4_mb_cache, bh);
+       entry = BFIRST(bh);
+-      error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
++      error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1,
++                                    inode);
+       if (error == -EFSCORRUPTED)
+               goto bad_block;
+       if (error)
+@@ -313,8 +415,16 @@ bad_block:
+               error = -ERANGE;
+               if (size > buffer_size)
+                       goto cleanup;
+-              memcpy(buffer, bh->b_data + le16_to_cpu(entry->e_value_offs),
+-                     size);
++              if (entry->e_value_inum) {
++                      error = ext4_xattr_inode_get(inode,
++                                           le32_to_cpu(entry->e_value_inum),
++                                           buffer, &size);
++                      if (error)
++                              goto cleanup;
++              } else {
++                      memcpy(buffer, bh->b_data +
++                             le16_to_cpu(entry->e_value_offs), size);
++              }
+       }
+       error = size;
+@@ -348,7 +458,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+       if (error)
+               goto cleanup;
+       error = ext4_xattr_find_entry(&entry, name_index, name,
+-                                    end - (void *)entry, 0);
++                                    end - (void *)entry, 0, inode);
+       if (error)
+               goto cleanup;
+       size = le32_to_cpu(entry->e_value_size);
+@@ -356,8 +466,16 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name,
+               error = -ERANGE;
+               if (size > buffer_size)
+                       goto cleanup;
+-              memcpy(buffer, (void *)IFIRST(header) +
+-                     le16_to_cpu(entry->e_value_offs), size);
++              if (entry->e_value_inum) {
++                      error = ext4_xattr_inode_get(inode,
++                                           le32_to_cpu(entry->e_value_inum),
++                                           buffer, &size);
++                      if (error)
++                              goto cleanup;
++              } else {
++                      memcpy(buffer, (void *)IFIRST(header) +
++                             le16_to_cpu(entry->e_value_offs), size);
++              }
+       }
+       error = size;
+@@ -603,7 +721,7 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+                                   size_t *min_offs, void *base, int *total)
+ {
+       for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+-              if (!last->e_value_block && last->e_value_size) {
++              if (!last->e_value_inum && last->e_value_size) {
+                       size_t offs = le16_to_cpu(last->e_value_offs);
+                       if (offs < *min_offs)
+                               *min_offs = offs;
+@@ -614,16 +732,176 @@ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last,
+       return (*min_offs - ((void *)last - base) - sizeof(__u32));
+ }
++/*
++ * Write the value of the EA in an inode.
++ */
+ static int
+-ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
++ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode,
++                     const void *buf, int bufsize)
++{
++      struct buffer_head *bh = NULL;
++      unsigned long block = 0;
++      unsigned blocksize = ea_inode->i_sb->s_blocksize;
++      unsigned max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits;
++      int csize, wsize = 0;
++      int ret = 0;
++      int retries = 0;
++
++retry:
++      while (ret >= 0 && ret < max_blocks) {
++              struct ext4_map_blocks map;
++              map.m_lblk = block += ret;
++              map.m_len = max_blocks -= ret;
++
++              ret = ext4_map_blocks(handle, ea_inode, &map,
++                                    EXT4_GET_BLOCKS_CREATE);
++              if (ret <= 0) {
++                      ext4_mark_inode_dirty(handle, ea_inode);
++                      if (ret == -ENOSPC &&
++                          ext4_should_retry_alloc(ea_inode->i_sb, &retries)) {
++                              ret = 0;
++                              goto retry;
++                      }
++                      break;
++              }
++      }
++
++      if (ret < 0)
++              return ret;
++
++      block = 0;
++      while (wsize < bufsize) {
++              if (bh != NULL)
++                      brelse(bh);
++              csize = (bufsize - wsize) > blocksize ? blocksize :
++                                                              bufsize - wsize;
++              bh = ext4_getblk(handle, ea_inode, block, 0);
++              if (IS_ERR(bh)) {
++                      ret = PTR_ERR(bh);
++                      goto out;
++              }
++              ret = ext4_journal_get_write_access(handle, bh);
++              if (ret)
++                      goto out;
++
++              memcpy(bh->b_data, buf, csize);
++              set_buffer_uptodate(bh);
++              ext4_handle_dirty_metadata(handle, ea_inode, bh);
++
++              buf += csize;
++              wsize += csize;
++              block += 1;
++      }
++
++      mutex_lock(&ea_inode->i_mutex);
++      i_size_write(ea_inode, wsize);
++      ext4_update_i_disksize(ea_inode, wsize);
++      mutex_unlock(&ea_inode->i_mutex);
++
++      ext4_mark_inode_dirty(handle, ea_inode);
++
++out:
++      brelse(bh);
++
++      return ret;
++}
++
++/*
++ * Create an inode to store the value of a large EA.
++ */
++static struct inode *
++ext4_xattr_inode_create(handle_t *handle, struct inode *inode)
++{
++      struct inode *ea_inode = NULL;
++
++      /*
++       * Let the next inode be the goal, so we try and allocate the EA inode
++       * in the same group, or nearby one.
++       */
++      ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
++                                S_IFREG|0600, NULL, inode->i_ino + 1, NULL);
++
++      if (!IS_ERR(ea_inode)) {
++              ea_inode->i_op = &ext4_file_inode_operations;
++              ea_inode->i_fop = &ext4_file_operations;
++              ext4_set_aops(ea_inode);
++              ea_inode->i_generation = inode->i_generation;
++              EXT4_I(ea_inode)->i_flags |= EXT4_EA_INODE_FL;
++
++              /*
++               * A back-pointer from EA inode to parent inode will be useful
++               * for e2fsck.
++               */
++              EXT4_XATTR_INODE_SET_PARENT(ea_inode, inode->i_ino);
++              unlock_new_inode(ea_inode);
++      }
++
++      return ea_inode;
++}
++
++/*
++ * Unlink the inode storing the value of the EA.
++ */
++int
++ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino)
++{
++      struct inode *ea_inode = NULL;
++      int err;
++
++      ea_inode = ext4_xattr_inode_iget(inode, ea_ino, &err);
++      if (err)
++              return err;
++
++      clear_nlink(ea_inode);
++      iput(ea_inode);
++
++      return 0;
++}
++
++/*
++ * Add value of the EA in an inode.
++ */
++static int
++ext4_xattr_inode_set(handle_t *handle, struct inode *inode, unsigned long *ea_ino,
++                   const void *value, size_t value_len)
++{
++      struct inode *ea_inode = NULL;
++      int err;
++
++      /* Create an inode for the EA value */
++      ea_inode = ext4_xattr_inode_create(handle, inode);
++      if (IS_ERR(ea_inode))
++              return -1;
++
++      err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
++      if (err)
++              clear_nlink(ea_inode);
++      else
++              *ea_ino = ea_inode->i_ino;
++
++      iput(ea_inode);
++
++      return err;
++}
++
++static int
++ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s,
++                   handle_t *handle, struct inode *inode)
+ {
+       struct ext4_xattr_entry *last;
+       size_t free, min_offs = s->end - s->base, name_len = strlen(i->name);
++      int in_inode = i->in_inode;
++
++      if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
++               EXT4_FEATURE_INCOMPAT_EA_INODE) &&
++          (EXT4_XATTR_SIZE(i->value_len) >
++           EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize)))
++              in_inode = 1;
+       /* Compute min_offs and last. */
+       last = s->first;
+       for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+-              if (!last->e_value_block && last->e_value_size) {
++              if (!last->e_value_inum && last->e_value_size) {
+                       size_t offs = le16_to_cpu(last->e_value_offs);
+                       if (offs < min_offs)
+                               min_offs = offs;
+@@ -631,15 +909,20 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+       }
+       free = min_offs - ((void *)last - s->base) - sizeof(__u32);
+       if (!s->not_found) {
+-              if (!s->here->e_value_block && s->here->e_value_size) {
++              if (!in_inode &&
++                  !s->here->e_value_inum && s->here->e_value_size) {
+                       size_t size = le32_to_cpu(s->here->e_value_size);
+                       free += EXT4_XATTR_SIZE(size);
+               }
+               free += EXT4_XATTR_LEN(name_len);
+       }
+       if (i->value) {
+-              if (free < EXT4_XATTR_LEN(name_len) +
+-                         EXT4_XATTR_SIZE(i->value_len))
++              size_t value_len = EXT4_XATTR_SIZE(i->value_len);
++
++              if (in_inode)
++                      value_len = 0;
++
++              if (free < EXT4_XATTR_LEN(name_len) + value_len)
+                       return -ENOSPC;
+       }
+@@ -653,7 +936,8 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+               s->here->e_name_len = name_len;
+               memcpy(s->here->e_name, i->name, name_len);
+       } else {
+-              if (!s->here->e_value_block && s->here->e_value_size) {
++              if (!s->here->e_value_inum && s->here->e_value_size &&
++                  s->here->e_value_offs > 0) {
+                       void *first_val = s->base + min_offs;
+                       size_t offs = le16_to_cpu(s->here->e_value_offs);
+                       void *val = s->base + offs;
+@@ -687,13 +971,18 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+                       last = s->first;
+                       while (!IS_LAST_ENTRY(last)) {
+                               size_t o = le16_to_cpu(last->e_value_offs);
+-                              if (!last->e_value_block &&
++                              if (!last->e_value_inum &&
+                                   last->e_value_size && o < offs)
+                                       last->e_value_offs =
+                                               cpu_to_le16(o + size);
+                               last = EXT4_XATTR_NEXT(last);
+                       }
+               }
++              if (s->here->e_value_inum) {
++                      ext4_xattr_inode_unlink(inode,
++                                      le32_to_cpu(s->here->e_value_inum));
++                      s->here->e_value_inum = 0;
++              }
+               if (!i->value) {
+                       /* Remove the old name. */
+                       size_t size = EXT4_XATTR_LEN(name_len);
+@@ -707,10 +996,17 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s)
+       if (i->value) {
+               /* Insert the new value. */
+               s->here->e_value_size = cpu_to_le32(i->value_len);
+-              if (i->value_len) {
++              if (in_inode) {
++                      unsigned long ea_ino = le32_to_cpu(s->here->e_value_inum);
++                      ext4_xattr_inode_set(handle, inode, &ea_ino, i->value,
++                                           i->value_len);
++                      s->here->e_value_inum = cpu_to_le32(ea_ino);
++                      s->here->e_value_offs = 0;
++              } else if (i->value_len) {
+                       size_t size = EXT4_XATTR_SIZE(i->value_len);
+                       void *val = s->base + min_offs - size;
+                       s->here->e_value_offs = cpu_to_le16(min_offs - size);
++                      s->here->e_value_inum = 0;
+                       if (i->value == EXT4_ZERO_XATTR_VALUE) {
+                               memset(val, 0, size);
+                       } else {
+@@ -760,7 +1056,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
+               bs->s.end = bs->bh->b_data + bs->bh->b_size;
+               bs->s.here = bs->s.first;
+               error = ext4_xattr_find_entry(&bs->s.here, i->name_index,
+-                                            i->name, bs->bh->b_size, 1);
++                                           i->name, bs->bh->b_size, 1, inode);
+               if (error && error != -ENODATA)
+                       goto cleanup;
+               bs->s.not_found = error;
+@@ -785,8 +1081,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+ #define header(x) ((struct ext4_xattr_header *)(x))
+-      if (i->value && i->value_len > sb->s_blocksize)
+-              return -ENOSPC;
+       if (s->base) {
+               ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
+                                       bs->bh->b_blocknr);
+@@ -802,7 +1096,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+                               ce = NULL;
+                       }
+                       ea_bdebug(bs->bh, "modifying in-place");
+-                      error = ext4_xattr_set_entry(i, s);
++                      error = ext4_xattr_set_entry(i, s, handle, inode);
+                       if (!error) {
+                               if (!IS_LAST_ENTRY(s->first))
+                                       ext4_xattr_rehash(header(s->base),
+@@ -854,7 +1148,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
+               s->end = s->base + sb->s_blocksize;
+       }
+-      error = ext4_xattr_set_entry(i, s);
++      error = ext4_xattr_set_entry(i, s, handle, inode);
+       if (error == -EFSCORRUPTED)
+               goto bad_block;
+       if (error)
+@@ -998,7 +1292,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
+               /* Find the named attribute. */
+               error = ext4_xattr_find_entry(&is->s.here, i->name_index,
+                                             i->name, is->s.end -
+-                                            (void *)is->s.base, 0);
++                                            (void *)is->s.base, 0, inode);
+               if (error && error != -ENODATA)
+                       return error;
+               is->s.not_found = error;
+@@ -1016,7 +1310,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+       if (EXT4_I(inode)->i_extra_isize == 0)
+               return -ENOSPC;
+-      error = ext4_xattr_set_entry(i, s);
++      error = ext4_xattr_set_entry(i, s, handle, inode);
+       if (error) {
+               if (error == -ENOSPC &&
+                   ext4_has_inline_data(inode)) {
+@@ -1028,7 +1322,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+                       error = ext4_xattr_ibody_find(inode, i, is);
+                       if (error)
+                               return error;
+-                      error = ext4_xattr_set_entry(i, s);
++                      error = ext4_xattr_set_entry(i, s, handle, inode);
+               }
+               if (error)
+                       return error;
+@@ -1054,7 +1348,7 @@ static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+       if (EXT4_I(inode)->i_extra_isize == 0)
+               return -ENOSPC;
+-      error = ext4_xattr_set_entry(i, s);
++      error = ext4_xattr_set_entry(i, s, handle, inode);
+       if (error)
+               return error;
+       header = IHDR(inode, ext4_raw_inode(&is->iloc));
+@@ -1090,7 +1384,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+               .name = name,
+               .value = value,
+               .value_len = value_len,
+-
++              .in_inode = 0,
+       };
+       struct ext4_xattr_ibody_find is = {
+               .s = { .not_found = -ENODATA, },
+@@ -1155,6 +1449,15 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
+                                       goto cleanup;
+                       }
+                       error = ext4_xattr_block_set(handle, inode, &i, &bs);
++                      if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
++                                      EXT4_FEATURE_INCOMPAT_EA_INODE) &&
++                          error == -ENOSPC) {
++                              /* xattr not fit to block, store at external
++                               * inode */
++                              i.in_inode = 1;
++                              error = ext4_xattr_ibody_set(handle, inode,
++                                                           &i, &is);
++                      }
+                       if (error)
+                               goto cleanup;
+                       if (!is.s.not_found) {
+@@ -1201,9 +1504,22 @@ ext4_xattr_set(struct inode *inode, int name_index, const char *name,
+              const void *value, size_t value_len, int flags)
+ {
+       handle_t *handle;
++      struct super_block *sb = inode->i_sb;
+       int error, retries = 0;
+       int credits = ext4_jbd2_credits_xattr(inode);
++      if ((value_len >= EXT4_XATTR_MIN_LARGE_EA_SIZE(sb->s_blocksize)) &&
++          EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EA_INODE)) {
++              int nrblocks = (value_len + sb->s_blocksize - 1) >>
++                                      sb->s_blocksize_bits;
++
++              /* For new inode */
++              credits += EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + 3;
++
++              /* For data blocks of EA inode */
++              credits += ext4_meta_trans_blocks(inode, nrblocks, 0);
++      }
++
+ retry:
+       handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits);
+       if (IS_ERR(handle)) {
+@@ -1215,7 +1531,7 @@ retry:
+                                             value, value_len, flags);
+               error2 = ext4_journal_stop(handle);
+               if (error == -ENOSPC &&
+-                  ext4_should_retry_alloc(inode->i_sb, &retries))
++                  ext4_should_retry_alloc(sb, &retries))
+                       goto retry;
+               if (error == 0)
+                       error = error2;
+@@ -1237,7 +1553,7 @@ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry,
+       /* Adjust the value offsets of the entries */
+       for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) {
+-              if (!last->e_value_block && last->e_value_size) {
++              if (!last->e_value_inum && last->e_value_size) {
+                       new_offs = le16_to_cpu(last->e_value_offs) +
+                                                       value_offs_shift;
+                       BUG_ON(new_offs + le32_to_cpu(last->e_value_size)
+@@ -1484,21 +1800,135 @@ cleanup:
+ }
++#define EIA_INCR 16 /* must be 2^n */
++#define EIA_MASK (EIA_INCR - 1)
++/* Add the large xattr @ino into @lea_ino_array for later deletion.
++ * If @lea_ino_array is new or full it will be grown and the old
++ * contents copied over.
++ */
++static int
++ext4_expand_ino_array(struct ext4_xattr_ino_array **lea_ino_array, __u32 ino)
++{
++      if (*lea_ino_array == NULL) {
++              /*
++               * Start with 15 inodes, so it fits into a power-of-two size.
++               * If *lea_ino_array is NULL, this is essentially offsetof()
++               */
++              (*lea_ino_array) =
++                      kmalloc(offsetof(struct ext4_xattr_ino_array,
++                                       xia_inodes[EIA_MASK]),
++                              GFP_NOFS);
++              if (*lea_ino_array == NULL)
++                      return -ENOMEM;
++              (*lea_ino_array)->xia_count = 0;
++      } else if (((*lea_ino_array)->xia_count & EIA_MASK) == EIA_MASK) {
++              /* expand the array once all 15 + n * 16 slots are full */
++              struct ext4_xattr_ino_array *new_array = NULL;
++              int count = (*lea_ino_array)->xia_count;
++
++              /* if new_array is NULL, this is essentially offsetof() */
++              new_array = kmalloc(
++                              offsetof(struct ext4_xattr_ino_array,
++                                       xia_inodes[count + EIA_INCR]),
++                              GFP_NOFS);
++              if (new_array == NULL)
++                      return -ENOMEM;
++              memcpy(new_array, *lea_ino_array,
++                     offsetof(struct ext4_xattr_ino_array,
++                              xia_inodes[count]));
++              kfree(*lea_ino_array);
++              *lea_ino_array = new_array;
++      }
++      (*lea_ino_array)->xia_inodes[(*lea_ino_array)->xia_count++] = ino;
++      return 0;
++}
++
++/**
++ * Add xattr inode to orphan list
++ */
++static int
++ext4_xattr_inode_orphan_add(handle_t *handle, struct inode *inode,
++                      int credits, struct ext4_xattr_ino_array *lea_ino_array)
++{
++      struct inode *ea_inode = NULL;
++      int idx = 0, error = 0;
++
++      if (lea_ino_array == NULL)
++              return 0;
++
++      for (; idx < lea_ino_array->xia_count; ++idx) {
++              if (!ext4_handle_has_enough_credits(handle, credits)) {
++                      error = ext4_journal_extend(handle, credits);
++                      if (error > 0)
++                              error = ext4_journal_restart(handle, credits);
++
++                      if (error != 0) {
++                              ext4_warning(inode->i_sb,
++                                      "couldn't extend journal "
++                                      "(err %d)", error);
++                              return error;
++                      }
++              }
++              ea_inode = ext4_xattr_inode_iget(inode,
++                              lea_ino_array->xia_inodes[idx], &error);
++              if (error)
++                      continue;
++              ext4_orphan_add(handle, ea_inode);
++              /* the inode's i_count will be released by caller */
++      }
++
++      return 0;
++}
+ /*
+  * ext4_xattr_delete_inode()
+  *
+- * Free extended attribute resources associated with this inode. This
++ * Free extended attribute resources associated with this inode. Traverse
++ * all entries and unlink any xattr inodes associated with this inode. This
+  * is called immediately before an inode is freed. We have exclusive
+- * access to the inode.
++ * access to the inode. If an orphan inode is deleted it will also delete any
++ * xattr block and all xattr inodes. They are checked by ext4_xattr_inode_iget()
++ * to ensure they belong to the parent inode and were not deleted already.
+  */
+-void
+-ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
++int
++ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
++                      struct ext4_xattr_ino_array **lea_ino_array)
+ {
+       struct buffer_head *bh = NULL;
++      struct ext4_xattr_ibody_header *header;
++      struct ext4_inode *raw_inode;
++      struct ext4_iloc iloc;
++      struct ext4_xattr_entry *entry;
++      int credits = 3, error = 0;
+-      if (!EXT4_I(inode)->i_file_acl)
++      if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR))
++              goto delete_external_ea;
++
++      error = ext4_get_inode_loc(inode, &iloc);
++      if (error)
++              goto cleanup;
++      raw_inode = ext4_raw_inode(&iloc);
++      header = IHDR(inode, raw_inode);
++      for (entry = IFIRST(header); !IS_LAST_ENTRY(entry);
++           entry = EXT4_XATTR_NEXT(entry)) {
++              if (!entry->e_value_inum)
++                      continue;
++              if (ext4_expand_ino_array(lea_ino_array,
++                                        entry->e_value_inum) != 0) {
++                      brelse(iloc.bh);
++                      goto cleanup;
++              }
++              entry->e_value_inum = 0;
++      }
++      brelse(iloc.bh);
++
++delete_external_ea:
++      if (!EXT4_I(inode)->i_file_acl) {
++              /* add xattr inode to orphan list */
++              ext4_xattr_inode_orphan_add(handle, inode, credits,
++                                              *lea_ino_array);
+               goto cleanup;
++      }
+       bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
+       if (!bh) {
+               EXT4_ERROR_INODE(inode, "block %llu read error",
+@@ -1511,11 +1941,69 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
+                                EXT4_I(inode)->i_file_acl);
+               goto cleanup;
+       }
++
++      for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry);
++           entry = EXT4_XATTR_NEXT(entry)) {
++              if (!entry->e_value_inum)
++                      continue;
++              if (ext4_expand_ino_array(lea_ino_array,
++                                        entry->e_value_inum) != 0)
++                      goto cleanup;
++              entry->e_value_inum = 0;
++      }
++
++      /* add xattr inode to orphan list */
++      error = ext4_xattr_inode_orphan_add(handle, inode, credits,
++                                      *lea_ino_array);
++      if (error != 0)
++              goto cleanup;
++
++      if (!IS_NOQUOTA(inode))
++              credits += 2 * EXT4_QUOTA_DEL_BLOCKS(inode->i_sb);
++
++      if (!ext4_handle_has_enough_credits(handle, credits)) {
++              error = ext4_journal_extend(handle, credits);
++              if (error > 0)
++                      error = ext4_journal_restart(handle, credits);
++              if (error != 0) {
++                      ext4_warning(inode->i_sb,
++                              "couldn't extend journal (err %d)", error);
++                      goto cleanup;
++              }
++      }
++
+       ext4_xattr_release_block(handle, inode, bh);
+       EXT4_I(inode)->i_file_acl = 0;
+ cleanup:
+       brelse(bh);
++
++      return error;
++}
++
++void
++ext4_xattr_inode_array_free(struct inode *inode,
++                          struct ext4_xattr_ino_array *lea_ino_array)
++{
++      struct inode    *ea_inode = NULL;
++      int             idx = 0;
++      int             err;
++
++      if (lea_ino_array == NULL)
++              return;
++
++      for (; idx < lea_ino_array->xia_count; ++idx) {
++              ea_inode = ext4_xattr_inode_iget(inode,
++                              lea_ino_array->xia_inodes[idx], &err);
++              if (err)
++                      continue;
++              /* for inode's i_count get from ext4_xattr_delete_inode */
++              if (!list_empty(&EXT4_I(ea_inode)->i_orphan))
++                      iput(ea_inode);
++              clear_nlink(ea_inode);
++              iput(ea_inode);
++      }
++      kfree(lea_ino_array);
+ }
+ /*
+@@ -1585,10 +2073,9 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
+                   entry1->e_name_index != entry2->e_name_index ||
+                   entry1->e_name_len != entry2->e_name_len ||
+                   entry1->e_value_size != entry2->e_value_size ||
++                  entry1->e_value_inum != entry2->e_value_inum ||
+                   memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
+                       return 1;
+-              if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
+-                      return -EFSCORRUPTED;
+               if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
+                          (char *)header2 + le16_to_cpu(entry2->e_value_offs),
+                          le32_to_cpu(entry1->e_value_size)))
+@@ -1673,7 +2160,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
+                      *name++;
+       }
+-      if (entry->e_value_block == 0 && entry->e_value_size != 0) {
++      if (!entry->e_value_inum && entry->e_value_size) {
+               __le32 *value = (__le32 *)((char *)header +
+                       le16_to_cpu(entry->e_value_offs));
+               for (n = (le32_to_cpu(entry->e_value_size) +
+diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
+index ddc0957..57c7ad5 100644
+--- a/fs/ext4/xattr.h
++++ b/fs/ext4/xattr.h
+@@ -43,7 +43,7 @@ struct ext4_xattr_entry {
+       __u8    e_name_len;     /* length of name */
+       __u8    e_name_index;   /* attribute name index */
+       __le16  e_value_offs;   /* offset in disk block of value */
+-      __le32  e_value_block;  /* disk block attribute is stored on (n/i) */
++      __le32  e_value_inum;   /* inode in which the value is stored */
+       __le32  e_value_size;   /* size of attribute value */
+       __le32  e_hash;         /* hash value of name and value */
+       char    e_name[0];      /* attribute name */
+@@ -68,6 +68,26 @@ struct ext4_xattr_entry {
+               EXT4_I(inode)->i_extra_isize))
+ #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1))
++/*
++ * Link EA inode back to parent one using i_mtime field.
++ * Extra integer type conversion added to ignore higher
++ * bits in i_mtime.tv_sec which might be set by ext4_get()
++ */
++#define EXT4_XATTR_INODE_SET_PARENT(inode, inum)      \
++do {                                                  \
++      (inode)->i_mtime.tv_sec = inum;                 \
++} while(0)
++
++#define EXT4_XATTR_INODE_GET_PARENT(inode)            \
++((__u32)(inode)->i_mtime.tv_sec)
++
++/*
++ * The minimum size of EA value when you start storing it in an external inode
++ * size of block - size of header - size of 1 entry - 4 null bytes
++*/
++#define EXT4_XATTR_MIN_LARGE_EA_SIZE(b)                                       \
++      ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4)
++
+ #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
+ #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
+ #define BFIRST(bh) ENTRY(BHDR(bh)+1)
+@@ -76,10 +96,11 @@ struct ext4_xattr_entry {
+ #define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+ struct ext4_xattr_info {
+-      int name_index;
+       const char *name;
+       const void *value;
+       size_t value_len;
++      int name_index;
++      int in_inode;
+ };
+ struct ext4_xattr_search {
+@@ -107,7 +128,14 @@ extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t);
+ extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int);
+ extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int);
+-extern void ext4_xattr_delete_inode(handle_t *, struct inode *);
++extern struct inode *ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
++                                         int *err);
++extern int ext4_xattr_inode_unlink(struct inode *inode, unsigned long ea_ino);
++extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode,
++                                 struct ext4_xattr_ino_array **array);
++extern void ext4_xattr_inode_array_free(struct inode *inode,
++                                      struct ext4_xattr_ino_array *array);
++
+ extern void ext4_xattr_put_super(struct super_block *);
+ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-misc.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-misc.patch
new file mode 100644 (file)
index 0000000..3691150
--- /dev/null
@@ -0,0 +1,156 @@
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 574a6c9..97d3432 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -1581,6 +1581,8 @@ static inline int ext4_encrypted_inode(struct inode *inode)
+ #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime
++#define JOURNAL_START_HAS_3ARGS       1
++
+ /*
+  * Codes for operating systems
+  */
+@@ -1812,7 +1814,21 @@ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_bl
+ EXTN_FEATURE_FUNCS(2)
+ EXTN_FEATURE_FUNCS(3)
+-EXTN_FEATURE_FUNCS(4)
++static inline bool ext4_has_unknown_ext4_compat_features(struct super_block *sb)
++{
++      return ((EXT4_SB(sb)->s_es->s_feature_compat &
++              cpu_to_le32(~EXT4_FEATURE_COMPAT_SUPP)) != 0);
++}
++static inline bool ext4_has_unknown_ext4_ro_compat_features(struct super_block *sb)
++{
++      return ((EXT4_SB(sb)->s_es->s_feature_ro_compat &
++              cpu_to_le32(~EXT4_FEATURE_RO_COMPAT_SUPP)) != 0);
++}
++static inline bool ext4_has_unknown_ext4_incompat_features(struct super_block *sb)
++{
++      return ((EXT4_SB(sb)->s_es->s_feature_incompat &
++              cpu_to_le32(~EXT4_FEATURE_INCOMPAT_SUPP)) != 0);
++}
+ static inline bool ext4_has_compat_features(struct super_block *sb)
+ {
+@@ -3149,6 +3165,11 @@ struct ext4_extent;
+ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+ extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
++extern struct buffer_head *ext4_read_inode_bitmap(struct super_block *sb,
++                                                ext4_group_t block_group);
++extern struct buffer_head *ext4_append(handle_t *handle,
++                                     struct inode *inode,
++                                     ext4_lblk_t *block);
+ extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
+ extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
+                              struct ext4_map_blocks *map, int flags);
+diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
+index 38a740f..0cccda3 100644
+--- a/fs/ext4/ialloc.c
++++ b/fs/ext4/ialloc.c
+@@ -153,7 +153,7 @@ static int ext4_validate_inode_bitmap(struct super_block *sb,
+  *
+  * Return buffer_head of bitmap on success or NULL.
+  */
+-static struct buffer_head *
++struct buffer_head *
+ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
+ {
+       struct ext4_group_desc *desc;
+@@ -233,6 +233,7 @@ out:
+       put_bh(bh);
+       return ERR_PTR(err);
+ }
++EXPORT_SYMBOL(ext4_read_inode_bitmap);
+ /*
+  * NOTE! When we get the inode, we're the only people
+diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
+index 296760b..04c5f63 100644
+--- a/fs/ext4/inode.c
++++ b/fs/ext4/inode.c
+@@ -5452,6 +5452,20 @@ out:
+       sb_end_pagefault(inode->i_sb);
+       return ret;
+ }
++EXPORT_SYMBOL(ext4_map_blocks);
++EXPORT_SYMBOL(ext4_truncate);
++EXPORT_SYMBOL(ext4_iget);
++EXPORT_SYMBOL(ext4_bread);
++EXPORT_SYMBOL(ext4_itable_unused_count);
++EXPORT_SYMBOL(ext4_force_commit);
++EXPORT_SYMBOL(ext4_mark_inode_dirty);
++EXPORT_SYMBOL(ext4_get_group_desc);
++EXPORT_SYMBOL(__ext4_journal_get_write_access);
++EXPORT_SYMBOL(__ext4_journal_start_sb);
++EXPORT_SYMBOL(__ext4_journal_stop);
++EXPORT_SYMBOL(__ext4_handle_dirty_metadata);
++EXPORT_SYMBOL(__ext4_std_error);
++EXPORT_SYMBOL(ext4fs_dirhash);
+ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+ {
+diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
+index 6bcf0ca..02368cb 100644
+--- a/fs/ext4/mballoc.c
++++ b/fs/ext4/mballoc.c
+@@ -723,7 +723,6 @@ void ext4_mb_generate_buddy(struct super_block *sb,
+                               void *buddy, void *bitmap, ext4_group_t group)
+ {
+       struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+-      struct ext4_sb_info *sbi = EXT4_SB(sb);
+       ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+       ext4_grpblk_t i = 0;
+       ext4_grpblk_t first;
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index aaa388a..36635b6 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -47,7 +47,7 @@
+ #define NAMEI_RA_BLOCKS  4
+ #define NAMEI_RA_SIZE      (NAMEI_RA_CHUNKS * NAMEI_RA_BLOCKS)
+-static struct buffer_head *ext4_append(handle_t *handle,
++struct buffer_head *ext4_append(handle_t *handle,
+                                       struct inode *inode,
+                                       ext4_lblk_t *block)
+ {
+@@ -157,6 +157,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
+       }
+       return bh;
+ }
++EXPORT_SYMBOL(ext4_append);
+ #ifndef assert
+ #define assert(test) J_ASSERT(test)
+@@ -2407,7 +2408,7 @@ EXPORT_SYMBOL(ext4_delete_entry);
+  * DIR_NLINK feature is set if 1) nlinks > EXT4_LINK_MAX or 2) nlinks == 2,
+  * since this indicates that nlinks count was previously 1.
+  */
+-static void ext4_inc_count(handle_t *handle, struct inode *inode)
++void ext4_inc_count(handle_t *handle, struct inode *inode)
+ {
+       inc_nlink(inode);
+       if (is_dx(inode) && inode->i_nlink > 1) {
+@@ -2418,16 +2419,18 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
+               }
+       }
+ }
++EXPORT_SYMBOL(ext4_inc_count);
+ /*
+  * If a directory had nlink == 1, then we should let it be 1. This indicates
+  * directory has >EXT4_LINK_MAX subdirs.
+  */
+-static void ext4_dec_count(handle_t *handle, struct inode *inode)
++void ext4_dec_count(handle_t *handle, struct inode *inode)
+ {
+       if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+               drop_nlink(inode);
+ }
++EXPORT_SYMBOL(ext4_dec_count);
+ static int ext4_add_nondir(handle_t *handle,
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-pdirop-001.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-pdirop-001.patch
new file mode 100644 (file)
index 0000000..ad8800f
--- /dev/null
@@ -0,0 +1,1916 @@
+diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
+index f52cf54..3f16939 100644
+--- a/fs/ext4/Makefile
++++ b/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+ ext4-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+               ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++              htree_lock.o \
+               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+               mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+               xattr_trusted.o inline.o readpage.o sysfs.o
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 2d22f1a..005c9b3 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -28,6 +28,7 @@
+ #include <linux/timer.h>
+ #include <linux/version.h>
+ #include <linux/wait.h>
++#include <linux/htree_lock.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
+ #include <linux/ratelimit.h>
+@@ -880,6 +881,9 @@ struct ext4_inode_info {
+       __u32   i_dtime;
+       ext4_fsblk_t    i_file_acl;
++      /* following fields for parallel directory operations -bzzz */
++      struct semaphore i_append_sem;
++
+       /*
+        * i_block_group is the number of the block group which contains
+        * this file's inode.  Constant across the lifetime of the inode,
+@@ -2086,6 +2090,71 @@ struct dx_hash_info
+       u32             *seed;
+ };
++/* assume name-hash is protected by upper layer */
++#define EXT4_HTREE_LOCK_HASH  0
++
++enum ext4_pdo_lk_types {
++#if EXT4_HTREE_LOCK_HASH
++      EXT4_LK_HASH,
++#endif
++      EXT4_LK_DX,             /* index block */
++      EXT4_LK_DE,             /* directory entry block */
++      EXT4_LK_SPIN,           /* spinlock */
++      EXT4_LK_MAX,
++};
++
++/* read-only bit */
++#define EXT4_LB_RO(b)         (1 << (b))
++/* read + write, high bits for writer */
++#define EXT4_LB_RW(b)         ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
++
++enum ext4_pdo_lock_bits {
++      /* DX lock bits */
++      EXT4_LB_DX_RO           = EXT4_LB_RO(EXT4_LK_DX),
++      EXT4_LB_DX              = EXT4_LB_RW(EXT4_LK_DX),
++      /* DE lock bits */
++      EXT4_LB_DE_RO           = EXT4_LB_RO(EXT4_LK_DE),
++      EXT4_LB_DE              = EXT4_LB_RW(EXT4_LK_DE),
++      /* DX spinlock bits */
++      EXT4_LB_SPIN_RO         = EXT4_LB_RO(EXT4_LK_SPIN),
++      EXT4_LB_SPIN            = EXT4_LB_RW(EXT4_LK_SPIN),
++      /* accurate searching */
++      EXT4_LB_EXACT           = EXT4_LB_RO(EXT4_LK_MAX << 1),
++};
++
++enum ext4_pdo_lock_opc {
++      /* external */
++      EXT4_HLOCK_READDIR      = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
++      EXT4_HLOCK_LOOKUP       = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
++                                 EXT4_LB_EXACT),
++      EXT4_HLOCK_DEL          = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
++                                 EXT4_LB_EXACT),
++      EXT4_HLOCK_ADD          = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
++
++      /* internal */
++      EXT4_HLOCK_LOOKUP_SAFE  = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
++                                 EXT4_LB_EXACT),
++      EXT4_HLOCK_DEL_SAFE     = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
++      EXT4_HLOCK_SPLIT        = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
++};
++
++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
++#define ext4_htree_lock_head_free(lhead)      htree_lock_head_free(lhead)
++
++extern struct htree_lock *ext4_htree_lock_alloc(void);
++#define ext4_htree_lock_free(lck)             htree_lock_free(lck)
++
++extern void ext4_htree_lock(struct htree_lock *lck,
++                          struct htree_lock_head *lhead,
++                          struct inode *dir, unsigned flags);
++#define ext4_htree_unlock(lck)                  htree_unlock(lck)
++
++extern struct buffer_head *__ext4_find_entry(struct inode *dir,
++                                      const struct qstr *d_name,
++                                      struct ext4_dir_entry_2 **res_dir,
++                                      int *inlined, struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck);
+ /* 32 and 64 bit signed EOF for dx directories */
+ #define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
+@@ -2475,8 +2544,16 @@ int ext4_insert_dentry(struct inode *dir,
+                      struct ext4_filename *fname, void *data);
+ static inline void ext4_update_dx_flag(struct inode *inode)
+ {
++      /* Disable it for ldiskfs, because going from a DX directory to
++       * a non-DX directory while it is in use will completely break
++       * the htree-locking.
++       * If we really want to support this operation in the future,
++       * we need to exclusively lock the directory at here which will
++       * increase complexity of code */
++#if 0
+       if (!ext4_has_feature_dir_index(inode->i_sb))
+               ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
++#endif
+ }
+ static unsigned char ext4_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+diff --git a/fs/ext4/htree_lock.c b/fs/ext4/htree_lock.c
+new file mode 100644
+index 0000000..99e7375
+--- /dev/null
++++ b/fs/ext4/htree_lock.c
+@@ -0,0 +1,880 @@
++/*
++ * fs/ext4/htree_lock.c
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen <liang@whamcloud.com>
++ */
++#include <linux/jbd2.h>
++#include <linux/hash.h>
++#include <linux/module.h>
++#include <linux/htree_lock.h>
++
++enum {
++      HTREE_LOCK_BIT_EX       = (1 << HTREE_LOCK_EX),
++      HTREE_LOCK_BIT_PW       = (1 << HTREE_LOCK_PW),
++      HTREE_LOCK_BIT_PR       = (1 << HTREE_LOCK_PR),
++      HTREE_LOCK_BIT_CW       = (1 << HTREE_LOCK_CW),
++      HTREE_LOCK_BIT_CR       = (1 << HTREE_LOCK_CR),
++};
++
++enum {
++      HTREE_LOCK_COMPAT_EX    = 0,
++      HTREE_LOCK_COMPAT_PW    = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
++      HTREE_LOCK_COMPAT_PR    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
++      HTREE_LOCK_COMPAT_CW    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
++      HTREE_LOCK_COMPAT_CR    = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
++                                HTREE_LOCK_BIT_PW,
++};
++
++static int htree_lock_compat[] = {
++      [HTREE_LOCK_EX]         HTREE_LOCK_COMPAT_EX,
++      [HTREE_LOCK_PW]         HTREE_LOCK_COMPAT_PW,
++      [HTREE_LOCK_PR]         HTREE_LOCK_COMPAT_PR,
++      [HTREE_LOCK_CW]         HTREE_LOCK_COMPAT_CW,
++      [HTREE_LOCK_CR]         HTREE_LOCK_COMPAT_CR,
++};
++
++/* max allowed htree-lock depth.
++ * We only need depth=3 for ext4 although user can have higher value. */
++#define HTREE_LOCK_DEP_MAX    16
++
++#ifdef HTREE_LOCK_DEBUG
++
++static char *hl_name[] = {
++      [HTREE_LOCK_EX]         "EX",
++      [HTREE_LOCK_PW]         "PW",
++      [HTREE_LOCK_PR]         "PR",
++      [HTREE_LOCK_CW]         "CW",
++      [HTREE_LOCK_CR]         "CR",
++};
++
++/* lock stats */
++struct htree_lock_node_stats {
++      unsigned long long      blocked[HTREE_LOCK_MAX];
++      unsigned long long      granted[HTREE_LOCK_MAX];
++      unsigned long long      retried[HTREE_LOCK_MAX];
++      unsigned long long      events;
++};
++
++struct htree_lock_stats {
++      struct htree_lock_node_stats    nodes[HTREE_LOCK_DEP_MAX];
++      unsigned long long      granted[HTREE_LOCK_MAX];
++      unsigned long long      blocked[HTREE_LOCK_MAX];
++};
++
++static struct htree_lock_stats hl_stats;
++
++void htree_lock_stat_reset(void)
++{
++      memset(&hl_stats, 0, sizeof(hl_stats));
++}
++
++void htree_lock_stat_print(int depth)
++{
++      int     i;
++      int     j;
++
++      printk(KERN_DEBUG "HTREE LOCK STATS:\n");
++      for (i = 0; i < HTREE_LOCK_MAX; i++) {
++              printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
++                     hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
++      }
++      for (i = 0; i < depth; i++) {
++              printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
++              for (j = 0; j < HTREE_LOCK_MAX; j++) {
++                      printk(KERN_DEBUG
++                              "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
++                              hl_name[j], hl_stats.nodes[i].granted[j],
++                              hl_stats.nodes[i].blocked[j],
++                              hl_stats.nodes[i].retried[j]);
++              }
++      }
++}
++
++#define lk_grant_inc(m)       do { hl_stats.granted[m]++; } while (0)
++#define lk_block_inc(m)       do { hl_stats.blocked[m]++; } while (0)
++#define ln_grant_inc(d, m)    do { hl_stats.nodes[d].granted[m]++; } while (0)
++#define ln_block_inc(d, m)    do { hl_stats.nodes[d].blocked[m]++; } while (0)
++#define ln_retry_inc(d, m)    do { hl_stats.nodes[d].retried[m]++; } while (0)
++#define ln_event_inc(d)       do { hl_stats.nodes[d].events++; } while (0)
++
++#else /* !DEBUG */
++
++void htree_lock_stat_reset(void) {}
++void htree_lock_stat_print(int depth) {}
++
++#define lk_grant_inc(m)             do {} while (0)
++#define lk_block_inc(m)             do {} while (0)
++#define ln_grant_inc(d, m)    do {} while (0)
++#define ln_block_inc(d, m)    do {} while (0)
++#define ln_retry_inc(d, m)    do {} while (0)
++#define ln_event_inc(d)             do {} while (0)
++
++#endif /* DEBUG */
++
++EXPORT_SYMBOL(htree_lock_stat_reset);
++EXPORT_SYMBOL(htree_lock_stat_print);
++
++#define HTREE_DEP_ROOT                  (-1)
++
++#define htree_spin_lock(lhead, dep)                           \
++      bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
++#define htree_spin_unlock(lhead, dep)                         \
++      bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
++
++#define htree_key_event_ignore(child, ln)                     \
++      (!((child)->lc_events & (1 << (ln)->ln_mode)))
++
++static int
++htree_key_list_empty(struct htree_lock_node *ln)
++{
++      return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
++}
++
++static void
++htree_key_list_del_init(struct htree_lock_node *ln)
++{
++      struct htree_lock_node *tmp = NULL;
++
++      if (!list_empty(&ln->ln_minor_list)) {
++              tmp = list_entry(ln->ln_minor_list.next,
++                               struct htree_lock_node, ln_minor_list);
++              list_del_init(&ln->ln_minor_list);
++      }
++
++      if (list_empty(&ln->ln_major_list))
++              return;
++
++      if (tmp == NULL) { /* not on minor key list */
++              list_del_init(&ln->ln_major_list);
++      } else {
++              BUG_ON(!list_empty(&tmp->ln_major_list));
++              list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
++      }
++}
++
++static void
++htree_key_list_replace_init(struct htree_lock_node *old,
++                          struct htree_lock_node *new)
++{
++      if (!list_empty(&old->ln_major_list))
++              list_replace_init(&old->ln_major_list, &new->ln_major_list);
++
++      if (!list_empty(&old->ln_minor_list))
++              list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
++}
++
++static void
++htree_key_event_enqueue(struct htree_lock_child *child,
++                      struct htree_lock_node *ln, int dep, void *event)
++{
++      struct htree_lock_node *tmp;
++
++      /* NB: ALWAYS called holding lhead::lh_lock(dep) */
++      BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
++      if (event == NULL || htree_key_event_ignore(child, ln))
++              return;
++
++      /* shouldn't be a very long list */
++      list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
++              if (tmp->ln_mode == HTREE_LOCK_NL) {
++                      ln_event_inc(dep);
++                      if (child->lc_callback != NULL)
++                              child->lc_callback(tmp->ln_ev_target, event);
++              }
++      }
++}
++
++static int
++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
++                      unsigned dep, int wait, void *event)
++{
++      struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
++      struct htree_lock_node *newln = &newlk->lk_nodes[dep];
++      struct htree_lock_node *curln = &curlk->lk_nodes[dep];
++
++      /* NB: ALWAYS called holding lhead::lh_lock(dep) */
++      /* NB: we only expect PR/PW lock mode at here, only these two modes are
++       * allowed for htree_node_lock(asserted in htree_node_lock_internal),
++       * NL is only used for listener, user can't directly require NL mode */
++      if ((curln->ln_mode == HTREE_LOCK_NL) ||
++          (curln->ln_mode != HTREE_LOCK_PW &&
++           newln->ln_mode != HTREE_LOCK_PW)) {
++              /* no conflict, attach it on granted list of @curlk */
++              if (curln->ln_mode != HTREE_LOCK_NL) {
++                      list_add(&newln->ln_granted_list,
++                               &curln->ln_granted_list);
++              } else {
++                      /* replace key owner */
++                      htree_key_list_replace_init(curln, newln);
++              }
++
++              list_add(&newln->ln_alive_list, &curln->ln_alive_list);
++              htree_key_event_enqueue(child, newln, dep, event);
++              ln_grant_inc(dep, newln->ln_mode);
++              return 1; /* still hold lh_lock */
++      }
++
++      if (!wait) { /* can't grant and don't want to wait */
++              ln_retry_inc(dep, newln->ln_mode);
++              newln->ln_mode = HTREE_LOCK_INVAL;
++              return -1; /* don't wait and just return -1 */
++      }
++
++      newlk->lk_task = current;
++      set_current_state(TASK_UNINTERRUPTIBLE);
++      /* conflict, attach it on blocked list of curlk */
++      list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
++      list_add(&newln->ln_alive_list, &curln->ln_alive_list);
++      ln_block_inc(dep, newln->ln_mode);
++
++      htree_spin_unlock(newlk->lk_head, dep);
++      /* wait to be given the lock */
++      if (newlk->lk_task != NULL)
++              schedule();
++      /* granted, no doubt, wake up will set me RUNNING */
++      if (event == NULL || htree_key_event_ignore(child, newln))
++              return 0; /* granted without lh_lock */
++
++      htree_spin_lock(newlk->lk_head, dep);
++      htree_key_event_enqueue(child, newln, dep, event);
++      return 1; /* still hold lh_lock */
++}
++
++/*
++ * get PR/PW access to particular tree-node according to @dep and @key,
++ * it will return -1 if @wait is false and can't immediately grant this lock.
++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get
++ * @event if it's not NULL.
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
++                       htree_lock_mode_t mode, u32 key, unsigned dep,
++                       int wait, void *event)
++{
++      LIST_HEAD(list);
++      struct htree_lock       *tmp;
++      struct htree_lock       *tmp2;
++      u16                     major;
++      u16                     minor;
++      u8                      reverse;
++      u8                      ma_bits;
++      u8                      mi_bits;
++
++      BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
++      BUG_ON(htree_node_is_granted(lck, dep));
++
++      key = hash_long(key, lhead->lh_hbits);
++
++      mi_bits = lhead->lh_hbits >> 1;
++      ma_bits = lhead->lh_hbits - mi_bits;
++
++      lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
++      lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
++      lck->lk_nodes[dep].ln_mode = mode;
++
++      /*
++       * The major key list is an ordered list, so searches are started
++       * at the end of the list that is numerically closer to major_key,
++       * so at most half of the list will be walked (for well-distributed
++       * keys). The list traversal aborts early if the expected key
++       * location is passed.
++       */
++      reverse = (major >= (1 << (ma_bits - 1)));
++
++      if (reverse) {
++              list_for_each_entry_reverse(tmp,
++                                      &lhead->lh_children[dep].lc_list,
++                                      lk_nodes[dep].ln_major_list) {
++                      if (tmp->lk_nodes[dep].ln_major_key == major) {
++                              goto search_minor;
++
++                      } else if (tmp->lk_nodes[dep].ln_major_key < major) {
++                              /* attach _after_ @tmp */
++                              list_add(&lck->lk_nodes[dep].ln_major_list,
++                                       &tmp->lk_nodes[dep].ln_major_list);
++                              goto out_grant_major;
++                      }
++              }
++
++              list_add(&lck->lk_nodes[dep].ln_major_list,
++                       &lhead->lh_children[dep].lc_list);
++              goto out_grant_major;
++
++      } else {
++              list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
++                                  lk_nodes[dep].ln_major_list) {
++                      if (tmp->lk_nodes[dep].ln_major_key == major) {
++                              goto search_minor;
++
++                      } else if (tmp->lk_nodes[dep].ln_major_key > major) {
++                              /* insert _before_ @tmp */
++                              list_add_tail(&lck->lk_nodes[dep].ln_major_list,
++                                      &tmp->lk_nodes[dep].ln_major_list);
++                              goto out_grant_major;
++                      }
++              }
++
++              list_add_tail(&lck->lk_nodes[dep].ln_major_list,
++                            &lhead->lh_children[dep].lc_list);
++              goto out_grant_major;
++      }
++
++ search_minor:
++      /*
++       * NB: minor_key list doesn't have a "head", @list is just a
++       * temporary stub for helping list searching, make sure it's removed
++       * after searching.
++       * minor_key list is an ordered list too.
++       */
++      list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
++
++      reverse = (minor >= (1 << (mi_bits - 1)));
++
++      if (reverse) {
++              list_for_each_entry_reverse(tmp2, &list,
++                                          lk_nodes[dep].ln_minor_list) {
++                      if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
++                              goto out_enqueue;
++
++                      } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
++                              /* attach _after_ @tmp2 */
++                              list_add(&lck->lk_nodes[dep].ln_minor_list,
++                                       &tmp2->lk_nodes[dep].ln_minor_list);
++                              goto out_grant_minor;
++                      }
++              }
++
++              list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
++
++      } else {
++              list_for_each_entry(tmp2, &list,
++                                  lk_nodes[dep].ln_minor_list) {
++                      if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
++                              goto out_enqueue;
++
++                      } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
++                              /* insert _before_ @tmp2 */
++                              list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
++                                      &tmp2->lk_nodes[dep].ln_minor_list);
++                              goto out_grant_minor;
++                      }
++              }
++
++              list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
++      }
++
++ out_grant_minor:
++      if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
++              /* new lock @lck is the first one on minor_key list, which
++               * means it has the smallest minor_key and it should
++               * replace @tmp as minor_key owner */
++              list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
++                                &lck->lk_nodes[dep].ln_major_list);
++      }
++      /* remove the temporary head */
++      list_del(&list);
++
++ out_grant_major:
++      ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
++      return 1; /* granted with holding lh_lock */
++
++ out_enqueue:
++      list_del(&list); /* remove temprary head */
++      return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
++}
++
++/*
++ * release the key of @lck at level @dep, and grant any blocked locks.
++ * caller will still listen on @key if @event is not NULL, which means
++ * caller can see a event (by event_cb) while granting any lock with
++ * the same key at level @dep.
++ * NB: ALWAYS called holding lhead::lh_lock
++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
++ */
++static void
++htree_node_unlock_internal(struct htree_lock_head *lhead,
++                         struct htree_lock *curlk, unsigned dep, void *event)
++{
++      struct htree_lock_node  *curln = &curlk->lk_nodes[dep];
++      struct htree_lock       *grtlk = NULL;
++      struct htree_lock_node  *grtln;
++      struct htree_lock       *poslk;
++      struct htree_lock       *tmplk;
++
++      if (!htree_node_is_granted(curlk, dep))
++              return;
++
++      if (!list_empty(&curln->ln_granted_list)) {
++              /* there is another granted lock */
++              grtlk = list_entry(curln->ln_granted_list.next,
++                                 struct htree_lock,
++                                 lk_nodes[dep].ln_granted_list);
++              list_del_init(&curln->ln_granted_list);
++      }
++
++      if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
++              /*
++               * @curlk is the only granted lock, so we confirmed:
++               * a) curln is key owner (attached on major/minor_list),
++               *    so if there is any blocked lock, it should be attached
++               *    on curln->ln_blocked_list
++               * b) we always can grant the first blocked lock
++               */
++              grtlk = list_entry(curln->ln_blocked_list.next,
++                                 struct htree_lock,
++                                 lk_nodes[dep].ln_blocked_list);
++              BUG_ON(grtlk->lk_task == NULL);
++              wake_up_process(grtlk->lk_task);
++      }
++
++      if (event != NULL &&
++          lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
++              curln->ln_ev_target = event;
++              curln->ln_mode = HTREE_LOCK_NL; /* listen! */
++      } else {
++              curln->ln_mode = HTREE_LOCK_INVAL;
++      }
++
++      if (grtlk == NULL) { /* I must be the only one locking this key */
++              struct htree_lock_node *tmpln;
++
++              BUG_ON(htree_key_list_empty(curln));
++
++              if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
++                      return;
++
++              /* not listening */
++              if (list_empty(&curln->ln_alive_list)) { /* no more listener */
++                      htree_key_list_del_init(curln);
++                      return;
++              }
++
++              tmpln = list_entry(curln->ln_alive_list.next,
++                                 struct htree_lock_node, ln_alive_list);
++
++              BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
++
++              htree_key_list_replace_init(curln, tmpln);
++              list_del_init(&curln->ln_alive_list);
++
++              return;
++      }
++
++      /* have a granted lock */
++      grtln = &grtlk->lk_nodes[dep];
++      if (!list_empty(&curln->ln_blocked_list)) {
++              /* only key owner can be on both lists */
++              BUG_ON(htree_key_list_empty(curln));
++
++              if (list_empty(&grtln->ln_blocked_list)) {
++                      list_add(&grtln->ln_blocked_list,
++                               &curln->ln_blocked_list);
++              }
++              list_del_init(&curln->ln_blocked_list);
++      }
++      /*
++       * NB: this is the tricky part:
++       * We have only two modes for child-lock (PR and PW), also,
++       * only owner of the key (attached on major/minor_list) can be on
++       * both blocked_list and granted_list, so @grtlk must be one
++       * of these two cases:
++       *
++       * a) @grtlk is taken from granted_list, which means we've granted
++       *    more than one lock so @grtlk has to be PR, the first blocked
++       *    lock must be PW and we can't grant it at all.
++       *    So even @grtlk is not owner of the key (empty blocked_list),
++       *    we don't care because we can't grant any lock.
++       * b) we just grant a new lock which is taken from head of blocked
++       *    list, and it should be the first granted lock, and it should
++       *    be the first one linked on blocked_list.
++       *
++       * Either way, we can get correct result by iterating blocked_list
++       * of @grtlk, and don't have to bother on how to find out
++       * owner of current key.
++       */
++      list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
++                               lk_nodes[dep].ln_blocked_list) {
++              if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
++                  poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
++                      break;
++              /* grant all readers */
++              list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
++              list_add(&poslk->lk_nodes[dep].ln_granted_list,
++                       &grtln->ln_granted_list);
++
++              BUG_ON(poslk->lk_task == NULL);
++              wake_up_process(poslk->lk_task);
++      }
++
++      /* if @curln is the owner of this key, replace it with @grtln */
++      if (!htree_key_list_empty(curln))
++              htree_key_list_replace_init(curln, grtln);
++
++      if (curln->ln_mode == HTREE_LOCK_INVAL)
++              list_del_init(&curln->ln_alive_list);
++}
++
++/*
++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted
++ * and 0 only if @wait is false and can't grant it immediately
++ */
++int
++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
++                  u32 key, unsigned dep, int wait, void *event)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      int rc;
++
++      BUG_ON(dep >= lck->lk_depth);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++      htree_spin_lock(lhead, dep);
++      rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
++      if (rc != 0)
++              htree_spin_unlock(lhead, dep);
++      return rc >= 0;
++}
++EXPORT_SYMBOL(htree_node_lock_try);
++
++/* it's wrapper of htree_node_unlock_internal */
++void
++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++
++      BUG_ON(dep >= lck->lk_depth);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++      htree_spin_lock(lhead, dep);
++      htree_node_unlock_internal(lhead, lck, dep, event);
++      htree_spin_unlock(lhead, dep);
++}
++EXPORT_SYMBOL(htree_node_unlock);
++
++/* stop listening on child-lock level @dep */
++void
++htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
++{
++      struct htree_lock_node *ln = &lck->lk_nodes[dep];
++      struct htree_lock_node *tmp;
++
++      BUG_ON(htree_node_is_granted(lck, dep));
++      BUG_ON(!list_empty(&ln->ln_blocked_list));
++      BUG_ON(!list_empty(&ln->ln_granted_list));
++
++      if (!htree_node_is_listening(lck, dep))
++              return;
++
++      htree_spin_lock(lck->lk_head, dep);
++      ln->ln_mode = HTREE_LOCK_INVAL;
++      ln->ln_ev_target = NULL;
++
++      if (htree_key_list_empty(ln)) { /* not owner */
++              list_del_init(&ln->ln_alive_list);
++              goto out;
++      }
++
++      /* I'm the owner... */
++      if (list_empty(&ln->ln_alive_list)) { /* no more listener */
++              htree_key_list_del_init(ln);
++              goto out;
++      }
++
++      tmp = list_entry(ln->ln_alive_list.next,
++                       struct htree_lock_node, ln_alive_list);
++
++      BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
++      htree_key_list_replace_init(ln, tmp);
++      list_del_init(&ln->ln_alive_list);
++ out:
++      htree_spin_unlock(lck->lk_head, dep);
++}
++EXPORT_SYMBOL(htree_node_stop_listen);
++
++/* release all child-locks if we have any */
++static void
++htree_node_release_all(struct htree_lock *lck)
++{
++      int     i;
++
++      for (i = 0; i < lck->lk_depth; i++) {
++              if (htree_node_is_granted(lck, i))
++                      htree_node_unlock(lck, i, NULL);
++              else if (htree_node_is_listening(lck, i))
++                      htree_node_stop_listen(lck, i);
++      }
++}
++
++/*
++ * obtain htree lock, it could be blocked inside if there's conflict
++ * with any granted or blocked lock and @wait is true.
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_lock_internal(struct htree_lock *lck, int wait)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      int     granted = 0;
++      int     blocked = 0;
++      int     i;
++
++      for (i = 0; i < HTREE_LOCK_MAX; i++) {
++              if (lhead->lh_ngranted[i] != 0)
++                      granted |= 1 << i;
++              if (lhead->lh_nblocked[i] != 0)
++                      blocked |= 1 << i;
++      }
++      if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
++          (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
++              /* will block current lock even it just conflicts with any
++               * other blocked lock, so lock like EX wouldn't starve */
++              if (!wait)
++                      return -1;
++              lhead->lh_nblocked[lck->lk_mode]++;
++              lk_block_inc(lck->lk_mode);
++
++              lck->lk_task = current;
++              list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
++
++              set_current_state(TASK_UNINTERRUPTIBLE);
++              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++              /* wait to be given the lock */
++              if (lck->lk_task != NULL)
++                      schedule();
++              /* granted, no doubt. wake up will set me RUNNING */
++              return 0; /* without lh_lock */
++      }
++      lhead->lh_ngranted[lck->lk_mode]++;
++      lk_grant_inc(lck->lk_mode);
++      return 1;
++}
++
++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
++static void
++htree_unlock_internal(struct htree_lock *lck)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      struct htree_lock *tmp;
++      struct htree_lock *tmp2;
++      int granted = 0;
++      int i;
++
++      BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
++
++      lhead->lh_ngranted[lck->lk_mode]--;
++      lck->lk_mode = HTREE_LOCK_INVAL;
++
++      for (i = 0; i < HTREE_LOCK_MAX; i++) {
++              if (lhead->lh_ngranted[i] != 0)
++                      granted |= 1 << i;
++      }
++      list_for_each_entry_safe(tmp, tmp2,
++                               &lhead->lh_blocked_list, lk_blocked_list) {
++              /* conflict with any granted lock? */
++              if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
++                      break;
++
++              list_del_init(&tmp->lk_blocked_list);
++
++              BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
++
++              lhead->lh_nblocked[tmp->lk_mode]--;
++              lhead->lh_ngranted[tmp->lk_mode]++;
++              granted |= 1 << tmp->lk_mode;
++
++              BUG_ON(tmp->lk_task == NULL);
++              wake_up_process(tmp->lk_task);
++      }
++}
++
++/* it's wrapper of htree_lock_internal and exported interface.
++ * It always return 1 with granted lock if @wait is true, it can return 0
++ * if @wait is false and locking request can't be granted immediately */
++int
++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
++             htree_lock_mode_t mode, int wait)
++{
++      int     rc;
++
++      BUG_ON(lck->lk_depth > lhead->lh_depth);
++      BUG_ON(lck->lk_head != NULL);
++      BUG_ON(lck->lk_task != NULL);
++
++      lck->lk_head = lhead;
++      lck->lk_mode = mode;
++
++      htree_spin_lock(lhead, HTREE_DEP_ROOT);
++      rc = htree_lock_internal(lck, wait);
++      if (rc != 0)
++              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++      return rc >= 0;
++}
++EXPORT_SYMBOL(htree_lock_try);
++
++/* it's wrapper of htree_unlock_internal and exported interface.
++ * It will release all htree_node_locks and htree_lock */
++void
++htree_unlock(struct htree_lock *lck)
++{
++      BUG_ON(lck->lk_head == NULL);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++      htree_node_release_all(lck);
++
++      htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
++      htree_unlock_internal(lck);
++      htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
++      lck->lk_head = NULL;
++      lck->lk_task = NULL;
++}
++EXPORT_SYMBOL(htree_unlock);
++
++/* change lock mode */
++void
++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
++{
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++      lck->lk_mode = mode;
++}
++EXPORT_SYMBOL(htree_change_mode);
++
++/* release htree lock, and lock it again with new mode.
++ * This function will first release all htree_node_locks and htree_lock,
++ * then try to gain htree_lock with new @mode.
++ * It always return 1 with granted lock if @wait is true, it can return 0
++ * if @wait is false and locking request can't be granted immediately */
++int
++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      int rc;
++
++      BUG_ON(lhead == NULL);
++      BUG_ON(lck->lk_mode == mode);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
++
++      htree_node_release_all(lck);
++
++      htree_spin_lock(lhead, HTREE_DEP_ROOT);
++      htree_unlock_internal(lck);
++      lck->lk_mode = mode;
++      rc = htree_lock_internal(lck, wait);
++      if (rc != 0)
++              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++      return rc >= 0;
++}
++EXPORT_SYMBOL(htree_change_lock_try);
++
++/* create a htree_lock head with @depth levels (number of child-locks),
++ * it is a per resoruce structure */
++struct htree_lock_head *
++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
++{
++      struct htree_lock_head *lhead;
++      int  i;
++
++      if (depth > HTREE_LOCK_DEP_MAX) {
++              printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++                      depth, HTREE_LOCK_DEP_MAX);
++              return NULL;
++      }
++
++      lhead = kzalloc(offsetof(struct htree_lock_head,
++                               lh_children[depth]) + priv, GFP_NOFS);
++      if (lhead == NULL)
++              return NULL;
++
++      if (hbits < HTREE_HBITS_MIN)
++              lhead->lh_hbits = HTREE_HBITS_MIN;
++      else if (hbits > HTREE_HBITS_MAX)
++              lhead->lh_hbits = HTREE_HBITS_MAX;
++
++      lhead->lh_lock = 0;
++      lhead->lh_depth = depth;
++      INIT_LIST_HEAD(&lhead->lh_blocked_list);
++      if (priv > 0) {
++              lhead->lh_private = (void *)lhead +
++                      offsetof(struct htree_lock_head, lh_children[depth]);
++      }
++
++      for (i = 0; i < depth; i++) {
++              INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
++              lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
++      }
++      return lhead;
++}
++EXPORT_SYMBOL(htree_lock_head_alloc);
++
++/* free the htree_lock head */
++void
++htree_lock_head_free(struct htree_lock_head *lhead)
++{
++      int     i;
++
++      BUG_ON(!list_empty(&lhead->lh_blocked_list));
++      for (i = 0; i < lhead->lh_depth; i++)
++              BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
++      kfree(lhead);
++}
++EXPORT_SYMBOL(htree_lock_head_free);
++
++/* register event callback for @events of child-lock at level @dep */
++void
++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
++                      unsigned events, htree_event_cb_t callback)
++{
++      BUG_ON(lhead->lh_depth <= dep);
++      lhead->lh_children[dep].lc_events = events;
++      lhead->lh_children[dep].lc_callback = callback;
++}
++EXPORT_SYMBOL(htree_lock_event_attach);
++
++/* allocate a htree_lock, which is per-thread structure, @pbytes is some
++ * extra-bytes as private data for caller */
++struct htree_lock *
++htree_lock_alloc(unsigned depth, unsigned pbytes)
++{
++      struct htree_lock *lck;
++      int i = offsetof(struct htree_lock, lk_nodes[depth]);
++
++      if (depth > HTREE_LOCK_DEP_MAX) {
++              printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++                      depth, HTREE_LOCK_DEP_MAX);
++              return NULL;
++      }
++      lck = kzalloc(i + pbytes, GFP_NOFS);
++      if (lck == NULL)
++              return NULL;
++
++      if (pbytes != 0)
++              lck->lk_private = (void *)lck + i;
++      lck->lk_mode = HTREE_LOCK_INVAL;
++      lck->lk_depth = depth;
++      INIT_LIST_HEAD(&lck->lk_blocked_list);
++
++      for (i = 0; i < depth; i++) {
++              struct htree_lock_node *node = &lck->lk_nodes[i];
++
++              node->ln_mode = HTREE_LOCK_INVAL;
++              INIT_LIST_HEAD(&node->ln_major_list);
++              INIT_LIST_HEAD(&node->ln_minor_list);
++              INIT_LIST_HEAD(&node->ln_alive_list);
++              INIT_LIST_HEAD(&node->ln_blocked_list);
++              INIT_LIST_HEAD(&node->ln_granted_list);
++      }
++
++      return lck;
++}
++EXPORT_SYMBOL(htree_lock_alloc);
++
++/* free htree_lock node */
++void
++htree_lock_free(struct htree_lock *lck)
++{
++      BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
++      kfree(lck);
++}
++EXPORT_SYMBOL(htree_lock_free);
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 2543b8f..e70b61a 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -52,6 +52,7 @@ struct buffer_head *ext4_append(handle_t *handle,
+                                       ext4_lblk_t *block)
+ {
+       struct buffer_head *bh;
++      struct ext4_inode_info *ei = EXT4_I(inode);
+       int err;
+       if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+@@ -59,15 +60,22 @@ struct buffer_head *ext4_append(handle_t *handle,
+                     EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+               return ERR_PTR(-ENOSPC);
++      /* with parallel dir operations all appends
++      * have to be serialized -bzzz */
++      down(&ei->i_append_sem);
++
+       *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+       bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
+-      if (IS_ERR(bh))
++      if (IS_ERR(bh)) {
++              up(&ei->i_append_sem);
+               return bh;
++      }
+       inode->i_size += inode->i_sb->s_blocksize;
+       EXT4_I(inode)->i_disksize = inode->i_size;
+       BUFFER_TRACE(bh, "get_write_access");
+       err = ext4_journal_get_write_access(handle, bh);
++      up(&ei->i_append_sem);
+       if (err) {
+               brelse(bh);
+               ext4_std_error(inode->i_sb, err);
+@@ -247,7 +255,8 @@ static unsigned dx_node_limit(struct inode *dir);
+ static struct dx_frame *dx_probe(struct ext4_filename *fname,
+                                struct inode *dir,
+                                struct dx_hash_info *hinfo,
+-                               struct dx_frame *frame);
++                               struct dx_frame *frame,
++                               struct htree_lock *lck);
+ static void dx_release(struct dx_frame *frames);
+ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+                      unsigned blocksize, struct dx_hash_info *hinfo,
+@@ -261,12 +270,13 @@ static void dx_insert_block(struct dx_frame *frame,
+ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+                                struct dx_frame *frames,
+-                               __u32 *start_hash);
++                               __u32 *start_hash, struct htree_lock *lck);
+ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+               struct ext4_filename *fname,
+-              struct ext4_dir_entry_2 **res_dir);
++              struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck);
+ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+-                           struct dentry *dentry, struct inode *inode);
++                           struct dentry *dentry, struct inode *inode,
++                           struct htree_lock *lck);
+ /* checksumming functions */
+ void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+@@ -733,6 +743,227 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+ }
+ #endif /* DX_DEBUG */
++/* private data for htree_lock */
++struct ext4_dir_lock_data {
++      unsigned                ld_flags;  /* bits-map for lock types */
++      unsigned                ld_count;  /* # entries of the last DX block */
++      struct dx_entry         ld_at_entry; /* copy of leaf dx_entry */
++      struct dx_entry         *ld_at;    /* position of leaf dx_entry */
++};
++
++#define ext4_htree_lock_data(l)       ((struct ext4_dir_lock_data *)(l)->lk_private)
++#define ext4_find_entry(dir, name, dirent, inline) \
++                      __ext4_find_entry(dir, name, dirent, inline, NULL)
++#define ext4_add_entry(handle, dentry, inode) \
++                      __ext4_add_entry(handle, dentry, inode, NULL)
++
++/* NB: ext4_lblk_t is 32 bits so we use high bits to identify invalid blk */
++#define EXT4_HTREE_NODE_CHANGED       (0xcafeULL << 32)
++
++static void ext4_htree_event_cb(void *target, void *event)
++{
++      u64 *block = (u64 *)target;
++
++      if (*block == dx_get_block((struct dx_entry *)event))
++              *block = EXT4_HTREE_NODE_CHANGED;
++}
++
++struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits)
++{
++      struct htree_lock_head *lhead;
++
++      lhead = htree_lock_head_alloc(EXT4_LK_MAX, hbits, 0);
++      if (lhead != NULL) {
++              htree_lock_event_attach(lhead, EXT4_LK_SPIN, HTREE_EVENT_WR,
++                                      ext4_htree_event_cb);
++      }
++      return lhead;
++}
++EXPORT_SYMBOL(ext4_htree_lock_head_alloc);
++
++struct htree_lock *ext4_htree_lock_alloc(void)
++{
++      return htree_lock_alloc(EXT4_LK_MAX,
++                              sizeof(struct ext4_dir_lock_data));
++}
++EXPORT_SYMBOL(ext4_htree_lock_alloc);
++
++static htree_lock_mode_t ext4_htree_mode(unsigned flags)
++{
++      switch (flags) {
++      default: /* 0 or unknown flags require EX lock */
++              return HTREE_LOCK_EX;
++      case EXT4_HLOCK_READDIR:
++              return HTREE_LOCK_PR;
++      case EXT4_HLOCK_LOOKUP:
++              return HTREE_LOCK_CR;
++      case EXT4_HLOCK_DEL:
++      case EXT4_HLOCK_ADD:
++              return HTREE_LOCK_CW;
++      }
++}
++
++/* return PR for read-only operations, otherwise return EX */
++static inline htree_lock_mode_t ext4_htree_safe_mode(unsigned flags)
++{
++      int writer = (flags & EXT4_LB_DE) == EXT4_LB_DE;
++
++      /* 0 requires EX lock */
++      return (flags == 0 || writer) ? HTREE_LOCK_EX : HTREE_LOCK_PR;
++}
++
++static int ext4_htree_safe_locked(struct htree_lock *lck)
++{
++      int writer;
++
++      if (lck == NULL || lck->lk_mode == HTREE_LOCK_EX)
++              return 1;
++
++      writer = (ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_DE) ==
++               EXT4_LB_DE;
++      if (writer) /* all readers & writers are excluded? */
++              return lck->lk_mode == HTREE_LOCK_EX;
++
++      /* all writers are excluded? */
++      return lck->lk_mode == HTREE_LOCK_PR ||
++             lck->lk_mode == HTREE_LOCK_PW ||
++             lck->lk_mode == HTREE_LOCK_EX;
++}
++
++/* relock htree_lock with EX mode if it's change operation, otherwise
++ * relock it with PR mode. It's noop if PDO is disabled. */
++static void ext4_htree_safe_relock(struct htree_lock *lck)
++{
++      if (!ext4_htree_safe_locked(lck)) {
++              unsigned flags = ext4_htree_lock_data(lck)->ld_flags;
++
++              htree_change_lock(lck, ext4_htree_safe_mode(flags));
++      }
++}
++
++void ext4_htree_lock(struct htree_lock *lck, struct htree_lock_head *lhead,
++                   struct inode *dir, unsigned flags)
++{
++      htree_lock_mode_t mode = is_dx(dir) ? ext4_htree_mode(flags) :
++                                            ext4_htree_safe_mode(flags);
++
++      ext4_htree_lock_data(lck)->ld_flags = flags;
++      htree_lock(lck, lhead, mode);
++      if (!is_dx(dir))
++              ext4_htree_safe_relock(lck); /* make sure it's safe locked */
++}
++EXPORT_SYMBOL(ext4_htree_lock);
++
++static int ext4_htree_node_lock(struct htree_lock *lck, struct dx_entry *at,
++                              unsigned lmask, int wait, void *ev)
++{
++      u32     key = (at == NULL) ? 0 : dx_get_block(at);
++      u32     mode;
++
++      /* NOOP if htree is well protected or caller doesn't require the lock */
++      if (ext4_htree_safe_locked(lck) ||
++         !(ext4_htree_lock_data(lck)->ld_flags & lmask))
++              return 1;
++
++      mode = (ext4_htree_lock_data(lck)->ld_flags & lmask) == lmask ?
++              HTREE_LOCK_PW : HTREE_LOCK_PR;
++      while (1) {
++              if (htree_node_lock_try(lck, mode, key, ffz(~lmask), wait, ev))
++                      return 1;
++              if (!(lmask & EXT4_LB_SPIN)) /* not a spinlock */
++                      return 0;
++              cpu_relax(); /* spin until granted */
++      }
++}
++
++static int ext4_htree_node_locked(struct htree_lock *lck, unsigned lmask)
++{
++      return ext4_htree_safe_locked(lck) ||
++             htree_node_is_granted(lck, ffz(~lmask));
++}
++
++static void ext4_htree_node_unlock(struct htree_lock *lck,
++                                 unsigned lmask, void *buf)
++{
++      /* NB: it's safe to call mutiple times or even it's not locked */
++      if (!ext4_htree_safe_locked(lck) &&
++           htree_node_is_granted(lck, ffz(~lmask)))
++              htree_node_unlock(lck, ffz(~lmask), buf);
++}
++
++#define ext4_htree_dx_lock(lck, key)          \
++      ext4_htree_node_lock(lck, key, EXT4_LB_DX, 1, NULL)
++#define ext4_htree_dx_lock_try(lck, key)      \
++      ext4_htree_node_lock(lck, key, EXT4_LB_DX, 0, NULL)
++#define ext4_htree_dx_unlock(lck)             \
++      ext4_htree_node_unlock(lck, EXT4_LB_DX, NULL)
++#define ext4_htree_dx_locked(lck)             \
++      ext4_htree_node_locked(lck, EXT4_LB_DX)
++
++static void ext4_htree_dx_need_lock(struct htree_lock *lck)
++{
++      struct ext4_dir_lock_data *ld;
++
++      if (ext4_htree_safe_locked(lck))
++              return;
++
++      ld = ext4_htree_lock_data(lck);
++      switch (ld->ld_flags) {
++      default:
++              return;
++      case EXT4_HLOCK_LOOKUP:
++              ld->ld_flags = EXT4_HLOCK_LOOKUP_SAFE;
++              return;
++      case EXT4_HLOCK_DEL:
++              ld->ld_flags = EXT4_HLOCK_DEL_SAFE;
++              return;
++      case EXT4_HLOCK_ADD:
++              ld->ld_flags = EXT4_HLOCK_SPLIT;
++              return;
++      }
++}
++
++#define ext4_htree_de_lock(lck, key)          \
++      ext4_htree_node_lock(lck, key, EXT4_LB_DE, 1, NULL)
++#define ext4_htree_de_unlock(lck)             \
++      ext4_htree_node_unlock(lck, EXT4_LB_DE, NULL)
++
++#define ext4_htree_spin_lock(lck, key, event) \
++      ext4_htree_node_lock(lck, key, EXT4_LB_SPIN, 0, event)
++#define ext4_htree_spin_unlock(lck)           \
++      ext4_htree_node_unlock(lck, EXT4_LB_SPIN, NULL)
++#define ext4_htree_spin_unlock_listen(lck, p) \
++      ext4_htree_node_unlock(lck, EXT4_LB_SPIN, p)
++
++static void ext4_htree_spin_stop_listen(struct htree_lock *lck)
++{
++      if (!ext4_htree_safe_locked(lck) &&
++          htree_node_is_listening(lck, ffz(~EXT4_LB_SPIN)))
++              htree_node_stop_listen(lck, ffz(~EXT4_LB_SPIN));
++}
++
++enum {
++      DX_HASH_COL_IGNORE,     /* ignore collision while probing frames */
++      DX_HASH_COL_YES,        /* there is collision and it does matter */
++      DX_HASH_COL_NO,         /* there is no collision */
++};
++
++static int dx_probe_hash_collision(struct htree_lock *lck,
++                                 struct dx_entry *entries,
++                                 struct dx_entry *at, u32 hash)
++{
++      if (!(lck && ext4_htree_lock_data(lck)->ld_flags & EXT4_LB_EXACT)) {
++              return DX_HASH_COL_IGNORE; /* don't care about collision */
++
++      } else if (at == entries + dx_get_count(entries) - 1) {
++              return DX_HASH_COL_IGNORE; /* not in any leaf of this DX */
++
++      } else { /* hash collision? */
++              return ((dx_get_hash(at + 1) & ~1) == hash) ?
++                      DX_HASH_COL_YES : DX_HASH_COL_NO;
++      }
++}
++
+ /*
+  * Probe for a directory leaf block to search.
+  *
+@@ -744,10 +975,11 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
+  */
+ static struct dx_frame *
+ dx_probe(struct ext4_filename *fname, struct inode *dir,
+-       struct dx_hash_info *hinfo, struct dx_frame *frame_in)
++       struct dx_hash_info *hinfo, struct dx_frame *frame_in,
++       struct htree_lock *lck)
+ {
+       unsigned count, indirect;
+-      struct dx_entry *at, *entries, *p, *q, *m;
++      struct dx_entry *at, *entries, *p, *q, *m, *dx = NULL;
+       struct dx_root_info *info;
+       struct dx_frame *frame = frame_in;
+       struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
+@@ -808,8 +1040,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+       dxtrace(printk("Look up %x", hash));
+       while (1) {
++              if (indirect == 0) { /* the last index level */
++                      /* NB: ext4_htree_dx_lock() could be noop if
++                       * DX-lock flag is not set for current operation */
++                      ext4_htree_dx_lock(lck, dx);
++                      ext4_htree_spin_lock(lck, dx, NULL);
++              }
+               count = dx_get_count(entries);
+-              if (!count || count > dx_get_limit(entries)) {
++              if (count == 0 || count > dx_get_limit(entries)) {
++                      ext4_htree_spin_unlock(lck); /* release spin */
+                       ext4_warning_inode(dir,
+                                          "dx entry: count %u beyond limit %u",
+                                          count, dx_get_limit(entries));
+@@ -847,8 +1086,70 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
+                              dx_get_block(at)));
+               frame->entries = entries;
+               frame->at = at;
+-              if (!indirect--)
++
++              if (indirect == 0) { /* the last index level */
++                      struct ext4_dir_lock_data *ld;
++                      u64 myblock;
++
++                      /* By default we only lock DE-block, however, we will
++                       * also lock the last level DX-block if:
++                       * a) there is hash collision
++                       *    we will set DX-lock flag (a few lines below)
++                       *    and redo to lock DX-block
++                       *    see detail in dx_probe_hash_collision()
++                       * b) it's a retry from splitting
++                       *    we need to lock the last level DX-block so nobody
++                       *    else can split any leaf blocks under the same
++                       *    DX-block, see detail in ext4_dx_add_entry()
++                       */
++                      if (ext4_htree_dx_locked(lck)) {
++                              /* DX-block is locked, just lock DE-block
++                               * and return */
++                              ext4_htree_spin_unlock(lck);
++                              if (!ext4_htree_safe_locked(lck))
++                                      ext4_htree_de_lock(lck, frame->at);
++                              return frame;
++                      }
++                      /* it's pdirop and no DX lock */
++                      if (dx_probe_hash_collision(lck, entries, at, hash) ==
++                          DX_HASH_COL_YES) {
++                              /* found hash collision, set DX-lock flag
++                               * and retry to abtain DX-lock */
++                              ext4_htree_spin_unlock(lck);
++                              ext4_htree_dx_need_lock(lck);
++                              continue;
++                      }
++                      ld = ext4_htree_lock_data(lck);
++                      /* because I don't lock DX, so @at can't be trusted
++                       * after I release spinlock so I have to save it */
++                      ld->ld_at = at;
++                      ld->ld_at_entry = *at;
++                      ld->ld_count = dx_get_count(entries);
++
++                      frame->at = &ld->ld_at_entry;
++                      myblock = dx_get_block(at);
++
++                      /* NB: ordering locking */
++                      ext4_htree_spin_unlock_listen(lck, &myblock);
++                      /* other thread can split this DE-block because:
++                       * a) I don't have lock for the DE-block yet
++                       * b) I released spinlock on DX-block
++                       * if it happened I can detect it by listening
++                       * splitting event on this DE-block */
++                      ext4_htree_de_lock(lck, frame->at);
++                      ext4_htree_spin_stop_listen(lck);
++
++                      if (myblock == EXT4_HTREE_NODE_CHANGED) {
++                              /* someone split this DE-block before
++                               * I locked it, I need to retry and lock
++                               * valid DE-block */
++                              ext4_htree_de_unlock(lck);
++                              continue;
++                      }
+                       return frame;
++              }
++              dx = at;
++              indirect--;
+               frame++;
+               frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+               if (IS_ERR(frame->bh)) {
+@@ -915,7 +1216,7 @@ static void dx_release(struct dx_frame *frames)
+ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+                                struct dx_frame *frames,
+-                               __u32 *start_hash)
++                               __u32 *start_hash, struct htree_lock *lck)
+ {
+       struct dx_frame *p;
+       struct buffer_head *bh;
+@@ -930,12 +1231,22 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+        * this loop, num_frames indicates the number of interior
+        * nodes need to be read.
+        */
++      ext4_htree_de_unlock(lck);
+       while (1) {
+-              if (++(p->at) < p->entries + dx_get_count(p->entries))
+-                      break;
++              if (num_frames > 0 || ext4_htree_dx_locked(lck)) {
++                      /* num_frames > 0 :
++                       *   DX block
++                       * ext4_htree_dx_locked:
++                       *   frame->at is reliable pointer returned by dx_probe,
++                       *   otherwise dx_probe already knew no collision */
++                      if (++(p->at) < p->entries + dx_get_count(p->entries))
++                              break;
++              }
+               if (p == frames)
+                       return 0;
+               num_frames++;
++              if (num_frames == 1)
++                      ext4_htree_dx_unlock(lck);
+               p--;
+       }
+@@ -958,6 +1269,13 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+        * block so no check is necessary
+        */
+       while (num_frames--) {
++              if (num_frames == 0) {
++                      /* it's not always necessary, we just don't want to
++                       * detect hash collision again */
++                      ext4_htree_dx_need_lock(lck);
++                      ext4_htree_dx_lock(lck, p->at);
++              }
++
+               bh = ext4_read_dirblock(dir, dx_get_block(p->at), INDEX);
+               if (IS_ERR(bh))
+                       return PTR_ERR(bh);
+@@ -966,6 +1284,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+               p->bh = bh;
+               p->at = p->entries = ((struct dx_node *) bh->b_data)->entries;
+       }
++      ext4_htree_de_lock(lck, p->at);
+       return 1;
+ }
+@@ -1110,10 +1429,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+       }
+       hinfo.hash = start_hash;
+       hinfo.minor_hash = 0;
+-      frame = dx_probe(NULL, dir, &hinfo, frames);
++      /* assume it's PR locked */
++      frame = dx_probe(NULL, dir, &hinfo, frames, NULL);
+       if (IS_ERR(frame))
+               return PTR_ERR(frame);
+-
+       /* Add '.' and '..' from the htree header */
+       if (!start_hash && !start_minor_hash) {
+               de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
+@@ -1148,7 +1467,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
+               count += ret;
+               hashval = ~0;
+               ret = ext4_htree_next_block(dir, HASH_NB_ALWAYS,
+-                                          frame, frames, &hashval);
++                                          frame, frames, &hashval, NULL);
+               *next_hash = hashval;
+               if (ret < 0) {
+                       err = ret;
+@@ -1372,10 +1691,10 @@ static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block,
+  * The returned buffer_head has ->b_count elevated.  The caller is expected
+  * to brelse() it when appropriate.
+  */
+-static struct buffer_head * ext4_find_entry (struct inode *dir,
++struct buffer_head *__ext4_find_entry(struct inode *dir,
+                                       const struct qstr *d_name,
+                                       struct ext4_dir_entry_2 **res_dir,
+-                                      int *inlined)
++                                      int *inlined, struct htree_lock *lck)
+ {
+       struct super_block *sb;
+       struct buffer_head *bh_use[NAMEI_RA_SIZE];
+@@ -1423,7 +1742,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
+               goto restart;
+       }
+       if (is_dx(dir)) {
+-              ret = ext4_dx_find_entry(dir, &fname, res_dir);
++              ret = ext4_dx_find_entry(dir, &fname, res_dir, lck);
+               /*
+                * On success, or if the error was file not found,
+                * return.  Otherwise, fall back to doing a search the
+@@ -1433,6 +1752,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
+                       goto cleanup_and_exit;
+               dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
+                              "falling back\n"));
++              ext4_htree_safe_relock(lck);
+       }
+       nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
+       start = EXT4_I(dir)->i_dir_start_lookup;
+@@ -1528,10 +1848,12 @@ cleanup_and_exit:
+       ext4_fname_free_filename(&fname);
+       return ret;
+ }
++EXPORT_SYMBOL(__ext4_find_entry);
+ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+                       struct ext4_filename *fname,
+-                      struct ext4_dir_entry_2 **res_dir)
++                      struct ext4_dir_entry_2 **res_dir,
++                      struct htree_lock *lck)
+ {
+       struct super_block * sb = dir->i_sb;
+       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+@@ -1543,7 +1865,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+ #ifdef CONFIG_EXT4_FS_ENCRYPTION
+       *res_dir = NULL;
+ #endif
+-      frame = dx_probe(fname, dir, NULL, frames);
++      frame = dx_probe(fname, dir, NULL, frames, lck);
+       if (IS_ERR(frame))
+               return (struct buffer_head *) frame;
+       do {
+@@ -1565,7 +1887,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+               /* Check to see if we should continue to search */
+               retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame,
+-                                             frames, NULL);
++                                             frames, NULL, lck);
+               if (retval < 0) {
+                       ext4_warning_inode(dir,
+                               "error %d reading directory index block",
+@@ -1738,8 +2060,9 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
+  * Returns pointer to de in block into which the new entry will be inserted.
+  */
+ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+-                      struct buffer_head **bh,struct dx_frame *frame,
+-                      struct dx_hash_info *hinfo)
++                      struct buffer_head **bh, struct dx_frame *frames,
++                      struct dx_frame *frame, struct dx_hash_info *hinfo,
++                      struct htree_lock *lck)
+ {
+       unsigned blocksize = dir->i_sb->s_blocksize;
+       unsigned count, continued;
+@@ -1801,8 +2124,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+                                       hash2, split, count-split));
+       /* Fancy dance to stay within two buffers */
+-      de2 = dx_move_dirents(data1, data2, map + split, count - split,
+-                            blocksize);
++      if (hinfo->hash < hash2) {
++              de2 = dx_move_dirents(data1, data2, map + split,
++                                    count - split, blocksize);
++      } else {
++              /* make sure we will add entry to the same block which
++               * we have already locked */
++              de2 = dx_move_dirents(data1, data2, map, split, blocksize);
++      }
+       de = dx_pack_dirents(data1, blocksize);
+       de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+                                          (char *) de,
+@@ -1823,12 +2152,21 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
+       dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2,
+                       blocksize, 1));
+-      /* Which block gets the new entry? */
+-      if (hinfo->hash >= hash2) {
+-              swap(*bh, bh2);
+-              de = de2;
++      ext4_htree_spin_lock(lck, frame > frames ? (frame - 1)->at : NULL,
++                           frame->at); /* notify block is being split */
++      if (hinfo->hash < hash2) {
++              dx_insert_block(frame, hash2 + continued, newblock);
++
++      } else {
++              /* switch block number */
++              dx_insert_block(frame, hash2 + continued,
++                              dx_get_block(frame->at));
++              dx_set_block(frame->at, newblock);
++              (frame->at)++;
+       }
+-      dx_insert_block(frame, hash2 + continued, newblock);
++      ext4_htree_spin_unlock(lck);
++      ext4_htree_dx_unlock(lck);
++
+       err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
+       if (err)
+               goto journal_error;
+@@ -2121,7 +2459,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
+       if (retval)
+               goto out_frames;        
+-      de = do_split(handle,dir, &bh2, frame, &fname->hinfo);
++      de = do_split(handle,dir, &bh2, frames, frame, &fname->hinfo, NULL);
+       if (IS_ERR(de)) {
+               retval = PTR_ERR(de);
+               goto out_frames;
+@@ -2231,8 +2569,8 @@ out:
+  * may not sleep between calling this and putting something into
+  * the entry, as someone else might have used it while you slept.
+  */
+-static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+-                        struct inode *inode)
++int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck)
+ {
+       struct inode *dir = d_inode(dentry->d_parent);
+       struct buffer_head *bh = NULL;
+@@ -2273,9 +2611,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
+               if (dentry->d_name.len == 2 &&
+                   memcmp(dentry->d_name.name, "..", 2) == 0)
+                       return ext4_update_dotdot(handle, dentry, inode);
+-              retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
++              retval = ext4_dx_add_entry(handle, &fname, dentry, inode, lck);
+               if (!retval || (retval != ERR_BAD_DX_DIR))
+                       goto out;
++              ext4_htree_safe_relock(lck);
+               ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
+               dx_fallback++;
+               ext4_mark_inode_dirty(handle, dir);
+@@ -2325,12 +2664,14 @@ out:
+               ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
+       return retval;
+ }
++EXPORT_SYMBOL(__ext4_add_entry);
+ /*
+  * Returns 0 for success, or a negative error value
+  */
+ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+-                           struct dentry *dentry, struct inode *inode)
++                           struct dentry *dentry, struct inode *inode,
++                           struct htree_lock *lck)
+ {
+       struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
+       struct dx_entry *entries, *at;
+@@ -2343,7 +2684,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+ again:
+       restart = 0;
+-      frame = dx_probe(fname, dir, NULL, frames);
++      frame = dx_probe(fname, dir, NULL, frames, lck);
+       if (IS_ERR(frame))
+               return PTR_ERR(frame);
+       entries = frame->entries;
+@@ -2373,6 +2714,11 @@ again:
+               struct dx_node *node2;
+               struct buffer_head *bh2;
++              if (!ext4_htree_safe_locked(lck)) { /* retry with EX lock */
++                      ext4_htree_safe_relock(lck);
++                      restart = 1;
++                      goto cleanup;
++              }
+               while (frame > frames) {
+                       if (dx_get_count((frame - 1)->entries) <
+                           dx_get_limit((frame - 1)->entries)) {
+@@ -2472,8 +2818,32 @@ again:
+                       restart = 1;
+                       goto cleanup;
+               }
++      } else if (!ext4_htree_dx_locked(lck)) {
++              struct ext4_dir_lock_data *ld = ext4_htree_lock_data(lck);
++
++              /* not well protected, require DX lock */
++              ext4_htree_dx_need_lock(lck);
++              at = frame > frames ? (frame - 1)->at : NULL;
++
++              /* NB: no risk of deadlock because it's just a try.
++               *
++               * NB: we check ld_count for twice, the first time before
++               * having DX lock, the second time after holding DX lock.
++               *
++               * NB: We never free blocks for directory so far, which
++               * means value returned by dx_get_count() should equal to
++               * ld->ld_count if nobody split any DE-block under @at,
++               * and ld->ld_at still points to valid dx_entry. */
++              if ((ld->ld_count != dx_get_count(entries)) ||
++                  !ext4_htree_dx_lock_try(lck, at) ||
++                  (ld->ld_count != dx_get_count(entries))) {
++                      restart = 1;
++                      goto cleanup;
++              }
++              /* OK, I've got DX lock and nothing changed */
++              frame->at = ld->ld_at;
+       }
+-      de = do_split(handle, dir, &bh, frame, &fname->hinfo);
++      de = do_split(handle, dir, &bh, frames, frame, &fname->hinfo, lck);
+       if (IS_ERR(de)) {
+               err = PTR_ERR(de);
+               goto cleanup;
+@@ -2484,6 +2854,8 @@ again:
+ journal_error:
+       ext4_std_error(dir->i_sb, err);
+ cleanup:
++      ext4_htree_dx_unlock(lck);
++      ext4_htree_de_unlock(lck);
+       brelse(bh);
+       dx_release(frames);
+       /* @restart is true means htree-path has been changed, we need to
+diff --git a/fs/ext4/super.c b/fs/ext4/super.c
+index 02fe65b..be65ad4 100644
+--- a/fs/ext4/super.c
++++ b/fs/ext4/super.c
+@@ -896,6 +896,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
+       ei->vfs_inode.i_version = 1;
+       spin_lock_init(&ei->i_raw_lock);
++      sema_init(&ei->i_append_sem, 1);
+       INIT_LIST_HEAD(&ei->i_prealloc_list);
+       spin_lock_init(&ei->i_prealloc_lock);
+       ext4_es_init_tree(&ei->i_es_tree);
+diff --git a/include/linux/htree_lock.h b/include/linux/htree_lock.h
+new file mode 100644
+index 0000000..9dc7788
+--- /dev/null
++++ b/include/linux/htree_lock.h
+@@ -0,0 +1,187 @@
++/*
++ * include/linux/htree_lock.h
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen <liang@whamcloud.com>
++ */
++
++/*
++ * htree lock
++ *
++ * htree_lock is an advanced lock, it can support five lock modes (concept is
++ * taken from DLM) and it's a sleeping lock.
++ *
++ * most common use case is:
++ * - create a htree_lock_head for data
++ * - each thread (contender) creates it's own htree_lock
++ * - contender needs to call htree_lock(lock_node, mode) to protect data and
++ *   call htree_unlock to release lock
++ *
++ * Also, there is advanced use-case which is more complex, user can have
++ * PW/PR lock on particular key, it's mostly used while user holding shared
++ * lock on the htree (CW, CR)
++ *
++ * htree_lock(lock_node, HTREE_LOCK_CR); lock the htree with CR
++ * htree_node_lock(lock_node, HTREE_LOCK_PR, key...); lock @key with PR
++ * ...
++ * htree_node_unlock(lock_node);; unlock the key
++ *
++ * Another tip is, we can have N-levels of this kind of keys, all we need to
++ * do is specifying N-levels while creating htree_lock_head, then we can
++ * lock/unlock a specific level by:
++ * htree_node_lock(lock_node, mode1, key1, level1...);
++ * do something;
++ * htree_node_lock(lock_node, mode1, key2, level2...);
++ * do something;
++ * htree_node_unlock(lock_node, level2);
++ * htree_node_unlock(lock_node, level1);
++ *
++ * NB: for multi-level, should be careful about locking order to avoid deadlock
++ */
++
++#ifndef _LINUX_HTREE_LOCK_H
++#define _LINUX_HTREE_LOCK_H
++
++#include <linux/list.h>
++#include <linux/spinlock.h>
++#include <linux/sched.h>
++
++/*
++ * Lock Modes
++ * more details can be found here:
++ * http://en.wikipedia.org/wiki/Distributed_lock_manager
++ */
++typedef enum {
++      HTREE_LOCK_EX   = 0, /* exclusive lock: incompatible with all others */
++      HTREE_LOCK_PW,       /* protected write: allows only CR users */
++      HTREE_LOCK_PR,       /* protected read: allow PR, CR users */
++      HTREE_LOCK_CW,       /* concurrent write: allow CR, CW users */
++      HTREE_LOCK_CR,       /* concurrent read: allow all but EX users */
++      HTREE_LOCK_MAX,      /* number of lock modes */
++} htree_lock_mode_t;
++
++#define HTREE_LOCK_NL         HTREE_LOCK_MAX
++#define HTREE_LOCK_INVAL      0xdead10c
++
++enum {
++      HTREE_HBITS_MIN         = 2,
++      HTREE_HBITS_DEF         = 14,
++      HTREE_HBITS_MAX         = 32,
++};
++
++enum {
++      HTREE_EVENT_DISABLE     = (0),
++      HTREE_EVENT_RD          = (1 << HTREE_LOCK_PR),
++      HTREE_EVENT_WR          = (1 << HTREE_LOCK_PW),
++      HTREE_EVENT_RDWR        = (HTREE_EVENT_RD | HTREE_EVENT_WR),
++};
++
++struct htree_lock;
++
++typedef void (*htree_event_cb_t)(void *target, void *event);
++
++struct htree_lock_child {
++      struct list_head        lc_list;        /* granted list */
++      htree_event_cb_t        lc_callback;    /* event callback */
++      unsigned                lc_events;      /* event types */
++};
++
++struct htree_lock_head {
++      unsigned long           lh_lock;        /* bits lock */
++      /* blocked lock list (htree_lock) */
++      struct list_head        lh_blocked_list;
++      /* # key levels */
++      u16                     lh_depth;
++      /* hash bits for key and limit number of locks */
++      u16                     lh_hbits;
++      /* counters for blocked locks */
++      u16                     lh_nblocked[HTREE_LOCK_MAX];
++      /* counters for granted locks */
++      u16                     lh_ngranted[HTREE_LOCK_MAX];
++      /* private data */
++      void                    *lh_private;
++      /* array of children locks */
++      struct htree_lock_child lh_children[0];
++};
++
++/* htree_lock_node_t is child-lock for a specific key (ln_value) */
++struct htree_lock_node {
++      htree_lock_mode_t       ln_mode;
++      /* major hash key */
++      u16                     ln_major_key;
++      /* minor hash key */
++      u16                     ln_minor_key;
++      struct list_head        ln_major_list;
++      struct list_head        ln_minor_list;
++      /* alive list, all locks (granted, blocked, listening) are on it */
++      struct list_head        ln_alive_list;
++      /* blocked list */
++      struct list_head        ln_blocked_list;
++      /* granted list */
++      struct list_head        ln_granted_list;
++      void                    *ln_ev_target;
++};
++
++struct htree_lock {
++      struct task_struct      *lk_task;
++      struct htree_lock_head  *lk_head;
++      void                    *lk_private;
++      unsigned                lk_depth;
++      htree_lock_mode_t       lk_mode;
++      struct list_head        lk_blocked_list;
++      struct htree_lock_node  lk_nodes[0];
++};
++
++/* create a lock head, which stands for a resource */
++struct htree_lock_head *htree_lock_head_alloc(unsigned depth,
++                                            unsigned hbits, unsigned priv);
++/* free a lock head */
++void htree_lock_head_free(struct htree_lock_head *lhead);
++/* register event callback for child lock at level @depth */
++void htree_lock_event_attach(struct htree_lock_head *lhead, unsigned depth,
++                           unsigned events, htree_event_cb_t callback);
++/* create a lock handle, which stands for a thread */
++struct htree_lock *htree_lock_alloc(unsigned depth, unsigned pbytes);
++/* free a lock handle */
++void htree_lock_free(struct htree_lock *lck);
++/* lock htree, when @wait is true, 0 is returned if the lock can't
++ * be granted immediately */
++int htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
++                 htree_lock_mode_t mode, int wait);
++/* unlock htree */
++void htree_unlock(struct htree_lock *lck);
++/* unlock and relock htree with @new_mode */
++int htree_change_lock_try(struct htree_lock *lck,
++                        htree_lock_mode_t new_mode, int wait);
++void htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode);
++/* require child lock (key) of htree at level @dep, @event will be sent to all
++ * listeners on this @key while lock being granted */
++int htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
++                      u32 key, unsigned dep, int wait, void *event);
++/* release child lock at level @dep, this lock will listen on it's key
++ * if @event isn't NULL, event_cb will be called against @lck while granting
++ * any other lock at level @dep with the same key */
++void htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event);
++/* stop listening on child lock at level @dep */
++void htree_node_stop_listen(struct htree_lock *lck, unsigned dep);
++/* for debug */
++void htree_lock_stat_print(int depth);
++void htree_lock_stat_reset(void);
++
++#define htree_lock(lck, lh, mode)     htree_lock_try(lck, lh, mode, 1)
++#define htree_change_lock(lck, mode)  htree_change_lock_try(lck, mode, 1)
++
++#define htree_lock_mode(lck)          ((lck)->lk_mode)
++
++#define htree_node_lock(lck, mode, key, dep)  \
++      htree_node_lock_try(lck, mode, key, dep, 1, NULL)
++/* this is only safe in thread context of lock owner */
++#define htree_node_is_granted(lck, dep)               \
++      ((lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_INVAL && \
++       (lck)->lk_nodes[dep].ln_mode != HTREE_LOCK_NL)
++/* this is only safe in thread context of lock owner */
++#define htree_node_is_listening(lck, dep)     \
++      ((lck)->lk_nodes[dep].ln_mode == HTREE_LOCK_NL)
++
++#endif
diff --git a/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-pdirop.patch b/ldiskfs/kernel_patches/patches/ubuntu14+16/ext4-pdirop.patch
new file mode 100644 (file)
index 0000000..508e396
--- /dev/null
@@ -0,0 +1,1916 @@
+diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
+index f52cf54..3f16939 100644
+--- a/fs/ext4/Makefile
++++ b/fs/ext4/Makefile
+@@ -6,6 +6,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o
+ ext4-y        := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
+               ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
++              htree_lock.o \
+               ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
+               mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
+               xattr_trusted.o inline.o readpage.o sysfs.o
+diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
+index 2d22f1a..005c9b3 100644
+--- a/fs/ext4/ext4.h
++++ b/fs/ext4/ext4.h
+@@ -28,6 +28,7 @@
+ #include <linux/timer.h>
+ #include <linux/version.h>
+ #include <linux/wait.h>
++#include <linux/htree_lock.h>
+ #include <linux/blockgroup_lock.h>
+ #include <linux/percpu_counter.h>
+ #include <linux/ratelimit.h>
+@@ -880,6 +881,9 @@ struct ext4_inode_info {
+       __u32   i_dtime;
+       ext4_fsblk_t    i_file_acl;
++      /* following fields for parallel directory operations -bzzz */
++      struct semaphore i_append_sem;
++
+       /*
+        * i_block_group is the number of the block group which contains
+        * this file's inode.  Constant across the lifetime of the inode,
+@@ -2086,6 +2090,71 @@ struct dx_hash_info
+       u32             *seed;
+ };
++/* assume name-hash is protected by upper layer */
++#define EXT4_HTREE_LOCK_HASH  0
++
++enum ext4_pdo_lk_types {
++#if EXT4_HTREE_LOCK_HASH
++      EXT4_LK_HASH,
++#endif
++      EXT4_LK_DX,             /* index block */
++      EXT4_LK_DE,             /* directory entry block */
++      EXT4_LK_SPIN,           /* spinlock */
++      EXT4_LK_MAX,
++};
++
++/* read-only bit */
++#define EXT4_LB_RO(b)         (1 << (b))
++/* read + write, high bits for writer */
++#define EXT4_LB_RW(b)         ((1 << (b)) | (1 << (EXT4_LK_MAX + (b))))
++
++enum ext4_pdo_lock_bits {
++      /* DX lock bits */
++      EXT4_LB_DX_RO           = EXT4_LB_RO(EXT4_LK_DX),
++      EXT4_LB_DX              = EXT4_LB_RW(EXT4_LK_DX),
++      /* DE lock bits */
++      EXT4_LB_DE_RO           = EXT4_LB_RO(EXT4_LK_DE),
++      EXT4_LB_DE              = EXT4_LB_RW(EXT4_LK_DE),
++      /* DX spinlock bits */
++      EXT4_LB_SPIN_RO         = EXT4_LB_RO(EXT4_LK_SPIN),
++      EXT4_LB_SPIN            = EXT4_LB_RW(EXT4_LK_SPIN),
++      /* accurate searching */
++      EXT4_LB_EXACT           = EXT4_LB_RO(EXT4_LK_MAX << 1),
++};
++
++enum ext4_pdo_lock_opc {
++      /* external */
++      EXT4_HLOCK_READDIR      = (EXT4_LB_DE_RO | EXT4_LB_DX_RO),
++      EXT4_HLOCK_LOOKUP       = (EXT4_LB_DE_RO | EXT4_LB_SPIN_RO |
++                                 EXT4_LB_EXACT),
++      EXT4_HLOCK_DEL          = (EXT4_LB_DE | EXT4_LB_SPIN_RO |
++                                 EXT4_LB_EXACT),
++      EXT4_HLOCK_ADD          = (EXT4_LB_DE | EXT4_LB_SPIN_RO),
++
++      /* internal */
++      EXT4_HLOCK_LOOKUP_SAFE  = (EXT4_LB_DE_RO | EXT4_LB_DX_RO |
++                                 EXT4_LB_EXACT),
++      EXT4_HLOCK_DEL_SAFE     = (EXT4_LB_DE | EXT4_LB_DX_RO | EXT4_LB_EXACT),
++      EXT4_HLOCK_SPLIT        = (EXT4_LB_DE | EXT4_LB_DX | EXT4_LB_SPIN),
++};
++
++extern struct htree_lock_head *ext4_htree_lock_head_alloc(unsigned hbits);
++#define ext4_htree_lock_head_free(lhead)      htree_lock_head_free(lhead)
++
++extern struct htree_lock *ext4_htree_lock_alloc(void);
++#define ext4_htree_lock_free(lck)             htree_lock_free(lck)
++
++extern void ext4_htree_lock(struct htree_lock *lck,
++                          struct htree_lock_head *lhead,
++                          struct inode *dir, unsigned flags);
++#define ext4_htree_unlock(lck)                  htree_unlock(lck)
++
++extern struct buffer_head *__ext4_find_entry(struct inode *dir,
++                                      const struct qstr *d_name,
++                                      struct ext4_dir_entry_2 **res_dir,
++                                      int *inlined, struct htree_lock *lck);
++extern int __ext4_add_entry(handle_t *handle, struct dentry *dentry,
++                    struct inode *inode, struct htree_lock *lck);
+ /* 32 and 64 bit signed EOF for dx directories */
+ #define EXT4_HTREE_EOF_32BIT   ((1UL  << (32 - 1)) - 1)
+@@ -2475,8 +2544,16 @@ int ext4_insert_dentry(struct inode *dir,
+                      struct ext4_filename *fname, void *data);
+ static inline void ext4_update_dx_flag(struct inode *inode)
+ {
++      /* Disable it for ldiskfs, because going from a DX directory to
++       * a non-DX directory while it is in use will completely break
++       * the htree-locking.
++       * If we really want to support this operation in the future,
++       * we need to exclusively lock the directory at here which will
++       * increase complexity of code */
++#if 0
+       if (!ext4_has_feature_dir_index(inode->i_sb))
+               ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
++#endif
+ }
+ static unsigned char ext4_filetype_table[] = {
+       DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
+diff --git a/fs/ext4/htree_lock.c b/fs/ext4/htree_lock.c
+new file mode 100644
+index 0000000..99e7375
+--- /dev/null
++++ b/fs/ext4/htree_lock.c
+@@ -0,0 +1,880 @@
++/*
++ * fs/ext4/htree_lock.c
++ *
++ * Copyright (c) 2011, 2012, Intel Corporation.
++ *
++ * Author: Liang Zhen <liang@whamcloud.com>
++ */
++#include <linux/jbd2.h>
++#include <linux/hash.h>
++#include <linux/module.h>
++#include <linux/htree_lock.h>
++
++enum {
++      HTREE_LOCK_BIT_EX       = (1 << HTREE_LOCK_EX),
++      HTREE_LOCK_BIT_PW       = (1 << HTREE_LOCK_PW),
++      HTREE_LOCK_BIT_PR       = (1 << HTREE_LOCK_PR),
++      HTREE_LOCK_BIT_CW       = (1 << HTREE_LOCK_CW),
++      HTREE_LOCK_BIT_CR       = (1 << HTREE_LOCK_CR),
++};
++
++enum {
++      HTREE_LOCK_COMPAT_EX    = 0,
++      HTREE_LOCK_COMPAT_PW    = HTREE_LOCK_COMPAT_EX | HTREE_LOCK_BIT_CR,
++      HTREE_LOCK_COMPAT_PR    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_PR,
++      HTREE_LOCK_COMPAT_CW    = HTREE_LOCK_COMPAT_PW | HTREE_LOCK_BIT_CW,
++      HTREE_LOCK_COMPAT_CR    = HTREE_LOCK_COMPAT_CW | HTREE_LOCK_BIT_PR |
++                                HTREE_LOCK_BIT_PW,
++};
++
++static int htree_lock_compat[] = {
++      [HTREE_LOCK_EX]         HTREE_LOCK_COMPAT_EX,
++      [HTREE_LOCK_PW]         HTREE_LOCK_COMPAT_PW,
++      [HTREE_LOCK_PR]         HTREE_LOCK_COMPAT_PR,
++      [HTREE_LOCK_CW]         HTREE_LOCK_COMPAT_CW,
++      [HTREE_LOCK_CR]         HTREE_LOCK_COMPAT_CR,
++};
++
++/* max allowed htree-lock depth.
++ * We only need depth=3 for ext4 although user can have higher value. */
++#define HTREE_LOCK_DEP_MAX    16
++
++#ifdef HTREE_LOCK_DEBUG
++
++static char *hl_name[] = {
++      [HTREE_LOCK_EX]         "EX",
++      [HTREE_LOCK_PW]         "PW",
++      [HTREE_LOCK_PR]         "PR",
++      [HTREE_LOCK_CW]         "CW",
++      [HTREE_LOCK_CR]         "CR",
++};
++
++/* lock stats */
++struct htree_lock_node_stats {
++      unsigned long long      blocked[HTREE_LOCK_MAX];
++      unsigned long long      granted[HTREE_LOCK_MAX];
++      unsigned long long      retried[HTREE_LOCK_MAX];
++      unsigned long long      events;
++};
++
++struct htree_lock_stats {
++      struct htree_lock_node_stats    nodes[HTREE_LOCK_DEP_MAX];
++      unsigned long long      granted[HTREE_LOCK_MAX];
++      unsigned long long      blocked[HTREE_LOCK_MAX];
++};
++
++static struct htree_lock_stats hl_stats;
++
++void htree_lock_stat_reset(void)
++{
++      memset(&hl_stats, 0, sizeof(hl_stats));
++}
++
++void htree_lock_stat_print(int depth)
++{
++      int     i;
++      int     j;
++
++      printk(KERN_DEBUG "HTREE LOCK STATS:\n");
++      for (i = 0; i < HTREE_LOCK_MAX; i++) {
++              printk(KERN_DEBUG "[%s]: G [%10llu], B [%10llu]\n",
++                     hl_name[i], hl_stats.granted[i], hl_stats.blocked[i]);
++      }
++      for (i = 0; i < depth; i++) {
++              printk(KERN_DEBUG "HTREE CHILD [%d] STATS:\n", i);
++              for (j = 0; j < HTREE_LOCK_MAX; j++) {
++                      printk(KERN_DEBUG
++                              "[%s]: G [%10llu], B [%10llu], R [%10llu]\n",
++                              hl_name[j], hl_stats.nodes[i].granted[j],
++                              hl_stats.nodes[i].blocked[j],
++                              hl_stats.nodes[i].retried[j]);
++              }
++      }
++}
++
++#define lk_grant_inc(m)       do { hl_stats.granted[m]++; } while (0)
++#define lk_block_inc(m)       do { hl_stats.blocked[m]++; } while (0)
++#define ln_grant_inc(d, m)    do { hl_stats.nodes[d].granted[m]++; } while (0)
++#define ln_block_inc(d, m)    do { hl_stats.nodes[d].blocked[m]++; } while (0)
++#define ln_retry_inc(d, m)    do { hl_stats.nodes[d].retried[m]++; } while (0)
++#define ln_event_inc(d)       do { hl_stats.nodes[d].events++; } while (0)
++
++#else /* !DEBUG */
++
++void htree_lock_stat_reset(void) {}
++void htree_lock_stat_print(int depth) {}
++
++#define lk_grant_inc(m)             do {} while (0)
++#define lk_block_inc(m)             do {} while (0)
++#define ln_grant_inc(d, m)    do {} while (0)
++#define ln_block_inc(d, m)    do {} while (0)
++#define ln_retry_inc(d, m)    do {} while (0)
++#define ln_event_inc(d)             do {} while (0)
++
++#endif /* DEBUG */
++
++EXPORT_SYMBOL(htree_lock_stat_reset);
++EXPORT_SYMBOL(htree_lock_stat_print);
++
++#define HTREE_DEP_ROOT                  (-1)
++
++#define htree_spin_lock(lhead, dep)                           \
++      bit_spin_lock((dep) + 1, &(lhead)->lh_lock)
++#define htree_spin_unlock(lhead, dep)                         \
++      bit_spin_unlock((dep) + 1, &(lhead)->lh_lock)
++
++#define htree_key_event_ignore(child, ln)                     \
++      (!((child)->lc_events & (1 << (ln)->ln_mode)))
++
++static int
++htree_key_list_empty(struct htree_lock_node *ln)
++{
++      return list_empty(&ln->ln_major_list) && list_empty(&ln->ln_minor_list);
++}
++
++static void
++htree_key_list_del_init(struct htree_lock_node *ln)
++{
++      struct htree_lock_node *tmp = NULL;
++
++      if (!list_empty(&ln->ln_minor_list)) {
++              tmp = list_entry(ln->ln_minor_list.next,
++                               struct htree_lock_node, ln_minor_list);
++              list_del_init(&ln->ln_minor_list);
++      }
++
++      if (list_empty(&ln->ln_major_list))
++              return;
++
++      if (tmp == NULL) { /* not on minor key list */
++              list_del_init(&ln->ln_major_list);
++      } else {
++              BUG_ON(!list_empty(&tmp->ln_major_list));
++              list_replace_init(&ln->ln_major_list, &tmp->ln_major_list);
++      }
++}
++
++static void
++htree_key_list_replace_init(struct htree_lock_node *old,
++                          struct htree_lock_node *new)
++{
++      if (!list_empty(&old->ln_major_list))
++              list_replace_init(&old->ln_major_list, &new->ln_major_list);
++
++      if (!list_empty(&old->ln_minor_list))
++              list_replace_init(&old->ln_minor_list, &new->ln_minor_list);
++}
++
++static void
++htree_key_event_enqueue(struct htree_lock_child *child,
++                      struct htree_lock_node *ln, int dep, void *event)
++{
++      struct htree_lock_node *tmp;
++
++      /* NB: ALWAYS called holding lhead::lh_lock(dep) */
++      BUG_ON(ln->ln_mode == HTREE_LOCK_NL);
++      if (event == NULL || htree_key_event_ignore(child, ln))
++              return;
++
++      /* shouldn't be a very long list */
++      list_for_each_entry(tmp, &ln->ln_alive_list, ln_alive_list) {
++              if (tmp->ln_mode == HTREE_LOCK_NL) {
++                      ln_event_inc(dep);
++                      if (child->lc_callback != NULL)
++                              child->lc_callback(tmp->ln_ev_target, event);
++              }
++      }
++}
++
++static int
++htree_node_lock_enqueue(struct htree_lock *newlk, struct htree_lock *curlk,
++                      unsigned dep, int wait, void *event)
++{
++      struct htree_lock_child *child = &newlk->lk_head->lh_children[dep];
++      struct htree_lock_node *newln = &newlk->lk_nodes[dep];
++      struct htree_lock_node *curln = &curlk->lk_nodes[dep];
++
++      /* NB: ALWAYS called holding lhead::lh_lock(dep) */
++      /* NB: we only expect PR/PW lock mode at here, only these two modes are
++       * allowed for htree_node_lock(asserted in htree_node_lock_internal),
++       * NL is only used for listener, user can't directly require NL mode */
++      if ((curln->ln_mode == HTREE_LOCK_NL) ||
++          (curln->ln_mode != HTREE_LOCK_PW &&
++           newln->ln_mode != HTREE_LOCK_PW)) {
++              /* no conflict, attach it on granted list of @curlk */
++              if (curln->ln_mode != HTREE_LOCK_NL) {
++                      list_add(&newln->ln_granted_list,
++                               &curln->ln_granted_list);
++              } else {
++                      /* replace key owner */
++                      htree_key_list_replace_init(curln, newln);
++              }
++
++              list_add(&newln->ln_alive_list, &curln->ln_alive_list);
++              htree_key_event_enqueue(child, newln, dep, event);
++              ln_grant_inc(dep, newln->ln_mode);
++              return 1; /* still hold lh_lock */
++      }
++
++      if (!wait) { /* can't grant and don't want to wait */
++              ln_retry_inc(dep, newln->ln_mode);
++              newln->ln_mode = HTREE_LOCK_INVAL;
++              return -1; /* don't wait and just return -1 */
++      }
++
++      newlk->lk_task = current;
++      set_current_state(TASK_UNINTERRUPTIBLE);
++      /* conflict, attach it on blocked list of curlk */
++      list_add_tail(&newln->ln_blocked_list, &curln->ln_blocked_list);
++      list_add(&newln->ln_alive_list, &curln->ln_alive_list);
++      ln_block_inc(dep, newln->ln_mode);
++
++      htree_spin_unlock(newlk->lk_head, dep);
++      /* wait to be given the lock */
++      if (newlk->lk_task != NULL)
++              schedule();
++      /* granted, no doubt, wake up will set me RUNNING */
++      if (event == NULL || htree_key_event_ignore(child, newln))
++              return 0; /* granted without lh_lock */
++
++      htree_spin_lock(newlk->lk_head, dep);
++      htree_key_event_enqueue(child, newln, dep, event);
++      return 1; /* still hold lh_lock */
++}
++
++/*
++ * get PR/PW access to particular tree-node according to @dep and @key,
++ * it will return -1 if @wait is false and can't immediately grant this lock.
++ * All listeners(HTREE_LOCK_NL) on @dep and with the same @key will get
++ * @event if it's not NULL.
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_node_lock_internal(struct htree_lock_head *lhead, struct htree_lock *lck,
++                       htree_lock_mode_t mode, u32 key, unsigned dep,
++                       int wait, void *event)
++{
++      LIST_HEAD(list);
++      struct htree_lock       *tmp;
++      struct htree_lock       *tmp2;
++      u16                     major;
++      u16                     minor;
++      u8                      reverse;
++      u8                      ma_bits;
++      u8                      mi_bits;
++
++      BUG_ON(mode != HTREE_LOCK_PW && mode != HTREE_LOCK_PR);
++      BUG_ON(htree_node_is_granted(lck, dep));
++
++      key = hash_long(key, lhead->lh_hbits);
++
++      mi_bits = lhead->lh_hbits >> 1;
++      ma_bits = lhead->lh_hbits - mi_bits;
++
++      lck->lk_nodes[dep].ln_major_key = major = key & ((1U << ma_bits) - 1);
++      lck->lk_nodes[dep].ln_minor_key = minor = key >> ma_bits;
++      lck->lk_nodes[dep].ln_mode = mode;
++
++      /*
++       * The major key list is an ordered list, so searches are started
++       * at the end of the list that is numerically closer to major_key,
++       * so at most half of the list will be walked (for well-distributed
++       * keys). The list traversal aborts early if the expected key
++       * location is passed.
++       */
++      reverse = (major >= (1 << (ma_bits - 1)));
++
++      if (reverse) {
++              list_for_each_entry_reverse(tmp,
++                                      &lhead->lh_children[dep].lc_list,
++                                      lk_nodes[dep].ln_major_list) {
++                      if (tmp->lk_nodes[dep].ln_major_key == major) {
++                              goto search_minor;
++
++                      } else if (tmp->lk_nodes[dep].ln_major_key < major) {
++                              /* attach _after_ @tmp */
++                              list_add(&lck->lk_nodes[dep].ln_major_list,
++                                       &tmp->lk_nodes[dep].ln_major_list);
++                              goto out_grant_major;
++                      }
++              }
++
++              list_add(&lck->lk_nodes[dep].ln_major_list,
++                       &lhead->lh_children[dep].lc_list);
++              goto out_grant_major;
++
++      } else {
++              list_for_each_entry(tmp, &lhead->lh_children[dep].lc_list,
++                                  lk_nodes[dep].ln_major_list) {
++                      if (tmp->lk_nodes[dep].ln_major_key == major) {
++                              goto search_minor;
++
++                      } else if (tmp->lk_nodes[dep].ln_major_key > major) {
++                              /* insert _before_ @tmp */
++                              list_add_tail(&lck->lk_nodes[dep].ln_major_list,
++                                      &tmp->lk_nodes[dep].ln_major_list);
++                              goto out_grant_major;
++                      }
++              }
++
++              list_add_tail(&lck->lk_nodes[dep].ln_major_list,
++                            &lhead->lh_children[dep].lc_list);
++              goto out_grant_major;
++      }
++
++ search_minor:
++      /*
++       * NB: minor_key list doesn't have a "head", @list is just a
++       * temporary stub for helping list searching, make sure it's removed
++       * after searching.
++       * minor_key list is an ordered list too.
++       */
++      list_add_tail(&list, &tmp->lk_nodes[dep].ln_minor_list);
++
++      reverse = (minor >= (1 << (mi_bits - 1)));
++
++      if (reverse) {
++              list_for_each_entry_reverse(tmp2, &list,
++                                          lk_nodes[dep].ln_minor_list) {
++                      if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
++                              goto out_enqueue;
++
++                      } else if (tmp2->lk_nodes[dep].ln_minor_key < minor) {
++                              /* attach _after_ @tmp2 */
++                              list_add(&lck->lk_nodes[dep].ln_minor_list,
++                                       &tmp2->lk_nodes[dep].ln_minor_list);
++                              goto out_grant_minor;
++                      }
++              }
++
++              list_add(&lck->lk_nodes[dep].ln_minor_list, &list);
++
++      } else {
++              list_for_each_entry(tmp2, &list,
++                                  lk_nodes[dep].ln_minor_list) {
++                      if (tmp2->lk_nodes[dep].ln_minor_key == minor) {
++                              goto out_enqueue;
++
++                      } else if (tmp2->lk_nodes[dep].ln_minor_key > minor) {
++                              /* insert _before_ @tmp2 */
++                              list_add_tail(&lck->lk_nodes[dep].ln_minor_list,
++                                      &tmp2->lk_nodes[dep].ln_minor_list);
++                              goto out_grant_minor;
++                      }
++              }
++
++              list_add_tail(&lck->lk_nodes[dep].ln_minor_list, &list);
++      }
++
++ out_grant_minor:
++      if (list.next == &lck->lk_nodes[dep].ln_minor_list) {
++              /* new lock @lck is the first one on minor_key list, which
++               * means it has the smallest minor_key and it should
++               * replace @tmp as minor_key owner */
++              list_replace_init(&tmp->lk_nodes[dep].ln_major_list,
++                                &lck->lk_nodes[dep].ln_major_list);
++      }
++      /* remove the temporary head */
++      list_del(&list);
++
++ out_grant_major:
++      ln_grant_inc(dep, lck->lk_nodes[dep].ln_mode);
++      return 1; /* granted with holding lh_lock */
++
++ out_enqueue:
++      list_del(&list); /* remove temprary head */
++      return htree_node_lock_enqueue(lck, tmp2, dep, wait, event);
++}
++
++/*
++ * release the key of @lck at level @dep, and grant any blocked locks.
++ * caller will still listen on @key if @event is not NULL, which means
++ * caller can see a event (by event_cb) while granting any lock with
++ * the same key at level @dep.
++ * NB: ALWAYS called holding lhead::lh_lock
++ * NB: listener will not block anyone because listening mode is HTREE_LOCK_NL
++ */
++static void
++htree_node_unlock_internal(struct htree_lock_head *lhead,
++                         struct htree_lock *curlk, unsigned dep, void *event)
++{
++      struct htree_lock_node  *curln = &curlk->lk_nodes[dep];
++      struct htree_lock       *grtlk = NULL;
++      struct htree_lock_node  *grtln;
++      struct htree_lock       *poslk;
++      struct htree_lock       *tmplk;
++
++      if (!htree_node_is_granted(curlk, dep))
++              return;
++
++      if (!list_empty(&curln->ln_granted_list)) {
++              /* there is another granted lock */
++              grtlk = list_entry(curln->ln_granted_list.next,
++                                 struct htree_lock,
++                                 lk_nodes[dep].ln_granted_list);
++              list_del_init(&curln->ln_granted_list);
++      }
++
++      if (grtlk == NULL && !list_empty(&curln->ln_blocked_list)) {
++              /*
++               * @curlk is the only granted lock, so we confirmed:
++               * a) curln is key owner (attached on major/minor_list),
++               *    so if there is any blocked lock, it should be attached
++               *    on curln->ln_blocked_list
++               * b) we always can grant the first blocked lock
++               */
++              grtlk = list_entry(curln->ln_blocked_list.next,
++                                 struct htree_lock,
++                                 lk_nodes[dep].ln_blocked_list);
++              BUG_ON(grtlk->lk_task == NULL);
++              wake_up_process(grtlk->lk_task);
++      }
++
++      if (event != NULL &&
++          lhead->lh_children[dep].lc_events != HTREE_EVENT_DISABLE) {
++              curln->ln_ev_target = event;
++              curln->ln_mode = HTREE_LOCK_NL; /* listen! */
++      } else {
++              curln->ln_mode = HTREE_LOCK_INVAL;
++      }
++
++      if (grtlk == NULL) { /* I must be the only one locking this key */
++              struct htree_lock_node *tmpln;
++
++              BUG_ON(htree_key_list_empty(curln));
++
++              if (curln->ln_mode == HTREE_LOCK_NL) /* listening */
++                      return;
++
++              /* not listening */
++              if (list_empty(&curln->ln_alive_list)) { /* no more listener */
++                      htree_key_list_del_init(curln);
++                      return;
++              }
++
++              tmpln = list_entry(curln->ln_alive_list.next,
++                                 struct htree_lock_node, ln_alive_list);
++
++              BUG_ON(tmpln->ln_mode != HTREE_LOCK_NL);
++
++              htree_key_list_replace_init(curln, tmpln);
++              list_del_init(&curln->ln_alive_list);
++
++              return;
++      }
++
++      /* have a granted lock */
++      grtln = &grtlk->lk_nodes[dep];
++      if (!list_empty(&curln->ln_blocked_list)) {
++              /* only key owner can be on both lists */
++              BUG_ON(htree_key_list_empty(curln));
++
++              if (list_empty(&grtln->ln_blocked_list)) {
++                      list_add(&grtln->ln_blocked_list,
++                               &curln->ln_blocked_list);
++              }
++              list_del_init(&curln->ln_blocked_list);
++      }
++      /*
++       * NB: this is the tricky part:
++       * We have only two modes for child-lock (PR and PW), also,
++       * only owner of the key (attached on major/minor_list) can be on
++       * both blocked_list and granted_list, so @grtlk must be one
++       * of these two cases:
++       *
++       * a) @grtlk is taken from granted_list, which means we've granted
++       *    more than one lock so @grtlk has to be PR, the first blocked
++       *    lock must be PW and we can't grant it at all.
++       *    So even @grtlk is not owner of the key (empty blocked_list),
++       *    we don't care because we can't grant any lock.
++       * b) we just grant a new lock which is taken from head of blocked
++       *    list, and it should be the first granted lock, and it should
++       *    be the first one linked on blocked_list.
++       *
++       * Either way, we can get correct result by iterating blocked_list
++       * of @grtlk, and don't have to bother on how to find out
++       * owner of current key.
++       */
++      list_for_each_entry_safe(poslk, tmplk, &grtln->ln_blocked_list,
++                               lk_nodes[dep].ln_blocked_list) {
++              if (grtlk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW ||
++                  poslk->lk_nodes[dep].ln_mode == HTREE_LOCK_PW)
++                      break;
++              /* grant all readers */
++              list_del_init(&poslk->lk_nodes[dep].ln_blocked_list);
++              list_add(&poslk->lk_nodes[dep].ln_granted_list,
++                       &grtln->ln_granted_list);
++
++              BUG_ON(poslk->lk_task == NULL);
++              wake_up_process(poslk->lk_task);
++      }
++
++      /* if @curln is the owner of this key, replace it with @grtln */
++      if (!htree_key_list_empty(curln))
++              htree_key_list_replace_init(curln, grtln);
++
++      if (curln->ln_mode == HTREE_LOCK_INVAL)
++              list_del_init(&curln->ln_alive_list);
++}
++
++/*
++ * it's just wrapper of htree_node_lock_internal, it returns 1 on granted
++ * and 0 only if @wait is false and can't grant it immediately
++ */
++int
++htree_node_lock_try(struct htree_lock *lck, htree_lock_mode_t mode,
++                  u32 key, unsigned dep, int wait, void *event)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      int rc;
++
++      BUG_ON(dep >= lck->lk_depth);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++      htree_spin_lock(lhead, dep);
++      rc = htree_node_lock_internal(lhead, lck, mode, key, dep, wait, event);
++      if (rc != 0)
++              htree_spin_unlock(lhead, dep);
++      return rc >= 0;
++}
++EXPORT_SYMBOL(htree_node_lock_try);
++
++/* it's wrapper of htree_node_unlock_internal */
++void
++htree_node_unlock(struct htree_lock *lck, unsigned dep, void *event)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++
++      BUG_ON(dep >= lck->lk_depth);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++      htree_spin_lock(lhead, dep);
++      htree_node_unlock_internal(lhead, lck, dep, event);
++      htree_spin_unlock(lhead, dep);
++}
++EXPORT_SYMBOL(htree_node_unlock);
++
++/* stop listening on child-lock level @dep */
++void
++htree_node_stop_listen(struct htree_lock *lck, unsigned dep)
++{
++      struct htree_lock_node *ln = &lck->lk_nodes[dep];
++      struct htree_lock_node *tmp;
++
++      BUG_ON(htree_node_is_granted(lck, dep));
++      BUG_ON(!list_empty(&ln->ln_blocked_list));
++      BUG_ON(!list_empty(&ln->ln_granted_list));
++
++      if (!htree_node_is_listening(lck, dep))
++              return;
++
++      htree_spin_lock(lck->lk_head, dep);
++      ln->ln_mode = HTREE_LOCK_INVAL;
++      ln->ln_ev_target = NULL;
++
++      if (htree_key_list_empty(ln)) { /* not owner */
++              list_del_init(&ln->ln_alive_list);
++              goto out;
++      }
++
++      /* I'm the owner... */
++      if (list_empty(&ln->ln_alive_list)) { /* no more listener */
++              htree_key_list_del_init(ln);
++              goto out;
++      }
++
++      tmp = list_entry(ln->ln_alive_list.next,
++                       struct htree_lock_node, ln_alive_list);
++
++      BUG_ON(tmp->ln_mode != HTREE_LOCK_NL);
++      htree_key_list_replace_init(ln, tmp);
++      list_del_init(&ln->ln_alive_list);
++ out:
++      htree_spin_unlock(lck->lk_head, dep);
++}
++EXPORT_SYMBOL(htree_node_stop_listen);
++
++/* release all child-locks if we have any */
++static void
++htree_node_release_all(struct htree_lock *lck)
++{
++      int     i;
++
++      for (i = 0; i < lck->lk_depth; i++) {
++              if (htree_node_is_granted(lck, i))
++                      htree_node_unlock(lck, i, NULL);
++              else if (htree_node_is_listening(lck, i))
++                      htree_node_stop_listen(lck, i);
++      }
++}
++
++/*
++ * obtain htree lock, it could be blocked inside if there's conflict
++ * with any granted or blocked lock and @wait is true.
++ * NB: ALWAYS called holding lhead::lh_lock
++ */
++static int
++htree_lock_internal(struct htree_lock *lck, int wait)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      int     granted = 0;
++      int     blocked = 0;
++      int     i;
++
++      for (i = 0; i < HTREE_LOCK_MAX; i++) {
++              if (lhead->lh_ngranted[i] != 0)
++                      granted |= 1 << i;
++              if (lhead->lh_nblocked[i] != 0)
++                      blocked |= 1 << i;
++      }
++      if ((htree_lock_compat[lck->lk_mode] & granted) != granted ||
++          (htree_lock_compat[lck->lk_mode] & blocked) != blocked) {
++              /* will block current lock even it just conflicts with any
++               * other blocked lock, so lock like EX wouldn't starve */
++              if (!wait)
++                      return -1;
++              lhead->lh_nblocked[lck->lk_mode]++;
++              lk_block_inc(lck->lk_mode);
++
++              lck->lk_task = current;
++              list_add_tail(&lck->lk_blocked_list, &lhead->lh_blocked_list);
++
++              set_current_state(TASK_UNINTERRUPTIBLE);
++              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++              /* wait to be given the lock */
++              if (lck->lk_task != NULL)
++                      schedule();
++              /* granted, no doubt. wake up will set me RUNNING */
++              return 0; /* without lh_lock */
++      }
++      lhead->lh_ngranted[lck->lk_mode]++;
++      lk_grant_inc(lck->lk_mode);
++      return 1;
++}
++
++/* release htree lock. NB: ALWAYS called holding lhead::lh_lock */
++static void
++htree_unlock_internal(struct htree_lock *lck)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      struct htree_lock *tmp;
++      struct htree_lock *tmp2;
++      int granted = 0;
++      int i;
++
++      BUG_ON(lhead->lh_ngranted[lck->lk_mode] == 0);
++
++      lhead->lh_ngranted[lck->lk_mode]--;
++      lck->lk_mode = HTREE_LOCK_INVAL;
++
++      for (i = 0; i < HTREE_LOCK_MAX; i++) {
++              if (lhead->lh_ngranted[i] != 0)
++                      granted |= 1 << i;
++      }
++      list_for_each_entry_safe(tmp, tmp2,
++                               &lhead->lh_blocked_list, lk_blocked_list) {
++              /* conflict with any granted lock? */
++              if ((htree_lock_compat[tmp->lk_mode] & granted) != granted)
++                      break;
++
++              list_del_init(&tmp->lk_blocked_list);
++
++              BUG_ON(lhead->lh_nblocked[tmp->lk_mode] == 0);
++
++              lhead->lh_nblocked[tmp->lk_mode]--;
++              lhead->lh_ngranted[tmp->lk_mode]++;
++              granted |= 1 << tmp->lk_mode;
++
++              BUG_ON(tmp->lk_task == NULL);
++              wake_up_process(tmp->lk_task);
++      }
++}
++
++/* it's wrapper of htree_lock_internal and exported interface.
++ * It always return 1 with granted lock if @wait is true, it can return 0
++ * if @wait is false and locking request can't be granted immediately */
++int
++htree_lock_try(struct htree_lock *lck, struct htree_lock_head *lhead,
++             htree_lock_mode_t mode, int wait)
++{
++      int     rc;
++
++      BUG_ON(lck->lk_depth > lhead->lh_depth);
++      BUG_ON(lck->lk_head != NULL);
++      BUG_ON(lck->lk_task != NULL);
++
++      lck->lk_head = lhead;
++      lck->lk_mode = mode;
++
++      htree_spin_lock(lhead, HTREE_DEP_ROOT);
++      rc = htree_lock_internal(lck, wait);
++      if (rc != 0)
++              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++      return rc >= 0;
++}
++EXPORT_SYMBOL(htree_lock_try);
++
++/* it's wrapper of htree_unlock_internal and exported interface.
++ * It will release all htree_node_locks and htree_lock */
++void
++htree_unlock(struct htree_lock *lck)
++{
++      BUG_ON(lck->lk_head == NULL);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++
++      htree_node_release_all(lck);
++
++      htree_spin_lock(lck->lk_head, HTREE_DEP_ROOT);
++      htree_unlock_internal(lck);
++      htree_spin_unlock(lck->lk_head, HTREE_DEP_ROOT);
++      lck->lk_head = NULL;
++      lck->lk_task = NULL;
++}
++EXPORT_SYMBOL(htree_unlock);
++
++/* change lock mode */
++void
++htree_change_mode(struct htree_lock *lck, htree_lock_mode_t mode)
++{
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL);
++      lck->lk_mode = mode;
++}
++EXPORT_SYMBOL(htree_change_mode);
++
++/* release htree lock, and lock it again with new mode.
++ * This function will first release all htree_node_locks and htree_lock,
++ * then try to gain htree_lock with new @mode.
++ * It always return 1 with granted lock if @wait is true, it can return 0
++ * if @wait is false and locking request can't be granted immediately */
++int
++htree_change_lock_try(struct htree_lock *lck, htree_lock_mode_t mode, int wait)
++{
++      struct htree_lock_head *lhead = lck->lk_head;
++      int rc;
++
++      BUG_ON(lhead == NULL);
++      BUG_ON(lck->lk_mode == mode);
++      BUG_ON(lck->lk_mode == HTREE_LOCK_INVAL || mode == HTREE_LOCK_INVAL);
++
++      htree_node_release_all(lck);
++
++      htree_spin_lock(lhead, HTREE_DEP_ROOT);
++      htree_unlock_internal(lck);
++      lck->lk_mode = mode;
++      rc = htree_lock_internal(lck, wait);
++      if (rc != 0)
++              htree_spin_unlock(lhead, HTREE_DEP_ROOT);
++      return rc >= 0;
++}
++EXPORT_SYMBOL(htree_change_lock_try);
++
++/* create a htree_lock head with @depth levels (number of child-locks),
++ * it is a per resoruce structure */
++struct htree_lock_head *
++htree_lock_head_alloc(unsigned depth, unsigned hbits, unsigned priv)
++{
++      struct htree_lock_head *lhead;
++      int  i;
++
++      if (depth > HTREE_LOCK_DEP_MAX) {
++              printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++                      depth, HTREE_LOCK_DEP_MAX);
++              return NULL;
++      }
++
++      lhead = kzalloc(offsetof(struct htree_lock_head,
++                               lh_children[depth]) + priv, GFP_NOFS);
++      if (lhead == NULL)
++              return NULL;
++
++      if (hbits < HTREE_HBITS_MIN)
++              lhead->lh_hbits = HTREE_HBITS_MIN;
++      else if (hbits > HTREE_HBITS_MAX)
++              lhead->lh_hbits = HTREE_HBITS_MAX;
++
++      lhead->lh_lock = 0;
++      lhead->lh_depth = depth;
++      INIT_LIST_HEAD(&lhead->lh_blocked_list);
++      if (priv > 0) {
++              lhead->lh_private = (void *)lhead +
++                      offsetof(struct htree_lock_head, lh_children[depth]);
++      }
++
++      for (i = 0; i < depth; i++) {
++              INIT_LIST_HEAD(&lhead->lh_children[i].lc_list);
++              lhead->lh_children[i].lc_events = HTREE_EVENT_DISABLE;
++      }
++      return lhead;
++}
++EXPORT_SYMBOL(htree_lock_head_alloc);
++
++/* free the htree_lock head */
++void
++htree_lock_head_free(struct htree_lock_head *lhead)
++{
++      int     i;
++
++      BUG_ON(!list_empty(&lhead->lh_blocked_list));
++      for (i = 0; i < lhead->lh_depth; i++)
++              BUG_ON(!list_empty(&lhead->lh_children[i].lc_list));
++      kfree(lhead);
++}
++EXPORT_SYMBOL(htree_lock_head_free);
++
++/* register event callback for @events of child-lock at level @dep */
++void
++htree_lock_event_attach(struct htree_lock_head *lhead, unsigned dep,
++                      unsigned events, htree_event_cb_t callback)
++{
++      BUG_ON(lhead->lh_depth <= dep);
++      lhead->lh_children[dep].lc_events = events;
++      lhead->lh_children[dep].lc_callback = callback;
++}
++EXPORT_SYMBOL(htree_lock_event_attach);
++
++/* allocate a htree_lock, which is per-thread structure, @pbytes is some
++ * extra-bytes as private data for caller */
++struct htree_lock *
++htree_lock_alloc(unsigned depth, unsigned pbytes)
++{
++      struct htree_lock *lck;
++      int i = offsetof(struct htree_lock, lk_nodes[depth]);
++
++      if (depth > HTREE_LOCK_DEP_MAX) {
++              printk(KERN_ERR "%d is larger than max htree_lock depth %d\n",
++                      depth, HTREE_LOCK_DEP_MAX);
++              return NULL;
++      }
++      lck = kzalloc(i + pbytes, GFP_NOFS);
++      if (lck == NULL)
++              return NULL;
++
++      if (pbytes != 0)
++              lck->lk_private = (void *)lck + i;
++      lck->lk_mode = HTREE_LOCK_INVAL;
++      lck->lk_depth = depth;
++      INIT_LIST_HEAD(&lck->lk_blocked_list);
++
++      for (i = 0; i < depth; i++) {
++              struct htree_lock_node *node = &lck->lk_nodes[i];
++
++              node->ln_mode = HTREE_LOCK_INVAL;
++              INIT_LIST_HEAD(&node->ln_major_list);
++              INIT_LIST_HEAD(&node->ln_minor_list);
++              INIT_LIST_HEAD(&node->ln_alive_list);
++              INIT_LIST_HEAD(&node->ln_blocked_list);
++              INIT_LIST_HEAD(&node->ln_granted_list);
++      }
++
++      return lck;
++}
++EXPORT_SYMBOL(htree_lock_alloc);
++
++/* free htree_lock node */
++void
++htree_lock_free(struct htree_lock *lck)
++{
++      BUG_ON(lck->lk_mode != HTREE_LOCK_INVAL);
++      kfree(lck);
++}
++EXPORT_SYMBOL(htree_lock_free);
+diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
+index 3f70bca..99a8da2 100644
+--- a/fs/ext4/namei.c
++++ b/fs/ext4/namei.c
+@@ -52,6 +52,7 @@ struct buffer_head *ext4_append(handle_t *handle,
+                                       ext4_lblk_t *block)
+ {
+       struct buffer_head *bh;
++      struct ext4_inode_info *ei = EXT4_I(inode);
+       int err;
+       if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb &&
+@@ -59,15 +60,22 @@ struct buffer_head *ext4_append(handle_t *handle,
+                     EXT4_SB(inode->i_sb)->s_max_dir_size_kb)))
+               return ERR_PTR(-ENOSPC);
++      /* with parallel dir operations all appends
++      * have to be serialized -bzzz */
++      down(&ei->i_append_sem);
++
+       *block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+       bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
+-      if (IS_ERR(bh))
++      if (IS_ERR(bh)) {
++              up(&ei->i_append_sem);
+               return bh;
++      }
+       inode->i_size += inode->i_sb->s_blocksize;
+       EXT4_I(inode)->i_disksize = inode->i_size;
+       BUFFER_TRACE(bh, "get_write_access");
+       err = ext4_journal_get_write_access(handle, bh);
++      up(&ei->i_append_sem);
+       if (err) {
+               brelse(bh);
+               ext4_std_error(inode->i_sb, err);
+@@ -247,7 +255,8 @@ static unsigned dx_node_limit(struct inode *dir);
+ static struct dx_frame *dx_probe(struct ext4_filename *fname,
+                                struct inode *dir,
+                                struct dx_hash_info *hinfo,
+-                               struct dx_frame *frame);
++                               struct dx_frame *frame,
++                               struct htree_lock *lck);
+ static void dx_release(struct dx_frame *frames);
+ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
+                      unsigned blocksize, struct dx_hash_info *hinfo,
+@@ -261,12 +270,13 @@ static void dx_insert_block(struct dx_frame *frame,
+ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
+                                struct dx_frame *frame,
+                                struct dx_frame *frames,
+-                               __u32 *start_hash);
++                               __u32 *start_hash, struct htree_lock *lck);
+ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
+               struct ext4_filename *fname,
+-              struct ext4_dir_entry_2 **res_dir);
++              struct ext4_dir_entry_2 **res_dir, struct htree_lock *lck);
+ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
+-                           struct dentry *dentry, struct inode *inode);
++                           struct dentry *dentry, struct inode *inode,
++                           struct htree_lock *l