Whamcloud - gitweb
LU-9305 osd-zfs: arc_buf could be non-pagesize aligned 95/26895/8
authorJinshan Xiong <jinshan.xiong@intel.com>
Mon, 1 May 2017 04:10:51 +0000 (21:10 -0700)
committerOleg Drokin <oleg.drokin@intel.com>
Tue, 16 May 2017 05:45:30 +0000 (05:45 +0000)
ZFS guarantees PAGE_SIZE alignment for an arc_buf_t only when
the block size is not less than (PAGE_SIZE << 2).

The patch for ZFS https://github.com/zfsonlinux/zfs/pull/6084 fixes
the alignment problem, but Lustre still needs a fix to handle
the problem in case it is running on an old ZFS release.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: I6fd17d7b20499ec0406a3e10cebf6882b92a730f
Reviewed-on: https://review.whamcloud.com/26895
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
config/lustre-build-zfs.m4
lustre/osd-zfs/osd_internal.h
lustre/osd-zfs/osd_io.c
lustre/tests/sanity.sh

index 49170d2..b92d17f 100644 (file)
@@ -406,6 +406,22 @@ your distribution.
                ])
        ])
 
+       dnl #
+       dnl # Define zfs source code version
+       dnl #
+       AS_IF([test x$enable_zfs = xyes], [
+               ZFS_MAJOR=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\1/'])
+               ZFS_MINOR=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\2/'])
+               ZFS_PATCH=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\3/'])
+               ZFS_FIX=$(echo $zfsver   | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\5/'])
+               AS_IF([test -z "$ZFS_FIX"], [ZFS_FIX="0"])
+
+               AC_DEFINE_UNQUOTED([ZFS_MAJOR], [$ZFS_MAJOR], [zfs major version])
+               AC_DEFINE_UNQUOTED([ZFS_MINOR], [$ZFS_MINOR], [zfs minor version])
+               AC_DEFINE_UNQUOTED([ZFS_PATCH], [$ZFS_PATCH], [zfs patch version])
+               AC_DEFINE_UNQUOTED([ZFS_FIX],   [$ZFS_FIX],   [zfs fix version])
+       ])
+
        AS_IF([test "x$enable_zfs" = xyes], [
                LB_CHECK_COMPILE([if zfs defines dsl_pool_config_enter/exit],
                dsl_pool_config_enter, [
index 4d9656e..38c7532 100644 (file)
@@ -74,6 +74,9 @@
 #undef kmem_cache_free
 #endif
 
+#define ZFS_VERSION_CODE       \
+       OBD_OCD_VERSION(ZFS_MAJOR, ZFS_MINOR, ZFS_PATCH, ZFS_FIX)
+
 #define LUSTRE_ROOT_FID_SEQ    0
 #define DMU_OSD_SVNAME         "svname"
 #define DMU_OSD_OI_NAME_BASE   "oi"
index 94eee7d..a74e58c 100644 (file)
@@ -278,6 +278,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
 
 static inline struct page *kmem_to_page(void *addr)
 {
+       LASSERT(!((unsigned long)addr & ~PAGE_MASK));
        if (is_vmalloc_addr(addr))
                return vmalloc_to_page(addr);
        else
@@ -388,6 +389,29 @@ err:
        RETURN(rc);
 }
 
+static inline arc_buf_t *osd_request_arcbuf(dnode_t *dn, size_t bs)
+{
+       arc_buf_t *abuf;
+
+       abuf = dmu_request_arcbuf(&dn->dn_bonus->db, bs);
+       if (unlikely(!abuf))
+               return ERR_PTR(-ENOMEM);
+
+#if ZFS_VERSION_CODE < OBD_OCD_VERSION(0, 7, 0, 0)
+       /**
+        * ZFS prior to 0.7.0 doesn't guarantee PAGE_SIZE alignment for zio
+        * blocks smaller than (PAGE_SIZE << 2). This poses a problem of
+        * setting up page array for RDMA transfer. See LU-9305.
+        */
+       if ((unsigned long)abuf->b_data & ~PAGE_MASK) {
+               dmu_return_arcbuf(abuf);
+               return NULL;
+       }
+#endif
+
+       return abuf;
+}
+
 static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj,
                                loff_t off, ssize_t len, struct niobuf_local *lnb)
 {
@@ -409,13 +433,15 @@ static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj,
                off_in_block = off & (bs - 1);
                sz_in_block = min_t(int, bs - off_in_block, len);
 
+               abuf = NULL;
                if (sz_in_block == bs) {
                        /* full block, try to use zerocopy */
+                       abuf = osd_request_arcbuf(dn, bs);
+                       if (unlikely(IS_ERR(abuf)))
+                               GOTO(out_err, rc = PTR_ERR(abuf));
+               }
 
-                       abuf = dmu_request_arcbuf(&dn->dn_bonus->db, bs);
-                       if (unlikely(abuf == NULL))
-                               GOTO(out_err, rc = -ENOMEM);
-
+               if (abuf != NULL) {
                        atomic_inc(&osd->od_zerocopy_loan);
 
                        /* go over pages arcbuf contains, put them as
index 7e8a091..afb8288 100755 (executable)
@@ -15768,7 +15768,7 @@ zfs_oid_to_objid()
        local objid=$2
 
        local vdevdir=$(dirname $(facet_vdevice $ost))
-       local cmd="$ZDB -e -p $vdevdir -dddd $(facet_device $ost)"
+       local cmd="$ZDB -e -p $vdevdir -ddddd $(facet_device $ost)"
        local zfs_zapid=$(do_facet $ost $cmd |
                          grep -w "/O/0/d$((objid%32))" -C 5 |
                          awk '/Object/{getline; print $1}')
@@ -15805,6 +15805,7 @@ test_312() { # LU-4856
        local max_blksz=$(do_facet ost1 \
                          $ZFS get -p recordsize $(facet_device ost1) |
                          awk '!/VALUE/{print $3}')
+       local min_blksz=$(getconf PAGE_SIZE)
 
        # to make life a little bit easier
        $LFS mkdir -c 1 -i 0 $DIR/$tdir
@@ -15819,7 +15820,7 @@ test_312() { # LU-4856
 
        # block size change by sequential over write
        local blksz
-       for ((bs=4096; bs <= max_blksz; bs <<= 2)); do
+       for ((bs=$min_blksz; bs <= max_blksz; bs <<= 2)); do
                dd if=/dev/zero of=$tf bs=$bs count=1 oflag=sync conv=notrunc
 
                blksz=$(zfs_object_blksz ost1 $zfs_objid)
@@ -15828,18 +15829,18 @@ test_312() { # LU-4856
        rm -f $tf
 
        # block size change by sequential append write
-       dd if=/dev/zero of=$tf bs=4K count=1 oflag=sync conv=notrunc
+       dd if=/dev/zero of=$tf bs=$min_blksz count=1 oflag=sync conv=notrunc
        oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}')
        zfs_objid=$(zfs_oid_to_objid ost1 $oid)
 
-       for ((count = 1; count < $((max_blksz / 4096)); count *= 2)); do
-               dd if=/dev/zero of=$tf bs=4K count=$count seek=$count \
+       for ((count = 1; count < $((max_blksz / min_blksz)); count *= 2)); do
+               dd if=/dev/zero of=$tf bs=$min_blksz count=$count seek=$count \
                        oflag=sync conv=notrunc
 
                blksz=$(zfs_object_blksz ost1 $zfs_objid)
-               blksz=$((blksz / 8192)) # in 2*4K unit
-               [ $blksz -eq $count ] ||
-                       error "blksz error(in 8k): $blksz, expected: $count"
+               [ $blksz -eq $((2 * count * min_blksz)) ] ||
+                       error "blksz error, actual $blksz, "    \
+                               "expected: 2 * $count * $min_blksz"
        done
        rm -f $tf
 
@@ -15848,9 +15849,10 @@ test_312() { # LU-4856
        oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}')
        zfs_objid=$(zfs_oid_to_objid ost1 $oid)
 
-       dd if=/dev/zero of=$tf bs=8K count=1 oflag=sync conv=notrunc
+       dd if=/dev/zero of=$tf bs=1K count=1 oflag=sync conv=notrunc
        blksz=$(zfs_object_blksz ost1 $zfs_objid)
-       [ $blksz -eq 8192 ] || error "blksz error: $blksz, expected: 8k"
+       [ $blksz -eq $min_blksz ] ||
+               error "blksz error: $blksz, expected: $min_blksz"
 
        dd if=/dev/zero of=$tf bs=64K count=1 oflag=sync conv=notrunc seek=128
        blksz=$(zfs_object_blksz ost1 $zfs_objid)