From 3800d05641158608e5a139336c00a781fa9fb8c7 Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Sun, 30 Apr 2017 21:10:51 -0700 Subject: [PATCH] LU-9305 osd-zfs: arc_buf could be non-pagesize aligned ZFS guarantees PAGE_SIZE alignment to arc_buf_t only when the block size is not less than (PAGE_SIZE << 2). The patch for ZFS https://github.com/zfsonlinux/zfs/pull/6084 fixes the alignment problem, but Lustre still needs a fix to handle the problem in case it's running an old ZFS release. Signed-off-by: Jinshan Xiong Change-Id: I6fd17d7b20499ec0406a3e10cebf6882b92a730f Reviewed-on: https://review.whamcloud.com/26895 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger --- config/lustre-build-zfs.m4 | 16 ++++++++++++++++ lustre/osd-zfs/osd_internal.h | 3 +++ lustre/osd-zfs/osd_io.c | 34 ++++++++++++++++++++++++++++++---- lustre/tests/sanity.sh | 22 ++++++++++++---------- 4 files changed, 61 insertions(+), 14 deletions(-) diff --git a/config/lustre-build-zfs.m4 b/config/lustre-build-zfs.m4 index 49170d2..b92d17f 100644 --- a/config/lustre-build-zfs.m4 +++ b/config/lustre-build-zfs.m4 @@ -406,6 +406,22 @@ your distribution. 
]) ]) + dnl # + dnl # Define zfs source code version + dnl # + AS_IF([test x$enable_zfs = xyes], [ + ZFS_MAJOR=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\1/']) + ZFS_MINOR=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\2/']) + ZFS_PATCH=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\3/']) + ZFS_FIX=$(echo $zfsver | sed -re ['s/([0-9]+)\.([0-9]+)\.([0-9]+)(\.([0-9]+))?.*/\5/']) + AS_IF([test -z "$ZFS_FIX"], [ZFS_FIX="0"]) + + AC_DEFINE_UNQUOTED([ZFS_MAJOR], [$ZFS_MAJOR], [zfs major version]) + AC_DEFINE_UNQUOTED([ZFS_MINOR], [$ZFS_MINOR], [zfs minor version]) + AC_DEFINE_UNQUOTED([ZFS_PATCH], [$ZFS_PATCH], [zfs patch version]) + AC_DEFINE_UNQUOTED([ZFS_FIX], [$ZFS_FIX], [zfs fix version]) + ]) + AS_IF([test "x$enable_zfs" = xyes], [ LB_CHECK_COMPILE([if zfs defines dsl_pool_config_enter/exit], dsl_pool_config_enter, [ diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index 4d9656e..38c7532 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -74,6 +74,9 @@ #undef kmem_cache_free #endif +#define ZFS_VERSION_CODE \ + OBD_OCD_VERSION(ZFS_MAJOR, ZFS_MINOR, ZFS_PATCH, ZFS_FIX) + #define LUSTRE_ROOT_FID_SEQ 0 #define DMU_OSD_SVNAME "svname" #define DMU_OSD_OI_NAME_BASE "oi" diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 94eee7d..a74e58c 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -278,6 +278,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, static inline struct page *kmem_to_page(void *addr) { + LASSERT(!((unsigned long)addr & ~PAGE_MASK)); if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); else @@ -388,6 +389,29 @@ err: RETURN(rc); } +static inline arc_buf_t *osd_request_arcbuf(dnode_t *dn, size_t bs) +{ + arc_buf_t *abuf; + + abuf = dmu_request_arcbuf(&dn->dn_bonus->db, bs); + if (unlikely(!abuf)) + return ERR_PTR(-ENOMEM); + +#if ZFS_VERSION_CODE < 
OBD_OCD_VERSION(0, 7, 0, 0) + /** + * ZFS prior to 0.7.0 doesn't guarantee PAGE_SIZE alignment for zio + * blocks smaller than (PAGE_SIZE << 2). This poses a problem of + * setting up page array for RDMA transfer. See LU-9305. + */ + if ((unsigned long)abuf->b_data & ~PAGE_MASK) { + dmu_return_arcbuf(abuf); + return NULL; + } +#endif + + return abuf; +} + static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj, loff_t off, ssize_t len, struct niobuf_local *lnb) { @@ -409,13 +433,15 @@ static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj, off_in_block = off & (bs - 1); sz_in_block = min_t(int, bs - off_in_block, len); + abuf = NULL; if (sz_in_block == bs) { /* full block, try to use zerocopy */ + abuf = osd_request_arcbuf(dn, bs); + if (unlikely(IS_ERR(abuf))) + GOTO(out_err, rc = PTR_ERR(abuf)); + } - abuf = dmu_request_arcbuf(&dn->dn_bonus->db, bs); - if (unlikely(abuf == NULL)) - GOTO(out_err, rc = -ENOMEM); - + if (abuf != NULL) { atomic_inc(&osd->od_zerocopy_loan); /* go over pages arcbuf contains, put them as diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7e8a091..afb8288 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -15768,7 +15768,7 @@ zfs_oid_to_objid() local objid=$2 local vdevdir=$(dirname $(facet_vdevice $ost)) - local cmd="$ZDB -e -p $vdevdir -dddd $(facet_device $ost)" + local cmd="$ZDB -e -p $vdevdir -ddddd $(facet_device $ost)" local zfs_zapid=$(do_facet $ost $cmd | grep -w "/O/0/d$((objid%32))" -C 5 | awk '/Object/{getline; print $1}') @@ -15805,6 +15805,7 @@ test_312() { # LU-4856 local max_blksz=$(do_facet ost1 \ $ZFS get -p recordsize $(facet_device ost1) | awk '!/VALUE/{print $3}') + local min_blksz=$(getconf PAGE_SIZE) # to make life a little bit easier $LFS mkdir -c 1 -i 0 $DIR/$tdir @@ -15819,7 +15820,7 @@ test_312() { # LU-4856 # block size change by sequential over write local blksz - for ((bs=4096; bs <= max_blksz; bs <<= 2)); do + for 
((bs=$min_blksz; bs <= max_blksz; bs <<= 2)); do dd if=/dev/zero of=$tf bs=$bs count=1 oflag=sync conv=notrunc blksz=$(zfs_object_blksz ost1 $zfs_objid) @@ -15828,18 +15829,18 @@ test_312() { # LU-4856 rm -f $tf # block size change by sequential append write - dd if=/dev/zero of=$tf bs=4K count=1 oflag=sync conv=notrunc + dd if=/dev/zero of=$tf bs=$min_blksz count=1 oflag=sync conv=notrunc oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}') zfs_objid=$(zfs_oid_to_objid ost1 $oid) - for ((count = 1; count < $((max_blksz / 4096)); count *= 2)); do - dd if=/dev/zero of=$tf bs=4K count=$count seek=$count \ + for ((count = 1; count < $((max_blksz / min_blksz)); count *= 2)); do + dd if=/dev/zero of=$tf bs=$min_blksz count=$count seek=$count \ oflag=sync conv=notrunc blksz=$(zfs_object_blksz ost1 $zfs_objid) - blksz=$((blksz / 8192)) # in 2*4K unit - [ $blksz -eq $count ] || - error "blksz error(in 8k): $blksz, expected: $count" + [ $blksz -eq $((2 * count * min_blksz)) ] || + error "blksz error, actual $blksz, " \ + "expected: 2 * $count * $min_blksz" done rm -f $tf @@ -15848,9 +15849,10 @@ test_312() { # LU-4856 oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}') zfs_objid=$(zfs_oid_to_objid ost1 $oid) - dd if=/dev/zero of=$tf bs=8K count=1 oflag=sync conv=notrunc + dd if=/dev/zero of=$tf bs=1K count=1 oflag=sync conv=notrunc blksz=$(zfs_object_blksz ost1 $zfs_objid) - [ $blksz -eq 8192 ] || error "blksz error: $blksz, expected: 8k" + [ $blksz -eq $min_blksz ] || + error "blksz error: $blksz, expected: $min_blksz" dd if=/dev/zero of=$tf bs=64K count=1 oflag=sync conv=notrunc seek=128 blksz=$(zfs_object_blksz ost1 $zfs_objid) -- 1.8.3.1