From b738c4850935f3a9c483b3141cb37d6539557615 Mon Sep 17 00:00:00 2001
From: Jinshan Xiong <jinshan.xiong@intel.com>
Date: Thu, 11 Aug 2016 21:20:44 -0700
Subject: [PATCH] LU-4865 zfs: grow block size by write pattern

This patch grows the block size by write RPC. The osd-zfs blocksize
used to be fixed at 128KB, which is too big for random write and
too small for sequential write.

This patch decides the block size by the first few RPCs. If the first
few RPCs are sequential, it will most likely pick the maximum block
size for the object; otherwise, a feasible block size will be picked
based on the RPC size.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: I66f7cbdc2b5e0365058b152b4865b00cdabb0cf3
Reviewed-on: http://review.whamcloud.com/18441
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Don Brady <don.brady@intel.com>
---
 lustre/osd-zfs/osd_handler.c  |  2 +-
 lustre/osd-zfs/osd_internal.h |  2 +-
 lustre/osd-zfs/osd_io.c       | 66 +++++++++++++++++++++++++++++
 lustre/osd-zfs/osd_object.c   | 13 +++---
 lustre/tests/sanity.sh        | 98 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 171 insertions(+), 10 deletions(-)

diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c
index 5d1ae30..7e5c3dc 100644
--- a/lustre/osd-zfs/osd_handler.c
+++ b/lustre/osd-zfs/osd_handler.c
@@ -977,7 +977,7 @@ static int osd_mount(const struct lu_env *env,
 		RETURN(rc);
 
 	o->od_xattr_in_sa = B_TRUE;
-	o->od_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+	o->od_max_blksz = osd_spa_maxblocksize(o->od_os->os_spa);
 
 	rc = osd_objset_register_callbacks(o);
 	if (rc)
diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h
index 20b1171..0706ea1 100644
--- a/lustre/osd-zfs/osd_internal.h
+++ b/lustre/osd-zfs/osd_internal.h
@@ -329,7 +329,7 @@ struct osd_object {
 	struct rw_semaphore	 oo_sem;
 
 	/* to serialize some updates: destroy vs. others,
-	 * xattr_set, etc */
+	 * xattr_set, object block size change etc */
 	struct rw_semaphore	 oo_guard;
 
 	/* protected by oo_guard */
diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c
index a6b6cea..eed35f5 100644
--- a/lustre/osd-zfs/osd_io.c
+++ b/lustre/osd-zfs/osd_io.c
@@ -682,6 +682,68 @@ retry:
 	RETURN(rc);
 }
 
+/**
+ * Grow the ZFS block size of \a obj by write pattern; [\a start, \a end) is
+ * the region being written. A write starting at or below the current block
+ * size is sequential and is sized by \a end, otherwise by \a end - \a start,
+ * capped at od_max_blksz. Returns 0 or the negated DMU error on failure.
+ */
+static int osd_grow_blocksize(struct osd_object *obj, struct osd_thandle *oh,
+			      uint64_t start, uint64_t end)
+{
+	struct osd_device	*osd = osd_obj2dev(obj);
+	dmu_buf_impl_t		*db = (dmu_buf_impl_t *)obj->oo_db;
+	dnode_t			*dn;
+	uint32_t		 blksz;
+	int			 rc = 0;
+
+	ENTRY;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (dn->dn_maxblkid > 0) /* can't change block size */
+		GOTO(out, rc);
+
+	if (dn->dn_datablksz >= osd->od_max_blksz) /* unlocked fast path */
+		GOTO(out, rc);
+
+	down_write(&obj->oo_guard);
+
+	blksz = dn->dn_datablksz;
+	if (blksz >= osd->od_max_blksz) /* check again after grabbing lock */
+		GOTO(out_unlock, rc);
+
+	/* now ZFS can support up to 16MB block size, and if the write
+	 * is sequential, it just increases the block size gradually */
+	if (start <= blksz) { /* sequential */
+		blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz, end);
+	} else { /* sparse, pick a block size by write region */
+		blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz,
+					end - start);
+	}
+
+	if (!is_power_of_2(blksz))
+		blksz = size_roundup_power2(blksz);
+
+	if (blksz > dn->dn_datablksz) { /* only ever grow, never shrink */
+		rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object,
+					       blksz, 0, oh->ot_tx);
+		LASSERT(ergo(rc == 0, dn->dn_datablksz >= blksz));
+		if (rc < 0)
+			CDEBUG(D_INODE, "object "DFID": change block size "
+			       "%u -> %u error rc = %d\n",
+			       PFID(lu_object_fid(&obj->oo_dt.do_lu)),
+			       dn->dn_datablksz, blksz, rc);
+	}
+	EXIT;
+out_unlock:
+	up_write(&obj->oo_guard);
+out:
+	DB_DNODE_EXIT(db);
+	return rc;
+}
+
 static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
 			struct niobuf_local *lnb, int npages,
 			struct thandle *th)
@@ -700,6 +762,10 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
 	LASSERT(th != NULL);
 	oh = container_of0(th, struct osd_thandle, ot_super);
 
+	/* adjust block size. Assume the buffers are sorted. */
+	(void)osd_grow_blocksize(obj, oh, lnb[0].lnb_file_offset,
+				 lnb[npages - 1].lnb_file_offset +
+				 lnb[npages - 1].lnb_len);
 	for (i = 0; i < npages; i++) {
 		CDEBUG(D_INODE, "write %u bytes at %u\n",
 			(unsigned) lnb[i].lnb_len,
diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c
index a1d3ab7..bd1fac2 100644
--- a/lustre/osd-zfs/osd_object.c
+++ b/lustre/osd-zfs/osd_object.c
@@ -1384,16 +1384,13 @@ static dmu_buf_t *osd_mkreg(const struct lu_env *env, struct osd_object *obj,
 	if (rc)
 		return ERR_PTR(rc);
 
-	/*
-	 * XXX: This heuristic is non-optimal.  It would be better to
-	 * increase the blocksize up to osd->od_max_blksz during the write.
-	 * This is exactly how the ZPL behaves and it ensures that the right
-	 * blocksize is selected based on the file size rather than the
-	 * making broad assumptions based on the osd type.
-	 */
 	if ((fid_is_idif(fid) || fid_is_norm(fid)) && osd->od_is_ost) {
+		/* The minimum block size must be at least page size otherwise
+		 * it will break the assumption in tgt_thread_big_cache where
+		 * the array size is PTLRPC_MAX_BRW_PAGES. It will also affect
+		 * RDMA due to subpage transfer size */
 		rc = -dmu_object_set_blocksize(osd->od_os, db->db_object,
-					       osd->od_max_blksz, 0, oh->ot_tx);
+					       PAGE_SIZE, 0, oh->ot_tx);
 		if (unlikely(rc)) {
 			CERROR("%s: can't change blocksize: %d\n",
 			       osd->od_svname, rc);
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 97f561b..085a3f4 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -14997,6 +14997,104 @@ test_311() {
 }
 run_test 311 "disable OSP precreate, and unlink should destroy objs"
 
+zfs_oid_to_objid()
+{
+	local ost=$1	# OST facet name
+	local objid=$2	# object id, as printed by "lfs getstripe"
+	# locate the zap for /O/0/d<objid%32> with zdb, then read objid's entry
+	local vdevdir=$(dirname $(facet_vdevice $ost))
+	local cmd="$ZDB -e -p $vdevdir -dddd $(facet_device $ost)"
+	local zfs_zapid=$(do_facet $ost $cmd |
+			  grep -w "/O/0/d$((objid%32))" -C 5 |
+			  awk '/Object/{getline; print $1}')
+	local zfs_objid=$(do_facet $ost $cmd $zfs_zapid |
+			  awk "/$objid = /"'{printf $3}')
+
+	echo $zfs_objid
+}
+
+zfs_object_blksz() {
+	local ost=$1	# OST facet name
+	local objid=$2	# object id handed to zdb
+	# field 4 of the line following the "dblk" match in zdb output
+	local vdevdir=$(dirname $(facet_vdevice $ost))
+	local cmd="$ZDB -e -p $vdevdir -dddd $(facet_device $ost)"
+	local blksz=$(do_facet $ost $cmd $objid |
+		      awk '/dblk/{getline; printf $4}')
+	# expand a trailing K or M suffix into bytes
+	case "${blksz: -1}" in
+		k|K) blksz=$((${blksz:0:$((${#blksz} - 1))}*1024)) ;;
+		m|M) blksz=$((${blksz:0:$((${#blksz} - 1))}*1024*1024)) ;;
+		*) ;;
+	esac
+
+	echo $blksz
+}
+
+test_312() { # LU-4865
+	[ $(facet_fstype ost1) = "zfs" ] ||
+		{ skip "the test only applies to zfs" && return; }
+
+	local max_blksz=$(do_facet ost1 \
+			  $ZFS get -p recordsize $(facet_device ost1) |
+			  awk '!/VALUE/{print $3}')
+
+	# to make life a little bit easier
+	$LFS mkdir -c 1 -i 0 $DIR/$tdir
+	$LFS setstripe -c 1 -i 0 $DIR/$tdir
+
+	local tf=$DIR/$tdir/$tfile
+	touch $tf
+	local oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}')
+
+	# Get ZFS object id
+	local zfs_objid=$(zfs_oid_to_objid ost1 $oid)
+
+	# sequential overwrite from offset 0: block size follows the write size
+	local blksz bs count
+	for ((bs=4096; bs <= max_blksz; bs <<= 2)); do
+		dd if=/dev/zero of=$tf bs=$bs count=1 oflag=sync conv=notrunc
+
+		blksz=$(zfs_object_blksz ost1 $zfs_objid)
+		[ $blksz -eq $bs ] || error "blksz error: $blksz, expected: $bs"
+	done
+	rm -f $tf
+
+	# sequential append: block size follows the end offset of each write
+	dd if=/dev/zero of=$tf bs=4K count=1 oflag=sync conv=notrunc
+	oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}')
+	zfs_objid=$(zfs_oid_to_objid ost1 $oid)
+
+	for ((count = 1; count < $((max_blksz / 4096)); count *= 2)); do
+		dd if=/dev/zero of=$tf bs=4K count=$count seek=$count \
+			oflag=sync conv=notrunc
+
+		blksz=$(zfs_object_blksz ost1 $zfs_objid)
+		blksz=$((blksz / 8192)) # in 2*4K unit
+		[ $blksz -eq $count ] ||
+			error "blksz error(in 8k): $blksz, expected: $count"
+	done
+	rm -f $tf
+
+	# sparse write picks the size from the write region; rewrites keep it
+	touch $tf
+	oid=$($LFS getstripe $tf | awk '/obdidx/{getline; print $2}')
+	zfs_objid=$(zfs_oid_to_objid ost1 $oid)
+
+	dd if=/dev/zero of=$tf bs=8K count=1 oflag=sync conv=notrunc
+	blksz=$(zfs_object_blksz ost1 $zfs_objid)
+	[ $blksz -eq 8192 ] || error "blksz error: $blksz, expected: 8k"
+
+	dd if=/dev/zero of=$tf bs=64K count=1 oflag=sync conv=notrunc seek=128
+	blksz=$(zfs_object_blksz ost1 $zfs_objid)
+	[ $blksz -eq 65536 ] || error "blksz error: $blksz, expected: 64k"
+
+	dd if=/dev/zero of=$tf bs=1M count=1 oflag=sync conv=notrunc
+	blksz=$(zfs_object_blksz ost1 $zfs_objid)
+	[ $blksz -eq 65536 ] || error "rewrite error: $blksz, expected: 64k"
+}
+run_test 312 "make sure ZFS adjusts its block size by write pattern"
+
 test_400a() { # LU-1606, was conf-sanity test_74
 	local extra_flags=''
 	local out=$TMP/$tfile
-- 
1.8.3.1