From 3e4369135127b350dbc26a4a5dc94cfa46e394cf Mon Sep 17 00:00:00 2001 From: Jinshan Xiong Date: Wed, 3 Jun 2015 00:06:19 -0700 Subject: [PATCH] LU-4865 zfs: grow block size by write pattern This patch grows the block size by write RPC. The osd-zfs blocksize used to be fixed at 128KB, which is too big for random write and too small for sequential write. This patch decides the block size by the first few RPCs. If the first few RPCs are sequential, mostly it will use bigger block size; otherwise, smaller block size will be used. Signed-off-by: Jinshan Xiong Change-Id: I6cf8fa8eca998e73bbb99f66eb1323c3abcf39f0 Reviewed-on: http://review.whamcloud.com/15127 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/osd-zfs/osd_io.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ lustre/osd-zfs/osd_object.c | 11 +++------- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 9b650f5..004cccf 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -720,6 +720,51 @@ retry: RETURN(rc); } +/** + * Policy to grow ZFS block size by write pattern. + * For sequential write, it grows block size gradually until it reaches the + * maximum blocksize the dataset can support. Otherwise, it will just use + * the maximum block size. 
+ */ +static int osd_grow_blocksize(struct osd_object *obj, struct osd_thandle *oh, + uint64_t start, uint64_t end) +{ + struct osd_device *osd = osd_obj2dev(obj); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)obj->oo_db; + dnode_t *dn; + uint32_t blksz; + int rc = 0; + ENTRY; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_maxblkid > 0) /* can't change block size (presumably the object already has more than one block - confirm) */ + GOTO(out, rc); + + blksz = dn->dn_datablksz; + if (blksz >= osd->od_max_blksz) + GOTO(out, rc); + + /* now ZFS can support up to 16MB block size, and if the write + * is sequential, it just increases the block size gradually */ + if (start <= blksz) { /* write starts no later than the current block size: treated as sequential */ + blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz, end); + if (!is_power_of_2(blksz)) + blksz = size_roundup_power2(blksz); + } else { /* otherwise, use maximum block size */ + blksz = osd->od_max_blksz; + } + + if (blksz > dn->dn_datablksz) + rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object, + blksz, 0, oh->ot_tx); /* return value is negated; the caller tests rc < 0 */ + EXIT; +out: + DB_DNODE_EXIT(db); + return rc; +} + static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, struct niobuf_local *lnb, int npages, struct thandle *th) @@ -738,6 +783,14 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); + /* adjust block size. Assume the buffers are sorted. 
*/ + rc = osd_grow_blocksize(obj, oh, lnb[0].lnb_file_offset, + lnb[npages - 1].lnb_file_offset + + lnb[npages - 1].lnb_len); + if (rc < 0) /* ignore the error */ + CDEBUG(D_INODE, "obj "DFID": change block size error rc=%d\n", + PFID(lu_object_fid(&dt->do_lu)), rc); + for (i = 0; i < npages; i++) { CDEBUG(D_INODE, "write %u bytes at %u\n", (unsigned) lnb[i].lnb_len, diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c index 9b5e11f..39b31ba 100644 --- a/lustre/osd-zfs/osd_object.c +++ b/lustre/osd-zfs/osd_object.c @@ -1293,16 +1293,11 @@ static dmu_buf_t *osd_mkreg(const struct lu_env *env, struct osd_object *obj, if (rc) return ERR_PTR(rc); - /* - * XXX: This heuristic is non-optimal. It would be better to - * increase the blocksize up to osd->od_max_blksz during the write. - * This is exactly how the ZPL behaves and it ensures that the right - * blocksize is selected based on the file size rather than the - * making broad assumptions based on the osd type. - */ if (!lu_device_is_md(osd2lu_dev(osd))) { + /* uses 4K as default block size because clients write data + * with page size that is 4K at minimum */ rc = -dmu_object_set_blocksize(osd->od_os, db->db_object, - osd->od_max_blksz, 0, oh->ot_tx); + 4096, 0, oh->ot_tx); if (unlikely(rc)) { CERROR("%s: can't change blocksize: %d\n", osd->od_svname, rc); -- 1.8.3.1