Whamcloud - gitweb
LU-4865 zfs: grow block size by write pattern 27/15127/12
author: Jinshan Xiong <jinshan.xiong@intel.com>
Wed, 3 Jun 2015 07:06:19 +0000 (00:06 -0700)
committer: Oleg Drokin <oleg.drokin@intel.com>
Thu, 10 Sep 2015 01:02:45 +0000 (01:02 +0000)
This patch grows the block size by write RPC. The osd-zfs blocksize
used to be fixed at 128KB, which is too big for random write and
too small for sequential write.

This patch decides the block size from the first few RPCs. If the first
few RPCs are sequential, a larger block size will typically be chosen;
otherwise, a smaller block size will be used.

Signed-off-by: Jinshan Xiong <jinshan.xiong@intel.com>
Change-Id: I6cf8fa8eca998e73bbb99f66eb1323c3abcf39f0
Reviewed-on: http://review.whamcloud.com/15127
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/osd-zfs/osd_io.c
lustre/osd-zfs/osd_object.c

index 9b650f5..004cccf 100644 (file)
@@ -720,6 +720,51 @@ retry:
        RETURN(rc);
 }
 
+/**
+ * Policy to grow ZFS block size by write pattern.
+ * For sequential write, it grows block size gradually until it reaches the
+ * maximum blocksize the dataset can support. Otherwise, it will just use
+ * the maximum block size.
+ *
+ * \param[in] obj	object whose block size may be grown
+ * \param[in] oh	transaction handle the blocksize change rides on
+ * \param[in] start	file offset where this write begins
+ * \param[in] end	file offset one past the last byte of this write
+ *
+ * \retval 0		on success, or when no change is possible/needed
+ * \retval negative	errno if dmu_object_set_blocksize() fails
+ */
+static int osd_grow_blocksize(struct osd_object *obj, struct osd_thandle *oh,
+                             uint64_t start, uint64_t end)
+{
+       struct osd_device       *osd = osd_obj2dev(obj);
+       dmu_buf_impl_t          *db = (dmu_buf_impl_t *)obj->oo_db;
+       dnode_t                 *dn;
+       uint32_t                 blksz;
+       int                      rc = 0;
+       ENTRY;
+
+       /* NOTE(review): DB_DNODE_ENTER/EXIT presumably keeps dn valid for
+        * the duration of the access -- confirm against ZFS dbuf.h */
+       DB_DNODE_ENTER(db);
+       dn = DB_DNODE(db);
+
+       /* ZFS only allows changing the block size while the object still
+        * fits in a single block (maxblkid == 0) */
+       if (dn->dn_maxblkid > 0) /* can't change block size */
+               GOTO(out, rc);
+
+       /* already at the dataset's maximum: nothing to grow */
+       blksz = dn->dn_datablksz;
+       if (blksz >= osd->od_max_blksz)
+               GOTO(out, rc);
+
+       /* now ZFS can support up to 16MB block size, and if the write
+        * is sequential, it just increases the block size gradually */
+       if (start <= blksz) { /* sequential */
+               /* write starts within the current (only) block, so grow just
+                * enough to cover [0, end), rounded up to a power of two */
+               blksz = (uint32_t)min_t(uint64_t, osd->od_max_blksz, end);
+               if (!is_power_of_2(blksz))
+                       blksz = size_roundup_power2(blksz);
+       } else { /* otherwise, use maximum block size */
+               blksz = osd->od_max_blksz;
+       }
+
+       /* only issue the DMU call when the size would actually increase;
+        * negate to convert the ZFS errno to Lustre's negative convention */
+       if (blksz > dn->dn_datablksz)
+               rc = -dmu_object_set_blocksize(osd->od_os, dn->dn_object,
+                                              blksz, 0, oh->ot_tx);
+       EXIT;
+out:
+       DB_DNODE_EXIT(db);
+       return rc;
+}
+
 static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                        struct niobuf_local *lnb, int npages,
                        struct thandle *th)
@@ -738,6 +783,14 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
        LASSERT(th != NULL);
        oh = container_of0(th, struct osd_thandle, ot_super);
 
+       /* adjust block size. Assume the buffers are sorted. */
+       rc = osd_grow_blocksize(obj, oh, lnb[0].lnb_file_offset,
+                               lnb[npages - 1].lnb_file_offset +
+                               lnb[npages - 1].lnb_len);
+       if (rc < 0) /* ignore the error */
+               CDEBUG(D_INODE, "obj "DFID": change block size error rc=%d\n",
+                      PFID(lu_object_fid(&dt->do_lu)), rc);
+
        for (i = 0; i < npages; i++) {
                CDEBUG(D_INODE, "write %u bytes at %u\n",
                        (unsigned) lnb[i].lnb_len,
index 9b5e11f..39b31ba 100644 (file)
@@ -1293,16 +1293,11 @@ static dmu_buf_t *osd_mkreg(const struct lu_env *env, struct osd_object *obj,
        if (rc)
                return ERR_PTR(rc);
 
-       /*
-        * XXX: This heuristic is non-optimal.  It would be better to
-        * increase the blocksize up to osd->od_max_blksz during the write.
-        * This is exactly how the ZPL behaves and it ensures that the right
-        * blocksize is selected based on the file size rather than the
-        * making broad assumptions based on the osd type.
-        */
        if (!lu_device_is_md(osd2lu_dev(osd))) {
+               /* uses 4K as default block size because clients write data
+                * with page size that is 4K at minimum */
                rc = -dmu_object_set_blocksize(osd->od_os, db->db_object,
-                                              osd->od_max_blksz, 0, oh->ot_tx);
+                                              4096, 0, oh->ot_tx);
                if (unlikely(rc)) {
                        CERROR("%s: can't change blocksize: %d\n",
                               osd->od_svname, rc);