Whamcloud - gitweb
LU-14918 osd: don't declare similar zfs writes twice 01/49701/7
authorAlex Zhuravlev <bzzz@whamcloud.com>
Thu, 19 Jan 2023 19:06:36 +0000 (22:06 +0300)
committerOleg Drokin <green@whamcloud.com>
Tue, 14 Feb 2023 06:02:55 +0000 (06:02 +0000)
In some cases (like overstriping) the same operations can be
declared multiple times (new llog records), and this leads to a
huge number of credits and performance degradation. We can
avoid this by checking for duplicate declarations.
Note that each declare operation results in an allocation in ZFS.

Example for an overstriped file (2000 stripes over 4 OSTs),
number of declare ops before and after this change:
  create: 2001 before, 2 after
  unlink: 10001 before, 10 after

creation of 1K-stripe files (over 4 OSTs) is 2.5% faster.
removal of 1K-stripe files is 44% faster.

single-stripe file creation/removal does not degrade.

Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Change-Id: I5d9e6d3a1574ccd7bf97fd3a67ab4fff0b6a352c
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49701
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Arshad Hussain <arshad.hussain@aeoncomputing.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/obdclass/llog_osd.c
lustre/osd-zfs/osd_io.c

index 92f47a8..a246731 100644 (file)
@@ -339,16 +339,15 @@ static int llog_osd_declare_write_rec(const struct lu_env *env,
        lgi->lgi_buf.lb_len = chunk_size;
        lgi->lgi_buf.lb_buf = NULL;
        /* each time we update header */
-       rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0,
-                                    th);
+       rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, th);
        if (rc || idx == 0) /* if error or just header */
                RETURN(rc);
 
        /**
         * the pad record can be inserted so take into account double
-        * record size
+        * record size: pad and the actual record into a new block
         */
-       lgi->lgi_buf.lb_len = chunk_size * 2;
+       lgi->lgi_buf.lb_len = rec->lrh_len * 2;
        lgi->lgi_buf.lb_buf = NULL;
        /* XXX: implement declared window or multi-chunks approach */
        rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th);
index 018911b..5e7708c 100644 (file)
@@ -170,6 +170,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
 {
        struct osd_object  *obj  = osd_dt_obj(dt);
        struct osd_device  *osd = osd_obj2dev(obj);
+       loff_t _pos = pos, max = 0;
        struct osd_thandle *oh;
        uint64_t            oid;
        ENTRY;
@@ -190,9 +191,44 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
        /* XXX: we still miss for append declaration support in ZFS
         *      -1 means append which is used by llog mostly, llog
         *      can grow upto LLOG_MIN_CHUNK_SIZE*8 records */
+       max = max_t(loff_t, 256 * 8 * LLOG_MIN_CHUNK_SIZE,
+                   obj->oo_attr.la_size + (2 << 20));
        if (pos == -1)
-               pos = max_t(loff_t, 256 * 8 * LLOG_MIN_CHUNK_SIZE,
-                           obj->oo_attr.la_size + (2 << 20));
+               pos = max;
+       if (obj->oo_dn) {
+               loff_t tstart, tend, end = pos + buf->lb_len;
+               dmu_tx_hold_t *txh;
+
+               /* try to find a close declared window to fit/extend */
+               for (txh = list_head(&oh->ot_tx->tx_holds); txh != NULL;
+                   txh = list_next(&oh->ot_tx->tx_holds, txh)) {
+                       if (obj->oo_dn != txh->txh_dnode)
+                               continue;
+                       if (txh->txh_type != THT_WRITE)
+                               continue;
+
+                       /* bytes already declared in this handle */
+                       tstart = txh->txh_arg1;
+                       tend = txh->txh_arg1 + txh->txh_arg2;
+
+                       if (pos < tstart)
+                               tstart = pos;
+                       if (tend < end)
+                               tend = end;
+                       /* if this is an append, then extend it */
+                       if (_pos == -1 && txh->txh_arg1 == max)
+                               tend += buf->lb_len;
+                       /* don't let too big appends */
+                       if (tend - tstart > 4*1024*1024)
+                               continue;
+                       if (pos >= tend || end <= tstart)
+                               continue;
+
+                       txh->txh_arg1 = tstart;
+                       txh->txh_arg2 = tend - tstart;
+                       return 0;
+               }
+       }
        osd_tx_hold_write(oh->ot_tx, oid, obj->oo_dn, pos, buf->lb_len);
 
        /* dt_declare_write() is usually called for system objects, such