Whamcloud - gitweb
LU-4611 osd: improve credits calculation 58/9258/15
author Alex Zhuravlev <alexey.zhuravlev@intel.com>
Thu, 13 Feb 2014 19:55:43 +0000 (23:55 +0400)
committer Oleg Drokin <oleg.drokin@intel.com>
Tue, 25 Mar 2014 14:12:31 +0000 (14:12 +0000)
- llog catalog does not declare records twice (for old and new objects);
  this might be an issue on ZFS with full debug enabled
- llog to specify append by pos=-1, so OSD can take this into account
- object create/destroy should not include OI credits; these are already counted separately
- EA declaration improved for a few specific cases
- osd_declare_write() to recognize overwrite optimistically,
  using inode size and i_blocks
- osd_declare_write() to optimize very specific cases, like legacy
  blockmaps with small offsets and allocated indirects
- index delete modifies just a single block

preliminary testing on a local setup with 7 OSTs:
1360 credits before and 436 credits after.

llog declarations and index inserts (part of llog object creation)
still consume about 70% (7 OSTs):
   create: 7/28, destroy: 1/4
   attr_set: 2/2, xattr_set: 8/21
   write: 37/179, punch: 14/56, quota 2/2
   insert: 8/135, delete: 2/5
   ref_add: 1/1, ref_del: 3/3

Signed-off-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Change-Id: Icb5c79df1f8ba248509b9d2561ac8843bb01f6af
Reviewed-on: http://review.whamcloud.com/9258
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Niu Yawei <yawei.niu@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/obdclass/llog_cat.c
lustre/obdclass/llog_internal.h
lustre/obdclass/llog_osd.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/osd-ldiskfs/osd_quota.c
lustre/osd-zfs/osd_io.c

index fc7cd13..4ec7794 100644 (file)
@@ -66,9 +66,9 @@ static int llog_cat_new_log(const struct lu_env *env,
                            struct llog_handle *loghandle,
                            struct thandle *th)
 {
-
+       struct llog_thread_info *lgi = llog_info(env);
+       struct llog_logid_rec *rec = &lgi->lgi_logid;
         struct llog_log_hdr *llh;
-        struct llog_logid_rec rec = { { 0 }, };
         int rc, index, bitmap_size;
         ENTRY;
 
@@ -122,16 +122,16 @@ static int llog_cat_new_log(const struct lu_env *env,
               DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
               loghandle->lgh_id.lgl_ogen, index,
               POSTID(&cathandle->lgh_id.lgl_oi));
-        /* build the record for this log in the catalog */
-        rec.lid_hdr.lrh_len = sizeof(rec);
-        rec.lid_hdr.lrh_index = index;
-        rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
-        rec.lid_id = loghandle->lgh_id;
-        rec.lid_tail.lrt_len = sizeof(rec);
-        rec.lid_tail.lrt_index = index;
+       /* build the record for this log in the catalog */
+       rec->lid_hdr.lrh_len = sizeof(*rec);
+       rec->lid_hdr.lrh_index = index;
+       rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+       rec->lid_id = loghandle->lgh_id;
+       rec->lid_tail.lrt_len = sizeof(*rec);
+       rec->lid_tail.lrt_index = index;
 
         /* update the catalog: header and record */
-       rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+       rc = llog_write_rec(env, cathandle, &rec->lid_hdr,
                            &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
        if (rc < 0)
                GOTO(out_destroy, rc);
@@ -383,6 +383,8 @@ int llog_cat_declare_add_rec(const struct lu_env *env,
                             struct llog_handle *cathandle,
                             struct llog_rec_hdr *rec, struct thandle *th)
 {
+       struct llog_thread_info *lgi = llog_info(env);
+       struct llog_logid_rec   *lirec = &lgi->lgi_logid;
        struct llog_handle      *loghandle, *next;
        int                      rc = 0;
 
@@ -418,12 +420,14 @@ int llog_cat_declare_add_rec(const struct lu_env *env,
        if (rc)
                GOTO(out, rc);
 
+       lirec->lid_hdr.lrh_len = sizeof(*lirec);
+
        if (!llog_exist(cathandle->u.chd.chd_current_log)) {
                rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
                                         th);
                if (rc)
                        GOTO(out, rc);
-               llog_declare_write_rec(env, cathandle, NULL, -1, th);
+               llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, th);
        }
        /* declare records in the llogs */
        rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
@@ -435,9 +439,14 @@ int llog_cat_declare_add_rec(const struct lu_env *env,
        if (next) {
                if (!llog_exist(next)) {
                        rc = llog_declare_create(env, next, th);
-                       llog_declare_write_rec(env, cathandle, NULL, -1, th);
+                       llog_declare_write_rec(env, cathandle, &lirec->lid_hdr,
+                                              -1, th);
                }
-               llog_declare_write_rec(env, next, rec, -1, th);
+               /* XXX: we hope for declarations made for existing llog
+                *      this might be not correct with some backends
+                *      where declarations are expected against specific
+                *      object like ZFS with full debugging enabled */
+               /*llog_declare_write_rec(env, next, rec, -1, th);*/
        }
 out:
        RETURN(rc);
index 0752141..f90be39 100644 (file)
@@ -58,6 +58,7 @@ struct llog_thread_info {
        loff_t                           lgi_off;
        struct llog_rec_hdr              lgi_lrh;
        struct llog_rec_tail             lgi_tail;
+       struct llog_logid_rec            lgi_logid;
 };
 
 extern struct lu_context_key llog_thread_key;
index 9bff08e..fd9bfc3 100644 (file)
@@ -283,6 +283,8 @@ static int llog_osd_declare_write_rec(const struct lu_env *env,
        LASSERT(env);
        LASSERT(th);
        LASSERT(loghandle);
+       LASSERT(rec);
+       LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
 
        o = loghandle->lgh_obj;
        LASSERT(o);
@@ -309,10 +311,10 @@ static int llog_osd_declare_write_rec(const struct lu_env *env,
                lgi->lgi_off = 0;
        }
 
-       lgi->lgi_buf.lb_len = 32 * 1024;
+       lgi->lgi_buf.lb_len = rec->lrh_len;
        lgi->lgi_buf.lb_buf = NULL;
        /* XXX: implement declared window or multi-chunks approach */
-       rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th);
+       rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th);
 
        RETURN(rc);
 }
@@ -913,11 +915,8 @@ static int llog_osd_declare_create(const struct lu_env *env,
        if (rc)
                RETURN(rc);
 
-       lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
-       lgi->lgi_buf.lb_buf = NULL;
-       rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, th);
-       if (rc)
-               RETURN(rc);
+       /* do not declare header initialization here as it's declared
+        * in llog_osd_declare_write_rec() which is always called */
 
        if (res->lgh_name) {
                struct dt_object *llog_dir;
index c6e090d..75fc855 100644 (file)
@@ -1001,11 +1001,9 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                      LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
                      oh->ot_credits,
                      osd_journal(dev)->j_max_transaction_buffers);
-               CWARN("  create: %u/%u, delete: %u/%u, destroy: %u/%u\n",
+               CWARN("  create: %u/%u, destroy: %u/%u\n",
                      oti->oti_declare_ops[OSD_OT_CREATE],
                      oti->oti_declare_ops_cred[OSD_OT_CREATE],
-                     oti->oti_declare_ops[OSD_OT_DELETE],
-                     oti->oti_declare_ops_cred[OSD_OT_DELETE],
                      oti->oti_declare_ops[OSD_OT_DESTROY],
                      oti->oti_declare_ops_cred[OSD_OT_DESTROY]);
                CWARN("  attr_set: %u/%u, xattr_set: %u/%u\n",
@@ -1023,8 +1021,8 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                CWARN("  insert: %u/%u, delete: %u/%u\n",
                      oti->oti_declare_ops[OSD_OT_INSERT],
                      oti->oti_declare_ops_cred[OSD_OT_INSERT],
-                     oti->oti_declare_ops[OSD_OT_DESTROY],
-                     oti->oti_declare_ops_cred[OSD_OT_DESTROY]);
+                     oti->oti_declare_ops[OSD_OT_DELETE],
+                     oti->oti_declare_ops_cred[OSD_OT_DELETE]);
                CWARN("  ref_add: %u/%u, ref_del: %u/%u\n",
                      oti->oti_declare_ops[OSD_OT_REF_ADD],
                      oti->oti_declare_ops_cred[OSD_OT_REF_ADD],
@@ -1430,55 +1428,57 @@ static int osd_init_capa_ctxt(const struct lu_env *env, struct dt_device *d,
  * If we mount with --data_journal we may need more.
  */
 const int osd_dto_credits_noquota[DTO_NR] = {
-        /**
-         * Insert/Delete.
-         * INDEX_EXTRA_TRANS_BLOCKS(8) +
-         * SINGLEDATA_TRANS_BLOCKS(8)
-         * XXX Note: maybe iam need more, since iam have more level than
-         *           EXT3 htree.
-         */
-        [DTO_INDEX_INSERT]  = 16,
-        [DTO_INDEX_DELETE]  = 16,
-        /**
+       /**
+        * Insert.
+        * INDEX_EXTRA_TRANS_BLOCKS(8) +
+        * SINGLEDATA_TRANS_BLOCKS(8)
+        * XXX Note: IAM may need more, since it has more levels than the
+        *           EXT3 htree.
+        */
+       [DTO_INDEX_INSERT]  = 16,
+       /**
+        * Delete.
+        * Modifies a single block: drops one entry, possibly merging a few.
+        */
+       [DTO_INDEX_DELETE]  = 1,
+       /**
         * Used for OI scrub
-         */
-        [DTO_INDEX_UPDATE]  = 16,
-        /**
-         * Create a object. The same as create object in EXT3.
-         * DATA_TRANS_BLOCKS(14) +
-         * INDEX_EXTRA_BLOCKS(8) +
-         * 3(inode bits, groups, GDT)
-         */
-        [DTO_OBJECT_CREATE] = 25,
-        /**
-         * XXX: real credits to be fixed
-         */
-        [DTO_OBJECT_DELETE] = 25,
-        /**
-         * Attr set credits (inode)
-         */
-        [DTO_ATTR_SET_BASE] = 1,
-        /**
-         * Xattr set. The same as xattr of EXT3.
-         * DATA_TRANS_BLOCKS(14)
-         * XXX Note: in original MDS implmentation INDEX_EXTRA_TRANS_BLOCKS
-         * are also counted in. Do not know why?
-         */
-        [DTO_XATTR_SET]     = 14,
-        [DTO_LOG_REC]       = 14,
-        /**
-         * credits for inode change during write.
-         */
-        [DTO_WRITE_BASE]    = 3,
-        /**
-         * credits for single block write.
-         */
-        [DTO_WRITE_BLOCK]   = 14,
-        /**
-         * Attr set credits for chown.
-         * This is extra credits for setattr, and it is null without quota
-         */
-        [DTO_ATTR_SET_CHOWN]= 0
+        */
+       [DTO_INDEX_UPDATE]  = 16,
+       /**
+        * Object create: 4 (inode, inode bits, groups, GDT)
+        *   note: OI updates are counted separately with DTO_INDEX_INSERT
+        */
+       [DTO_OBJECT_CREATE] = 4,
+       /**
+        * Object delete: 4 (inode, inode bits, groups, GDT)
+        *   note: OI updates are counted separately with DTO_INDEX_DELETE
+        */
+       [DTO_OBJECT_DELETE] = 4,
+       /**
+        * Attr set credits (inode)
+        */
+       [DTO_ATTR_SET_BASE] = 1,
+       /**
+        * Xattr set. The same as xattr of EXT3.
+        * DATA_TRANS_BLOCKS(14)
+        * XXX Note: in the original MDS implementation INDEX_EXTRA_TRANS_BLOCKS
+        * were also counted in; the reason is not known.
+        */
+       [DTO_XATTR_SET]     = 14,
+       /**
+        * Credits for the inode change during a write.
+        */
+       [DTO_WRITE_BASE]    = 3,
+       /**
+        * Credits for a single block write.
+        */
+       [DTO_WRITE_BLOCK]   = 14,
+       /**
+        * Attr set credits for chown.
+        * These are extra credits for setattr; zero when quota is not in use.
+        */
+       [DTO_ATTR_SET_CHOWN] = 0
 };
 
 static const struct dt_device_operations osd_dt_ops = {
@@ -2346,12 +2346,10 @@ static int osd_declare_object_create(const struct lu_env *env,
 
        osd_trans_declare_op(env, oh, OSD_OT_CREATE,
                             osd_dto_credits_noquota[DTO_OBJECT_CREATE]);
-       if (!fid_is_on_ost(osd_oti_get(env), osd_dt_dev(handle->th_dev),
-                          lu_object_fid(&dt->do_lu), OI_CHECK_FLD))
-               /* Reuse idle OI block may cause additional one OI block
-                * to be changed. */
-               osd_trans_declare_op(env, oh, OSD_OT_INSERT,
-                               osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1);
+       /* Reuse idle OI block may cause additional one OI block
+        * to be changed. */
+       osd_trans_declare_op(env, oh, OSD_OT_INSERT,
+                            osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1);
 
        /* If this is directory, then we expect . and .. to be inserted as
         * well. The one directory block always needs to be created for the
@@ -2916,16 +2914,32 @@ static int osd_declare_xattr_set(const struct lu_env *env,
                                  int fl, struct thandle *handle)
 {
        struct osd_thandle *oh;
+       int credits;
 
        LASSERT(handle != NULL);
 
        oh = container_of0(handle, struct osd_thandle, ot_super);
        LASSERT(oh->ot_handle == NULL);
 
-       osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET,
-                            strcmp(name, XATTR_NAME_VERSION) == 0 ?
-                            osd_dto_credits_noquota[DTO_ATTR_SET_BASE] :
-                            osd_dto_credits_noquota[DTO_XATTR_SET]);
+       /* optimistic optimization: LMA is set first and usually fit inode */
+       if (strcmp(name, XATTR_NAME_LMA) == 0) {
+               if (dt_object_exists(dt))
+                       credits = 0;
+               else
+                       credits = 1;
+       } else if (strcmp(name, XATTR_NAME_VERSION) == 0) {
+               credits = 1;
+       } else {
+               struct osd_device  *osd = osd_dev(dt->do_lu.lo_dev);
+               struct super_block *sb = osd_sb(osd);
+               credits = osd_dto_credits_noquota[DTO_XATTR_SET];
+               if (buf && buf->lb_len > sb->s_blocksize) {
+                       credits *= (buf->lb_len + sb->s_blocksize - 1) >>
+                                       sb->s_blocksize_bits;
+               }
+       }
+
+       osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET, credits);
 
        return 0;
 }
index dc52885..d806a70 100644 (file)
@@ -336,7 +336,6 @@ enum dt_txn_op {
         DTO_OBJECT_DELETE,
         DTO_ATTR_SET_BASE,
         DTO_XATTR_SET,
-        DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
         DTO_WRITE_BASE,
         DTO_WRITE_BLOCK,
         DTO_ATTR_SET_CHOWN,
index 5e2a16b..7bab102 100644 (file)
@@ -1314,36 +1314,149 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
         return rc;
 }
 
+/*
+ * Report whether writes to \a inode are mapped through extents.
+ *
+ * With an inode at hand only its own LDISKFS_EXTENTS_FL flag is consulted;
+ * without one (inode == NULL) the EXTENTS mount option of \a sb decides.
+ *
+ * Returns 1 if extents are in use, 0 otherwise.
+ */
+static inline int osd_extents_enabled(struct super_block *sb,
+                                     struct inode *inode)
+{
+       /* no inode to look at: rely on the superblock mount option */
+       if (inode == NULL)
+               return test_opt(sb, EXTENTS) ? 1 : 0;
+
+       return (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) ? 1 : 0;
+}
+
+/*
+ * Estimate the journal credits needed to write \a blocks data blocks
+ * through the legacy (non-extent) block map.
+ *
+ * sb      superblock, supplies the block size
+ * inode   target inode, or NULL when no inode is available yet
+ * size    length of the write in bytes
+ * pos     byte offset of the write, or -1 when the offset is unknown
+ * blocks  number of data blocks covered by the write
+ *
+ * Returns the number of credits to declare.  The default is a two-level
+ * reservation; it is reduced only when the whole range is known to fall
+ * into the direct blocks or into the single-indirect area.
+ */
+static inline int osd_calc_bkmap_credits(struct super_block *sb,
+                                        struct inode *inode,
+                                        const loff_t size,
+                                        const loff_t pos,
+                                        const int blocks)
+{
+       int credits, bits, bs, i;
+
+       bits = sb->s_blocksize_bits;
+       bs = 1 << bits;
+
+       /* legacy blockmap: 3 levels * 3 (bitmap,gd,itself)
+        * we do not expect blockmaps on the large files,
+        * so let's shrink it to 2 levels (4GB files) */
+
+       /* this is default reservation: 2 levels */
+       credits = (blocks + 2) * 3;
+
+       /* actual offset is unknown, hard to optimize */
+       if (pos == -1)
+               return credits;
+
+       /* now check for a few specific cases to optimize */
+       if (pos + size <= LDISKFS_NDIR_BLOCKS * bs) {
+               /* no indirects */
+               credits = blocks;
+               /* allocate if not allocated */
+               if (inode == NULL) {
+                       credits += blocks * 2;
+                       return credits;
+               }
+               for (i = (pos >> bits); i < (pos >> bits) + blocks; i++) {
+                       LASSERT(i < LDISKFS_NDIR_BLOCKS);
+                       /* unmapped direct block: bitmap + gd on top */
+                       if (LDISKFS_I(inode)->i_data[i] == 0)
+                               credits += 2;
+               }
+       } else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) {
+               /* single indirect
+                * NOTE(review): 1024 pointers per indirect block presumably
+                * assumes 4KB blocks; confirm for smaller block sizes */
+               credits = blocks * 3;
+               if (inode == NULL ||
+                   LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK] == 0)
+                       /* the indirect block has to be allocated as well:
+                        * bitmap, gd, itself */
+                       credits += 3;
+               else
+                       /* the indirect block exists already, but it still
+                        * gets modified when new pointers are stored */
+                       credits++;
+       }
+
+       return credits;
+}
+
+/*
+ * Declare the journal credits for a write of \a buf at offset \a _pos.
+ *
+ * \a _pos == -1 means the offset is not known (append).  An overwrite of
+ * already allocated space is detected optimistically from the inode size
+ * and i_blocks and costs one credit per block.  Otherwise the estimate
+ * depends on the mapping in use: extent tree depth plus three credits per
+ * data block, or osd_calc_bkmap_credits() for the legacy block map.
+ *
+ * Quota is declared with zero space when the inode exists.
+ *
+ * Returns 0 or the result of osd_declare_inode_qid().
+ */
 static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
-                                const struct lu_buf *buf, loff_t pos,
+                                const struct lu_buf *buf, loff_t _pos,
                                 struct thandle *handle)
 {
-        struct osd_thandle *oh;
-        int                 credits;
-       struct inode       *inode;
-       int                 rc;
+       struct osd_object  *obj  = osd_dt_obj(dt);
+       struct inode       *inode = obj->oo_inode;
+       struct super_block *sb = osd_sb(osd_obj2dev(obj));
+       struct osd_thandle *oh;
+       int                 rc = 0, est = 0, credits, blocks, allocated = 0;
+       int                 bits, bs;
+       int                 depth, size;
+       loff_t              pos;
        ENTRY;
 
+       LASSERT(buf != NULL);
         LASSERT(handle != NULL);
 
         oh = container_of0(handle, struct osd_thandle, ot_super);
         LASSERT(oh->ot_handle == NULL);
 
-       credits = osd_dto_credits_noquota[DTO_WRITE_BLOCK];
+       size = buf->lb_len;
+       bits = sb->s_blocksize_bits;
+       bs = 1 << bits;
 
-       osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
+       if (_pos == -1) {
+               /* if this is an append, then we
+                * should expect cross-block record */
+               pos = 0;
+       } else {
+               pos = _pos;
+       }
 
-       inode = osd_dt_obj(dt)->oo_inode;
+       /* blocks to modify */
+       blocks = ((pos + size + bs - 1) >> bits) - (pos >> bits);
+       LASSERT(blocks > 0);
+
+       if (inode != NULL && _pos != -1) {
+               /* object size in blocks */
+               est = (i_size_read(inode) + bs - 1) >> bits;
+               allocated = inode->i_blocks >> (bits - 9);
+               if (pos + size <= i_size_read(inode) && est <= allocated) {
+                       /* looks like an overwrite, no need to modify tree */
+                       credits = blocks;
+                       /* no need to modify i_size */
+                       goto out;
+               }
+       }
+
+       if (osd_extents_enabled(sb, inode)) {
+               /*
+                * many concurrent threads may grow tree by the time
+                * our transaction starts. so, consider 2 is a min depth
+                * for every level we may need to allocate a new block
+                * and take some entries from the old one. so, 3 blocks
+                * to allocate (bitmap, gd, itself) + old block - 4 per
+                * level.
+                */
+               depth = inode != NULL ? ext_depth(inode) : 0;
+               depth = max(depth, 1) + 1;
+               credits = depth;
+               /* if not append, then split may need to modify
+                * existing blocks moving entries into the new ones */
+               if (_pos != -1)
+                       credits += depth;
+               /* blocks to store data: bitmap,gd,itself */
+               credits += blocks * 3;
+       } else {
+               credits = osd_calc_bkmap_credits(sb, inode, size, _pos, blocks);
+       }
+       /* if inode is created as part of the transaction,
+        * then it's counted already by the creation method */
+       if (inode != NULL)
+               credits++;
+
+out:
 
-       /* we may declare write to non-exist llog */
-       if (inode == NULL)
-               RETURN(0);
+       osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
 
        /* dt_declare_write() is usually called for system objects, such
         * as llog or last_rcvd files. We needn't enforce quota on those
         * objects, so always set the lqi_space as 0. */
-       rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh,
-                                  true, true, NULL, false);
+       if (inode != NULL)
+               rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid,
+                                          0, oh, true, true, NULL, false);
        RETURN(rc);
 }
 
index c53d6f8..feda48d 100644 (file)
@@ -866,6 +866,7 @@ static int truncate_quota_index(const struct lu_env *env, struct dt_object *dt,
        struct inode            *inode;
        int                      rc;
        struct iam_container    *bag = &(osd_dt_obj(dt))->oo_dir->od_container;
+       struct lu_buf           *lb = &osd_oti_get(env)->oti_buf;
        ENTRY;
 
        LASSERT(bag->ic_root_bh != NULL);
@@ -897,7 +898,9 @@ static int truncate_quota_index(const struct lu_env *env, struct dt_object *dt,
        inode = osd_dt_obj(dt)->oo_inode;
        LASSERT(inode);
 
-       rc = dt_declare_record_write(env, dt, NULL, 0, th);
+       /* iam_lfix_create() writes two blocks at the beginning */
+       lb->lb_len = osd_sb(osd)->s_blocksize * 2;
+       rc = dt_declare_record_write(env, dt, lb, 0, th);
        if (rc)
                GOTO(out, rc);
 
index 1590e3d..b315807 100644 (file)
@@ -51,6 +51,7 @@
 #include <obd_class.h>
 #include <lustre_disk.h>
 #include <lustre_fid.h>
+#include <lustre/lustre_idl.h> /* LLOG_CHUNK_SIZE definition */
 
 #include "osd_internal.h"
 
@@ -137,6 +138,12 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
                dmu_tx_hold_sa_create(oh->ot_tx, ZFS_SA_BASE_ATTR_SIZE);
        }
 
+       /* XXX: we still miss for append declaration support in ZFS
+        *      -1 means append which is used by llog mostly, llog
+        *      can grow upto LLOG_CHUNK_SIZE*8 records */
+       if (pos == -1)
+               pos = max_t(loff_t, 256 * 8 * LLOG_CHUNK_SIZE,
+                           obj->oo_attr.la_size + (2 << 20));
        dmu_tx_hold_write(oh->ot_tx, oid, pos, buf->lb_len);
 
        /* dt_declare_write() is usually called for system objects, such