From 3902ff4c54925b2f1fcb732a32ed7ee5428e9f77 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Thu, 13 Feb 2014 23:55:43 +0400 Subject: [PATCH] LU-4611 osd: improve credits calculation - llog catalog does not declare records twice (for old and new objects); this might be an issue on ZFS with full debug enabled. - llog to specify append by pos=-1, so OSD can take this into account - object create/destroy should not include OI, this is already calculated separately - EA declaration to improve a few specific cases - osd_declare_write() to recognize overwrite optimistically, using inode size and i_blocks - osd_declare_write() to optimize very specific cases, like legacy blockmaps with small offsets and allocated indirects - index delete modifies just a single block preliminary testing on a local setup with 7 OSTs: 1360 credits before and 436 credits after. llog declarations and index inserts (part of llog object creation) still consume about 70% (7 OSTs): create: 7/28, destroy: 1/4 attr_set: 2/2, xattr_set: 8/21 write: 37/179, punch: 14/56, quota: 2/2 insert: 8/135, delete: 2/5 ref_add: 1/1, ref_del: 3/3 Signed-off-by: Alex Zhuravlev Change-Id: Icb5c79df1f8ba248509b9d2561ac8843bb01f6af Reviewed-on: http://review.whamcloud.com/9258 Tested-by: Jenkins Reviewed-by: Andreas Dilger Reviewed-by: Niu Yawei Tested-by: Maloo Reviewed-by: Oleg Drokin --- lustre/obdclass/llog_cat.c | 35 ++++++---- lustre/obdclass/llog_internal.h | 1 + lustre/obdclass/llog_osd.c | 13 ++-- lustre/osd-ldiskfs/osd_handler.c | 140 +++++++++++++++++++++----------------- lustre/osd-ldiskfs/osd_internal.h | 1 - lustre/osd-ldiskfs/osd_io.c | 139 +++++++++++++++++++++++++++++++++---- lustre/osd-ldiskfs/osd_quota.c | 5 +- lustre/osd-zfs/osd_io.c | 7 ++ 8 files changed, 243 insertions(+), 98 deletions(-) diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c index fc7cd13..4ec7794 100644 --- a/lustre/obdclass/llog_cat.c +++ b/lustre/obdclass/llog_cat.c @@ -66,9 +66,9 @@ static int llog_cat_new_log(const struct
lu_env *env, struct llog_handle *loghandle, struct thandle *th) { - + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *rec = &lgi->lgi_logid; struct llog_log_hdr *llh; - struct llog_logid_rec rec = { { 0 }, }; int rc, index, bitmap_size; ENTRY; @@ -122,16 +122,16 @@ static int llog_cat_new_log(const struct lu_env *env, DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, index, POSTID(&cathandle->lgh_id.lgl_oi)); - /* build the record for this log in the catalog */ - rec.lid_hdr.lrh_len = sizeof(rec); - rec.lid_hdr.lrh_index = index; - rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; - rec.lid_id = loghandle->lgh_id; - rec.lid_tail.lrt_len = sizeof(rec); - rec.lid_tail.lrt_index = index; + /* build the record for this log in the catalog */ + rec->lid_hdr.lrh_len = sizeof(*rec); + rec->lid_hdr.lrh_index = index; + rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec->lid_id = loghandle->lgh_id; + rec->lid_tail.lrt_len = sizeof(*rec); + rec->lid_tail.lrt_index = index; /* update the catalog: header and record */ - rc = llog_write_rec(env, cathandle, &rec.lid_hdr, + rc = llog_write_rec(env, cathandle, &rec->lid_hdr, &loghandle->u.phd.phd_cookie, 1, NULL, index, th); if (rc < 0) GOTO(out_destroy, rc); @@ -383,6 +383,8 @@ int llog_cat_declare_add_rec(const struct lu_env *env, struct llog_handle *cathandle, struct llog_rec_hdr *rec, struct thandle *th) { + struct llog_thread_info *lgi = llog_info(env); + struct llog_logid_rec *lirec = &lgi->lgi_logid; struct llog_handle *loghandle, *next; int rc = 0; @@ -418,12 +420,14 @@ int llog_cat_declare_add_rec(const struct lu_env *env, if (rc) GOTO(out, rc); + lirec->lid_hdr.lrh_len = sizeof(*lirec); + if (!llog_exist(cathandle->u.chd.chd_current_log)) { rc = llog_declare_create(env, cathandle->u.chd.chd_current_log, th); if (rc) GOTO(out, rc); - llog_declare_write_rec(env, cathandle, NULL, -1, th); + llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, th); } /* declare records in the llogs 
*/ rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log, @@ -435,9 +439,14 @@ int llog_cat_declare_add_rec(const struct lu_env *env, if (next) { if (!llog_exist(next)) { rc = llog_declare_create(env, next, th); - llog_declare_write_rec(env, cathandle, NULL, -1, th); + llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, + -1, th); } - llog_declare_write_rec(env, next, rec, -1, th); + /* XXX: we hope for declarations made for existing llog + * this might be not correct with some backends + * where declarations are expected against specific + * object like ZFS with full debugging enabled */ + /*llog_declare_write_rec(env, next, rec, -1, th);*/ } out: RETURN(rc); diff --git a/lustre/obdclass/llog_internal.h b/lustre/obdclass/llog_internal.h index 0752141..f90be39 100644 --- a/lustre/obdclass/llog_internal.h +++ b/lustre/obdclass/llog_internal.h @@ -58,6 +58,7 @@ struct llog_thread_info { loff_t lgi_off; struct llog_rec_hdr lgi_lrh; struct llog_rec_tail lgi_tail; + struct llog_logid_rec lgi_logid; }; extern struct lu_context_key llog_thread_key; diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c index 9bff08e..fd9bfc36 100644 --- a/lustre/obdclass/llog_osd.c +++ b/lustre/obdclass/llog_osd.c @@ -283,6 +283,8 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, LASSERT(env); LASSERT(th); LASSERT(loghandle); + LASSERT(rec); + LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); o = loghandle->lgh_obj; LASSERT(o); @@ -309,10 +311,10 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, lgi->lgi_off = 0; } - lgi->lgi_buf.lb_len = 32 * 1024; + lgi->lgi_buf.lb_len = rec->lrh_len; lgi->lgi_buf.lb_buf = NULL; /* XXX: implement declared window or multi-chunks approach */ - rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th); + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); RETURN(rc); } @@ -913,11 +915,8 @@ static int llog_osd_declare_create(const struct lu_env *env, if (rc) RETURN(rc); - 
lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE; - lgi->lgi_buf.lb_buf = NULL; - rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, th); - if (rc) - RETURN(rc); + /* do not declare header initialization here as it's declared + * in llog_osd_declare_write_rec() which is always called */ if (res->lgh_name) { struct dt_object *llog_dir; diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index c6e090d..75fc855 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1001,11 +1001,9 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d, LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, oh->ot_credits, osd_journal(dev)->j_max_transaction_buffers); - CWARN(" create: %u/%u, delete: %u/%u, destroy: %u/%u\n", + CWARN(" create: %u/%u, destroy: %u/%u\n", oti->oti_declare_ops[OSD_OT_CREATE], oti->oti_declare_ops_cred[OSD_OT_CREATE], - oti->oti_declare_ops[OSD_OT_DELETE], - oti->oti_declare_ops_cred[OSD_OT_DELETE], oti->oti_declare_ops[OSD_OT_DESTROY], oti->oti_declare_ops_cred[OSD_OT_DESTROY]); CWARN(" attr_set: %u/%u, xattr_set: %u/%u\n", @@ -1023,8 +1021,8 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d, CWARN(" insert: %u/%u, delete: %u/%u\n", oti->oti_declare_ops[OSD_OT_INSERT], oti->oti_declare_ops_cred[OSD_OT_INSERT], - oti->oti_declare_ops[OSD_OT_DESTROY], - oti->oti_declare_ops_cred[OSD_OT_DESTROY]); + oti->oti_declare_ops[OSD_OT_DELETE], + oti->oti_declare_ops_cred[OSD_OT_DELETE]); CWARN(" ref_add: %u/%u, ref_del: %u/%u\n", oti->oti_declare_ops[OSD_OT_REF_ADD], oti->oti_declare_ops_cred[OSD_OT_REF_ADD], @@ -1430,55 +1428,57 @@ static int osd_init_capa_ctxt(const struct lu_env *env, struct dt_device *d, * If we mount with --data_journal we may need more. */ const int osd_dto_credits_noquota[DTO_NR] = { - /** - * Insert/Delete. - * INDEX_EXTRA_TRANS_BLOCKS(8) + - * SINGLEDATA_TRANS_BLOCKS(8) - * XXX Note: maybe iam need more, since iam have more level than - * EXT3 htree. 
- */ - [DTO_INDEX_INSERT] = 16, - [DTO_INDEX_DELETE] = 16, - /** + /** + * Insert. + * INDEX_EXTRA_TRANS_BLOCKS(8) + + * SINGLEDATA_TRANS_BLOCKS(8) + * XXX Note: maybe iam need more, since iam have more level than + * EXT3 htree. + */ + [DTO_INDEX_INSERT] = 16, + /** + * Delete + * just modify a single entry, probably merge few within a block + */ + [DTO_INDEX_DELETE] = 1, + /** * Used for OI scrub - */ - [DTO_INDEX_UPDATE] = 16, - /** - * Create a object. The same as create object in EXT3. - * DATA_TRANS_BLOCKS(14) + - * INDEX_EXTRA_BLOCKS(8) + - * 3(inode bits, groups, GDT) - */ - [DTO_OBJECT_CREATE] = 25, - /** - * XXX: real credits to be fixed - */ - [DTO_OBJECT_DELETE] = 25, - /** - * Attr set credits (inode) - */ - [DTO_ATTR_SET_BASE] = 1, - /** - * Xattr set. The same as xattr of EXT3. - * DATA_TRANS_BLOCKS(14) - * XXX Note: in original MDS implmentation INDEX_EXTRA_TRANS_BLOCKS - * are also counted in. Do not know why? - */ - [DTO_XATTR_SET] = 14, - [DTO_LOG_REC] = 14, - /** - * credits for inode change during write. - */ - [DTO_WRITE_BASE] = 3, - /** - * credits for single block write. - */ - [DTO_WRITE_BLOCK] = 14, - /** - * Attr set credits for chown. - * This is extra credits for setattr, and it is null without quota - */ - [DTO_ATTR_SET_CHOWN]= 0 + */ + [DTO_INDEX_UPDATE] = 16, + /** + * 4(inode, inode bits, groups, GDT) + * notice: OI updates are counted separately with DTO_INDEX_INSERT + */ + [DTO_OBJECT_CREATE] = 4, + /** + * 4(inode, inode bits, groups, GDT) + * notice: OI updates are counted separately with DTO_INDEX_DELETE + */ + [DTO_OBJECT_DELETE] = 4, + /** + * Attr set credits (inode) + */ + [DTO_ATTR_SET_BASE] = 1, + /** + * Xattr set. The same as xattr of EXT3. + * DATA_TRANS_BLOCKS(14) + * XXX Note: in original MDS implmentation INDEX_EXTRA_TRANS_BLOCKS + * are also counted in. Do not know why? + */ + [DTO_XATTR_SET] = 14, + /** + * credits for inode change during write. 
+ */ + [DTO_WRITE_BASE] = 3, + /** + * credits for single block write. + */ + [DTO_WRITE_BLOCK] = 14, + /** + * Attr set credits for chown. + * This is extra credits for setattr, and it is null without quota + */ + [DTO_ATTR_SET_CHOWN] = 0 }; static const struct dt_device_operations osd_dt_ops = { @@ -2346,12 +2346,10 @@ static int osd_declare_object_create(const struct lu_env *env, osd_trans_declare_op(env, oh, OSD_OT_CREATE, osd_dto_credits_noquota[DTO_OBJECT_CREATE]); - if (!fid_is_on_ost(osd_oti_get(env), osd_dt_dev(handle->th_dev), - lu_object_fid(&dt->do_lu), OI_CHECK_FLD)) - /* Reuse idle OI block may cause additional one OI block - * to be changed. */ - osd_trans_declare_op(env, oh, OSD_OT_INSERT, - osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1); + /* Reuse idle OI block may cause additional one OI block + * to be changed. */ + osd_trans_declare_op(env, oh, OSD_OT_INSERT, + osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1); /* If this is directory, then we expect . and .. to be inserted as * well. The one directory block always needs to be created for the @@ -2916,16 +2914,32 @@ static int osd_declare_xattr_set(const struct lu_env *env, int fl, struct thandle *handle) { struct osd_thandle *oh; + int credits; LASSERT(handle != NULL); oh = container_of0(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); - osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET, - strcmp(name, XATTR_NAME_VERSION) == 0 ? 
- osd_dto_credits_noquota[DTO_ATTR_SET_BASE] : - osd_dto_credits_noquota[DTO_XATTR_SET]); + /* optimistic optimization: LMA is set first and usually fit inode */ + if (strcmp(name, XATTR_NAME_LMA) == 0) { + if (dt_object_exists(dt)) + credits = 0; + else + credits = 1; + } else if (strcmp(name, XATTR_NAME_VERSION) == 0) { + credits = 1; + } else { + struct osd_device *osd = osd_dev(dt->do_lu.lo_dev); + struct super_block *sb = osd_sb(osd); + credits = osd_dto_credits_noquota[DTO_XATTR_SET]; + if (buf && buf->lb_len > sb->s_blocksize) { + credits *= (buf->lb_len + sb->s_blocksize - 1) >> + sb->s_blocksize_bits; + } + } + + osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET, credits); return 0; } diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index dc52885..d806a70 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -336,7 +336,6 @@ enum dt_txn_op { DTO_OBJECT_DELETE, DTO_ATTR_SET_BASE, DTO_XATTR_SET, - DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. 
*/ DTO_WRITE_BASE, DTO_WRITE_BLOCK, DTO_ATTR_SET_CHOWN, diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 5e2a16b..7bab102 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1314,36 +1314,149 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, return rc; } +static inline int osd_extents_enabled(struct super_block *sb, + struct inode *inode) +{ + if (inode != NULL) { + if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) + return 1; + } else if (test_opt(sb, EXTENTS)) { + return 1; + } + return 0; +} + +static inline int osd_calc_bkmap_credits(struct super_block *sb, + struct inode *inode, + const loff_t size, + const loff_t pos, + const int blocks) +{ + int credits, bits, bs, i; + + bits = sb->s_blocksize_bits; + bs = 1 << bits; + + /* legacy blockmap: 3 levels * 3 (bitmap,gd,itself) + * we do not expect blockmaps on the large files, + * so let's shrink it to 2 levels (4GB files) */ + + /* this is default reservation: 2 levels */ + credits = (blocks + 2) * 3; + + /* actual offset is unknown, hard to optimize */ + if (pos == -1) + return credits; + + /* now check for few specific cases to optimize */ + if (pos + size <= LDISKFS_NDIR_BLOCKS * bs) { + /* no indirects */ + credits = blocks; + /* allocate if not allocated */ + if (inode == NULL) { + credits += blocks * 2; + return credits; + } + for (i = (pos >> bits); i < (pos >> bits) + blocks; i++) { + LASSERT(i < LDISKFS_NDIR_BLOCKS); + if (LDISKFS_I(inode)->i_data[i] == 0) + credits += 2; + } + } else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) { + /* single indirect */ + credits = blocks * 3; + /* probably indirect block has been allocated already */ + if (!inode || LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK]) + credits += 3; + } + + return credits; +} + static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, - const struct lu_buf *buf, loff_t pos, + const struct lu_buf *buf, loff_t _pos, struct thandle 
*handle) { - struct osd_thandle *oh; - int credits; - struct inode *inode; - int rc; + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + struct super_block *sb = osd_sb(osd_obj2dev(obj)); + struct osd_thandle *oh; + int rc = 0, est = 0, credits, blocks, allocated = 0; + int bits, bs; + int depth, size; + loff_t pos; ENTRY; + LASSERT(buf != NULL); LASSERT(handle != NULL); oh = container_of0(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); - credits = osd_dto_credits_noquota[DTO_WRITE_BLOCK]; + size = buf->lb_len; + bits = sb->s_blocksize_bits; + bs = 1 << bits; - osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits); + if (_pos == -1) { + /* if this is an append, then we + * should expect cross-block record */ + pos = 0; + } else { + pos = _pos; + } - inode = osd_dt_obj(dt)->oo_inode; + /* blocks to modify */ + blocks = ((pos + size + bs - 1) >> bits) - (pos >> bits); + LASSERT(blocks > 0); + + if (inode != NULL && _pos != -1) { + /* object size in blocks */ + est = (i_size_read(inode) + bs - 1) >> bits; + allocated = inode->i_blocks >> (bits - 9); + if (pos + size <= i_size_read(inode) && est <= allocated) { + /* looks like an overwrite, no need to modify tree */ + credits = blocks; + /* no need to modify i_size */ + goto out; + } + } + + if (osd_extents_enabled(sb, inode)) { + /* + * many concurrent threads may grow tree by the time + * our transaction starts. so, consider 2 is a min depth + * for every level we may need to allocate a new block + * and take some entries from the old one. so, 3 blocks + * to allocate (bitmap, gd, itself) + old block - 4 per + * level. + */ + depth = inode != NULL ? 
ext_depth(inode) : 0; + depth = max(depth, 1) + 1; + credits = depth; + /* if not append, then split may need to modify + * existing blocks moving entries into the new ones */ + if (_pos == -1) + credits += depth; + /* blocks to store data: bitmap,gd,itself */ + credits += blocks * 3; + } else { + credits = osd_calc_bkmap_credits(sb, inode, size, _pos, blocks); + } + /* if inode is created as part of the transaction, + * then it's counted already by the creation method */ + if (inode != NULL) + credits++; + +out: - /* we may declare write to non-exist llog */ - if (inode == NULL) - RETURN(0); + osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits); /* dt_declare_write() is usually called for system objects, such * as llog or last_rcvd files. We needn't enforce quota on those * objects, so always set the lqi_space as 0. */ - rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh, - true, true, NULL, false); + if (inode != NULL) + rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, + 0, oh, true, true, NULL, false); RETURN(rc); } diff --git a/lustre/osd-ldiskfs/osd_quota.c b/lustre/osd-ldiskfs/osd_quota.c index c53d6f8..feda48d 100644 --- a/lustre/osd-ldiskfs/osd_quota.c +++ b/lustre/osd-ldiskfs/osd_quota.c @@ -866,6 +866,7 @@ static int truncate_quota_index(const struct lu_env *env, struct dt_object *dt, struct inode *inode; int rc; struct iam_container *bag = &(osd_dt_obj(dt))->oo_dir->od_container; + struct lu_buf *lb = &osd_oti_get(env)->oti_buf; ENTRY; LASSERT(bag->ic_root_bh != NULL); @@ -897,7 +898,9 @@ static int truncate_quota_index(const struct lu_env *env, struct dt_object *dt, inode = osd_dt_obj(dt)->oo_inode; LASSERT(inode); - rc = dt_declare_record_write(env, dt, NULL, 0, th); + /* iam_lfix_create() writes two blocks at the beginning */ + lb->lb_len = osd_sb(osd)->s_blocksize * 2; + rc = dt_declare_record_write(env, dt, lb, 0, th); if (rc) GOTO(out, rc); diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 
1590e3d..b315807 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -51,6 +51,7 @@ #include #include #include +#include /* LLOG_CHUNK_SIZE definition */ #include "osd_internal.h" @@ -137,6 +138,12 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, dmu_tx_hold_sa_create(oh->ot_tx, ZFS_SA_BASE_ATTR_SIZE); } + /* XXX: we still miss for append declaration support in ZFS + * -1 means append which is used by llog mostly, llog + * can grow upto LLOG_CHUNK_SIZE*8 records */ + if (pos == -1) + pos = max_t(loff_t, 256 * 8 * LLOG_CHUNK_SIZE, + obj->oo_attr.la_size + (2 << 20)); dmu_tx_hold_write(oh->ot_tx, oid, pos, buf->lb_len); /* dt_declare_write() is usually called for system objects, such -- 1.8.3.1