From 763b26c42fa4239989dd3bbc03e2dff53f887f1f Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 7 Dec 2021 11:13:54 +0300 Subject: [PATCH] LU-14918 osd: don't declare similar ldiskfs writes twice in some cases (like overstriping) the same operations can be declared multiple times (new llog records) and this lead to huge number of credits and performance degradation. we can avoid this checking for duplicate declarations. As every declaration would need an allocation, limit the scope of this checks to transaction likely to be large. % of "large" transaction in sanity-benchmark, depending on threshold: creates < 5 && writes < 5: 0.58% (mds1) and 2.97% (mds2) create < 7 & writes < 7: 0.58% and 2.4% create < 9 & writes < 9: 0.6% and 1.85% create < 10 & write2 < 10: 0.0004% and 0.000001% thus 10 creates or writes is selected as a threshold to enable this logic. Lustre-change: https://review.whamcloud.com/45765 Lustre-commit: 9e6225b2e7385cbb7be0474df01075fafc4966d5 Signed-off-by: Alex Zhuravlev Change-Id: I7c893fe3b95646b4b813b999bc832659dfcf03ad Reviewed-by: Andreas Dilger Reviewed-by: Li Dongyang Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53485 Tested-by: jenkins Tested-by: Maloo --- lustre/osd-ldiskfs/osd_handler.c | 73 +++++++++++++++++++++++++++++++++++++++ lustre/osd-ldiskfs/osd_internal.h | 16 +++++++++ lustre/osd-ldiskfs/osd_io.c | 3 ++ lustre/osd-ldiskfs/osd_lproc.c | 19 +++++----- lustre/tests/sanity.sh | 33 ++++++++++++++++++ 5 files changed, 136 insertions(+), 8 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 5d3e97e..7be1746 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1810,6 +1810,70 @@ static void __osd_th_check_slow(void *oth, struct osd_device *dev, #endif /* OSD_THANDLE_STATS */ /* + * in some cases (like overstriped files) the same operations on the same + * objects are declared many times and this may lead to huge number of + * credits which can be a problem and/or cause performance degradation. + * this function is to remember what declarations have been made within + * a given thandle and then skip duplications. + * limit it's scope so that regular small transactions don't need all + * this overhead with allocations, lists. + * also, limit scope to the specific objects like llogs, etc. + */ +static inline bool osd_check_special_fid(const struct lu_fid *f) +{ + if (fid_seq_is_llog(f->f_seq)) + return true; + if (f->f_seq == FID_SEQ_LOCAL_FILE && + f->f_oid == MDD_LOV_OBJ_OID) + return true; + return false; +} + +bool osd_tx_was_declared(const struct lu_env *env, struct osd_thandle *oth, + struct dt_object *dt, enum dt_txn_op op, loff_t pos) +{ + const struct lu_fid *fid = lu_object_fid(&dt->do_lu); + struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_obj_declare *old; + + if (osd->od_is_ost) + return false; + + /* small transactions don't need this overhead */ + if (oti->oti_declare_ops[DTO_OBJECT_CREATE] < 10 && + oti->oti_declare_ops[DTO_WRITE_BASE] < 10) + return false; + + if (osd_check_special_fid(fid) == 0) + return false; + + list_for_each_entry(old, &oth->ot_declare_list, old_list) { + if (old->old_op == op && old->old_pos == pos && + lu_fid_eq(&old->old_fid, fid)) + return true; + } + OBD_ALLOC_PTR(old); + if (unlikely(old == NULL)) + return false; + old->old_fid = *lu_object_fid(&dt->do_lu); + old->old_op = op; + old->old_pos = pos; + list_add(&old->old_list, &oth->ot_declare_list); + return false; +} + +void osd_tx_declaration_free(struct osd_thandle *oth) +{ + struct osd_obj_declare *old, *tmp; + + list_for_each_entry_safe(old, tmp, &oth->ot_declare_list, old_list) { + list_del_init(&old->old_list); + OBD_FREE_PTR(old); + } +} + +/* * Concurrency: doesn't access mutable data. */ static int osd_param_is_not_sane(const struct osd_device *dev, @@ -1894,6 +1958,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env, INIT_LIST_HEAD(&oh->ot_commit_dcb_list); INIT_LIST_HEAD(&oh->ot_stop_dcb_list); INIT_LIST_HEAD(&oh->ot_trunc_locks); + INIT_LIST_HEAD(&oh->ot_declare_list); osd_th_alloced(oh); memset(oti->oti_declare_ops, 0, @@ -2009,6 +2074,9 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, static unsigned long last_printed; static int last_credits; + lprocfs_counter_add(dev->od_stats, + LPROC_OSD_TOO_MANY_CREDITS, 1); + /* * don't make noise on a tiny testing systems * actual credits misuse will be caught anyway @@ -2135,6 +2203,8 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, qtrans = oh->ot_quota_trans; oh->ot_quota_trans = NULL; + osd_tx_declaration_free(oh); + /* move locks to local list, stop tx, execute truncates */ list_splice(&oh->ot_trunc_locks, &truncates); @@ -3855,6 +3925,9 @@ static int osd_declare_create(const struct lu_env *env, struct dt_object *dt, oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); + if (osd_tx_was_declared(env, oh, dt, DTO_OBJECT_CREATE, 0)) + RETURN(0); + /* * EA object consumes more credits than regular object: osd_mk_index * vs. osd_mkreg: osd_mk_index will create 2 blocks for root_node and diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 380cc2b..ead0ecd 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -458,6 +458,11 @@ struct osd_thandle { ktime_t oth_started; #endif struct list_head ot_trunc_locks; + /* + * list of declarations, used for large transactions to check + * for duplicates, like llogs + */ + struct list_head ot_declare_list; }; /** @@ -478,6 +483,13 @@ enum dt_txn_op { DTO_NR }; +struct osd_obj_declare { + struct list_head old_list; + struct lu_fid old_fid; + enum dt_txn_op old_op; + loff_t old_pos; +}; + /* * osd dev stats */ @@ -497,6 +509,7 @@ enum { LPROC_OSD_THANDLE_OPEN, LPROC_OSD_THANDLE_CLOSING, #endif + LPROC_OSD_TOO_MANY_CREDITS, LPROC_OSD_LAST, }; #endif @@ -1778,4 +1791,7 @@ static int fill_fn(struct dir_context *buf, const char *name, int namelen, \ #define WRAP_FILLDIR_FN(prefix, fill_fn) #endif +bool osd_tx_was_declared(const struct lu_env *env, struct osd_thandle *oth, + struct dt_object *dt, enum dt_txn_op op, loff_t p); + #endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index ed89259..bc55cbe 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -2023,6 +2023,9 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, bits = sb->s_blocksize_bits; bs = 1 << bits; + if (osd_tx_was_declared(env, oh, dt, DTO_WRITE_BASE, _pos)) + RETURN(0); + if (_pos == -1) { /* if this is an append, then we * should expect cross-block record diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index 270421f..e95aac8 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -109,6 +109,9 @@ static int osd_stats_init(struct osd_device *osd) LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_USECS, "thandle_closing"); #endif + lprocfs_counter_init(osd->od_stats, LPROC_OSD_TOO_MANY_CREDITS, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_REQS, + "many_credits"); result = 0; } @@ -815,14 +818,14 @@ LUSTRE_RW_ATTR(extents_dense); #endif struct ldebugfs_vars ldebugfs_osd_obd_vars[] = { - { .name = "oi_scrub", - .fops = &ldiskfs_osd_oi_scrub_fops }, - { .name = "readcache_max_filesize", - .fops = &ldiskfs_osd_readcache_max_filesize_fops}, - { .name = "readcache_max_io_mb", - .fops = &ldiskfs_osd_readcache_max_io_fops }, - { .name = "writethrough_max_io_mb", - .fops = &ldiskfs_osd_writethrough_max_io_fops }, + { .name = "oi_scrub", + .fops = &ldiskfs_osd_oi_scrub_fops }, + { .name = "readcache_max_filesize", + .fops = &ldiskfs_osd_readcache_max_filesize_fops}, + { .name = "readcache_max_io_mb", + .fops = &ldiskfs_osd_readcache_max_io_fops }, + { .name = "writethrough_max_io_mb", + .fops = &ldiskfs_osd_writethrough_max_io_fops }, { NULL } }; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index adb0919..820e657 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1714,6 +1714,39 @@ test_27cf() { } run_test 27cf "'setstripe -o' on inactive OSTs should return error" +test_27cg() { + [[ $($LCTL get_param mdc.*.import) =~ connect_flags.*overstriping ]] || + skip "server does not support overstriping" + [[ $mds1_FSTYPE != "ldiskfs" ]] && skip_env "ldiskfs only test" + large_xattr_enabled || skip_env "ea_inode feature disabled" + + local osts="0" + + for ((i=1;i<1000;i++)); do + osts+=",$((i % OSTCOUNT))" + done + + local mdts=$(comma_list $(mdts_nodes)) + local before=$(do_nodes $mdts \ + "$LCTL get_param -n osd-ldiskfs.*MDT*.stats" | + awk '/many credits/{print $3}' | + calc_sum) + + $LFS setstripe -o $osts $DIR/$tfile || error "setstripe failed" + $LFS getstripe $DIR/$tfile | grep stripe + + rm -f $DIR/$tfile || error "can't unlink" + + after=$(do_nodes $mdts \ + "$LCTL get_param -n osd-ldiskfs.*MDT*.stats" | + awk '/many credits/{print $3}' | + calc_sum) + + (( before == after )) || + error "too many credits happened: $after > $before" +} +run_test 27cg "1000 shouldn't cause too many credits" + test_27d() { test_mkdir $DIR/$tdir $LFS setstripe -c 0 -i -1 -S 0 $DIR/$tdir/$tfile || -- 1.8.3.1