From 9e6225b2e7385cbb7be0474df01075fafc4966d5 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 7 Dec 2021 11:13:54 +0300 Subject: [PATCH] LU-14918 osd: don't declare similar ldiskfs writes twice in some cases (like overstriping) the same operations can be declared multiple times (new llog records) and this lead to huge number of credits and performance degradation. we can avoid this checking for duplicate declarations. As every declaration would need an allocation, limit the scope of this checks to transaction likely to be large. % of "large" transaction in sanity-benchmark, depending on threshold: creates < 5 && writes < 5: 0.58% (mds1) and 2.97% (mds2) create < 7 & writes < 7: 0.58% and 2.4% create < 9 & writes < 9: 0.6% and 1.85% create < 10 & write2 < 10: 0.0004% and 0.000001% thus 10 creates or writes is selected as a threshold to enable this logic. Signed-off-by: Alex Zhuravlev Change-Id: I7c893fe3b95646b4b813b999bc832659dfcf03ad Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/45765 Reviewed-by: Andreas Dilger Reviewed-by: Li Dongyang Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- lustre/osd-ldiskfs/osd_handler.c | 73 +++++++++++++++++++++++++++++++++++++++ lustre/osd-ldiskfs/osd_internal.h | 16 +++++++++ lustre/osd-ldiskfs/osd_io.c | 3 ++ lustre/osd-ldiskfs/osd_lproc.c | 19 +++++----- lustre/tests/sanity.sh | 33 ++++++++++++++++++ 5 files changed, 136 insertions(+), 8 deletions(-) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index bb57382..89c2d50 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1761,6 +1761,70 @@ static void __osd_th_check_slow(void *oth, struct osd_device *dev, #endif /* OSD_THANDLE_STATS */ /* + * in some cases (like overstriped files) the same operations on the same + * objects are declared many times and this may lead to huge number of + * credits which can be a problem and/or cause performance degradation. + * this function is to remember what declarations have been made within + * a given thandle and then skip duplications. + * limit it's scope so that regular small transactions don't need all + * this overhead with allocations, lists. + * also, limit scope to the specific objects like llogs, etc. + */ +static inline bool osd_check_special_fid(const struct lu_fid *f) +{ + if (fid_seq_is_llog(f->f_seq)) + return true; + if (f->f_seq == FID_SEQ_LOCAL_FILE && + f->f_oid == MDD_LOV_OBJ_OID) + return true; + return false; +} + +bool osd_tx_was_declared(const struct lu_env *env, struct osd_thandle *oth, + struct dt_object *dt, enum dt_txn_op op, loff_t pos) +{ + const struct lu_fid *fid = lu_object_fid(&dt->do_lu); + struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_obj_declare *old; + + if (osd->od_is_ost) + return false; + + /* small transactions don't need this overhead */ + if (oti->oti_declare_ops[DTO_OBJECT_CREATE] < 10 && + oti->oti_declare_ops[DTO_WRITE_BASE] < 10) + return false; + + if (osd_check_special_fid(fid) == 0) + return false; + + list_for_each_entry(old, &oth->ot_declare_list, old_list) { + if (old->old_op == op && old->old_pos == pos && + lu_fid_eq(&old->old_fid, fid)) + return true; + } + OBD_ALLOC_PTR(old); + if (unlikely(old == NULL)) + return false; + old->old_fid = *lu_object_fid(&dt->do_lu); + old->old_op = op; + old->old_pos = pos; + list_add(&old->old_list, &oth->ot_declare_list); + return false; +} + +void osd_tx_declaration_free(struct osd_thandle *oth) +{ + struct osd_obj_declare *old, *tmp; + + list_for_each_entry_safe(old, tmp, &oth->ot_declare_list, old_list) { + list_del_init(&old->old_list); + OBD_FREE_PTR(old); + } +} + +/* * Concurrency: doesn't access mutable data. */ static int osd_param_is_not_sane(const struct osd_device *dev, @@ -1845,6 +1909,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env, INIT_LIST_HEAD(&oh->ot_commit_dcb_list); INIT_LIST_HEAD(&oh->ot_stop_dcb_list); INIT_LIST_HEAD(&oh->ot_trunc_locks); + INIT_LIST_HEAD(&oh->ot_declare_list); osd_th_alloced(oh); memset(oti->oti_declare_ops, 0, @@ -1935,6 +2000,9 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, static unsigned long last_printed; static int last_credits; + lprocfs_counter_add(dev->od_stats, + LPROC_OSD_TOO_MANY_CREDITS, 1); + /* * don't make noise on a tiny testing systems * actual credits misuse will be caught anyway @@ -2065,6 +2133,8 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, qtrans = oh->ot_quota_trans; oh->ot_quota_trans = NULL; + osd_tx_declaration_free(oh); + /* move locks to local list, stop tx, execute truncates */ list_splice(&oh->ot_trunc_locks, &truncates); @@ -3707,6 +3777,9 @@ static int osd_declare_create(const struct lu_env *env, struct dt_object *dt, oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); + if (osd_tx_was_declared(env, oh, dt, DTO_OBJECT_CREATE, 0)) + RETURN(0); + /* * EA object consumes more credits than regular object: osd_mk_index * vs. osd_mkreg: osd_mk_index will create 2 blocks for root_node and diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 1ee407e..6e8e962 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -454,6 +454,11 @@ struct osd_thandle { ktime_t oth_started; #endif struct list_head ot_trunc_locks; + /* + * list of declarations, used for large transactions to check + * for duplicates, like llogs + */ + struct list_head ot_declare_list; }; /** @@ -474,6 +479,13 @@ enum dt_txn_op { DTO_NR }; +struct osd_obj_declare { + struct list_head old_list; + struct lu_fid old_fid; + enum dt_txn_op old_op; + loff_t old_pos; +}; + /* * osd dev stats */ @@ -493,6 +505,7 @@ enum { LPROC_OSD_THANDLE_OPEN, LPROC_OSD_THANDLE_CLOSING, #endif + LPROC_OSD_TOO_MANY_CREDITS, LPROC_OSD_LAST, }; #endif @@ -1727,4 +1740,7 @@ static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) return false; } +bool osd_tx_was_declared(const struct lu_env *env, struct osd_thandle *oth, + struct dt_object *dt, enum dt_txn_op op, loff_t p); + #endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index dbaee9e..43bc85c 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -2031,6 +2031,9 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, bits = sb->s_blocksize_bits; bs = 1 << bits; + if (osd_tx_was_declared(env, oh, dt, DTO_WRITE_BASE, _pos)) + RETURN(0); + if (_pos == -1) { /* if this is an append, then we * should expect cross-block record diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index 13ba77d..135e316 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -109,6 +109,9 @@ static int osd_stats_init(struct osd_device *osd) LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_USECS, "thandle closing"); #endif + lprocfs_counter_init(osd->od_stats, LPROC_OSD_TOO_MANY_CREDITS, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_REQS, + "many_credits"); result = 0; } @@ -786,14 +789,14 @@ ssize_t index_backup_store(struct kobject *kobj, struct attribute *attr, LUSTRE_RW_ATTR(index_backup); struct ldebugfs_vars ldebugfs_osd_obd_vars[] = { - { .name = "oi_scrub", - .fops = &ldiskfs_osd_oi_scrub_fops }, - { .name = "readcache_max_filesize", - .fops = &ldiskfs_osd_readcache_fops }, - { .name = "readcache_max_io_mb", - .fops = &ldiskfs_osd_readcache_max_io_fops }, - { .name = "writethrough_max_io_mb", - .fops = &ldiskfs_osd_writethrough_max_io_fops }, + { .name = "oi_scrub", + .fops = &ldiskfs_osd_oi_scrub_fops }, + { .name = "readcache_max_filesize", + .fops = &ldiskfs_osd_readcache_fops }, + { .name = "readcache_max_io_mb", + .fops = &ldiskfs_osd_readcache_max_io_fops }, + { .name = "writethrough_max_io_mb", + .fops = &ldiskfs_osd_writethrough_max_io_fops }, { NULL } }; diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 63b36ac..7f55e16 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1710,6 +1710,39 @@ test_27cf() { } run_test 27cf "'setstripe -o' on inactive OSTs should return error" +test_27cg() { + [[ $($LCTL get_param mdc.*.import) =~ connect_flags.*overstriping ]] || + skip "server does not support overstriping" + [[ $mds1_FSTYPE != "ldiskfs" ]] && skip_env "ldiskfs only test" + large_xattr_enabled || skip_env "ea_inode feature disabled" + + local osts="0" + + for ((i=1;i<1000;i++)); do + osts+=",$((i % OSTCOUNT))" + done + + local mdts=$(comma_list $(mdts_nodes)) + local before=$(do_nodes $mdts \ + "$LCTL get_param -n osd-ldiskfs.*MDT*.stats" | + awk '/many credits/{print $3}' | + calc_sum) + + $LFS setstripe -o $osts $DIR/$tfile || error "setstripe failed" + $LFS getstripe $DIR/$tfile | grep stripe + + rm -f $DIR/$tfile || error "can't unlink" + + after=$(do_nodes $mdts \ + "$LCTL get_param -n osd-ldiskfs.*MDT*.stats" | + awk '/many credits/{print $3}' | + calc_sum) + + (( before == after )) || + error "too many credits happened: $after > $before" +} +run_test 27cg "1000 shouldn't cause too many credits" + test_27d() { test_mkdir $DIR/$tdir $LFS setstripe -c 0 -i -1 -S 0 $DIR/$tdir/$tfile || -- 1.8.3.1