From: Andrew Perepechko
Date: Thu, 2 Dec 2021 07:26:32 +0000 (+0300)
Subject: LU-13309 osd: use per-cpu counters for brw_stats
X-Git-Tag: 2.14.57~41
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=787c1884e6451ae764568ade3658e537dcc19097;p=fs%2Flustre-release.git

LU-13309 osd: use per-cpu counters for brw_stats

Based on perf reports, oh_lock is highly contended when running
IOR with NVMe storage, so we need to move to per-cpu counters.

struct brw_stats becomes larger: from 3872 to 18208 bytes.
Also, 4 bytes are allocated per each cpu for every counter.
With an 8-cpu system and 32 4-byte per-cpu counters, there are
448 per-cpu counters or 1792 bytes per-cpu. These counters will
either reuse already allocated per-cpu pages or allocate a new
page on each cpu (8 pages total).

Change-Id: I24536a0138067fb868aaf962d9321dea7566d13f
Signed-off-by: Andrew Perepechko
HPE-bug-id: LUS-8007, LUS-8185
Reviewed-on: https://review.whamcloud.com/37915
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexander Boyko
Reviewed-by: Andreas Dilger
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h
index 9b2ba94..85fab90 100644
--- a/lustre/include/lprocfs_status.h
+++ b/lustre/include/lprocfs_status.h
@@ -102,6 +102,11 @@ struct obd_histogram {
 	unsigned long	oh_buckets[OBD_HIST_MAX];
 };
 
+struct obd_hist_pcpu {
+	struct percpu_counter	oh_pc_buckets[OBD_HIST_MAX];
+	bool			oh_initialized;
+};
+
 enum {
 	RENAME_SAMEDIR_SIZE = 0,
 	RENAME_CROSSDIR_SRC_SIZE,
@@ -412,11 +417,12 @@ struct brw_stats_props {
 
 struct brw_stats {
 	ktime_t			bs_init;
-	struct obd_histogram	bs_hist[BRW_RW_STATS_NUM];
+	struct obd_hist_pcpu	bs_hist[BRW_RW_STATS_NUM];
 	struct brw_stats_props	bs_props[BRW_RW_STATS_NUM / 2];
 };
 
-void lprocfs_init_brw_stats(struct brw_stats *brw_stats);
+int lprocfs_init_brw_stats(struct brw_stats *brw_stats);
+void lprocfs_fini_brw_stats(struct brw_stats *brw_stats);
 
 void ldebugfs_register_osd_stats(struct dentry *parent,
 				 struct brw_stats *brw_stats,
@@ -650,6 +656,15 @@ void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
 void lprocfs_oh_clear(struct obd_histogram *oh);
 unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
 
+void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh, unsigned int value);
+void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh, unsigned int value);
+int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh);
+void lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh);
+void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh);
+unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh);
+unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh,
+				      unsigned int value);
+
 void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
 			   struct lprocfs_counter *cnt);
 
diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c
index 2872e89..1d5b9b9 100644
--- a/lustre/obdclass/lprocfs_status.c
+++ b/lustre/obdclass/lprocfs_status.c
@@ -1941,6 +1941,95 @@ void lprocfs_oh_clear(struct obd_histogram *oh)
 }
 EXPORT_SYMBOL(lprocfs_oh_clear);
 
+void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh,
+			   unsigned int value)
+{
+	if (value >= OBD_HIST_MAX)
+		value = OBD_HIST_MAX - 1;
+
+	percpu_counter_inc(&oh->oh_pc_buckets[value]);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_pcpu);
+
+void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh,
+				unsigned int value)
+{
+	unsigned int val = 0;
+
+	if (likely(value != 0))
+		val = min(fls(value - 1), OBD_HIST_MAX);
+
+	lprocfs_oh_tally_pcpu(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2_pcpu);
+
+unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh,
+				      unsigned int value)
+{
+	return percpu_counter_sum(&oh->oh_pc_buckets[value]);
+}
+EXPORT_SYMBOL(lprocfs_oh_counter_pcpu);
+
+unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh)
+{
+	unsigned long ret = 0;
+	int i;
+
+	for (i = 0; i < OBD_HIST_MAX; i++)
+		ret += percpu_counter_sum(&oh->oh_pc_buckets[i]);
+
+	return ret;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum_pcpu);
+
+int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh)
+{
+	int i, rc;
+
+	if (oh->oh_initialized)
+		return 0;
+
+	for (i = 0; i < OBD_HIST_MAX; i++) {
+		rc = percpu_counter_init(&oh->oh_pc_buckets[i], 0, GFP_KERNEL);
+		if (rc)
+			goto out;
+	}
+
+	oh->oh_initialized = true;
+
+	return 0;
+
+out:
+	for (i--; i >= 0; i--)
+		percpu_counter_destroy(&oh->oh_pc_buckets[i]);
+
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_oh_alloc_pcpu);
+
+void lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh)
+{
+	int i;
+
+	for (i = 0; i < OBD_HIST_MAX; i++)
+		percpu_counter_set(&oh->oh_pc_buckets[i], 0);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear_pcpu);
+
+void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh)
+{
+	int i;
+
+	if (!oh->oh_initialized)
+		return;
+
+	for (i = 0; i < OBD_HIST_MAX; i++)
+		percpu_counter_destroy(&oh->oh_pc_buckets[i]);
+
+	oh->oh_initialized = false;
+}
+EXPORT_SYMBOL(lprocfs_oh_release_pcpu);
+
 ssize_t lustre_attr_show(struct kobject *kobj,
 			 struct attribute *attr, char *buf)
 {
diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c
index a3ddaa9..9a30639c 100644
--- a/lustre/obdclass/lprocfs_status_server.c
+++ b/lustre/obdclass/lprocfs_status_server.c
@@ -702,8 +702,8 @@ void lprocfs_free_obd_stats(struct obd_device *obd)
 EXPORT_SYMBOL(lprocfs_free_obd_stats);
 
 static void display_brw_stats(struct seq_file *seq, const char *name,
-			      const char *units, struct obd_histogram *read,
-			      struct obd_histogram *write, bool scale)
+			      const char *units, struct obd_hist_pcpu *read,
+			      struct obd_hist_pcpu *write, bool scale)
 {
 	unsigned long read_tot, write_tot, r, w, read_cum = 0, write_cum = 0;
 	unsigned int i;
@@ -712,15 +712,15 @@ static void display_brw_stats(struct seq_file *seq, const char *name,
 	seq_printf(seq, "%-22s %-5s %% cum %% | %-11s %% cum %%\n",
 		   name, units, units);
 
-	read_tot = lprocfs_oh_sum(read);
-	write_tot = lprocfs_oh_sum(write);
+	read_tot = lprocfs_oh_sum_pcpu(read);
+	write_tot = lprocfs_oh_sum_pcpu(write);
 
 	if (!read_tot && !write_tot)
 		return;
 
 	for (i = 0; i < OBD_HIST_MAX; i++) {
-		r = read->oh_buckets[i];
-		w = write->oh_buckets[i];
+		r = lprocfs_oh_counter_pcpu(read, i);
+		w = lprocfs_oh_counter_pcpu(write, i);
 		read_cum += r;
 		write_cum += w;
 		if (read_cum == 0 && write_cum == 0)
@@ -799,21 +799,35 @@ static ssize_t brw_stats_seq_write(struct file *file,
 	int i;
 
 	for (i = 0; i < BRW_RW_STATS_NUM; i++)
-		lprocfs_oh_clear(&brw_stats->bs_hist[i]);
+		lprocfs_oh_clear_pcpu(&brw_stats->bs_hist[i]);
 
 	return len;
 }
 
 LDEBUGFS_SEQ_FOPS(brw_stats);
 
-void lprocfs_init_brw_stats(struct brw_stats *brw_stats)
+int lprocfs_init_brw_stats(struct brw_stats *brw_stats)
+{
+	int i, result;
+
+	for (i = 0; i < BRW_RW_STATS_NUM; i++) {
+		result = lprocfs_oh_alloc_pcpu(&brw_stats->bs_hist[i]);
+		if (result)
+			break;
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(lprocfs_init_brw_stats);
+
+void lprocfs_fini_brw_stats(struct brw_stats *brw_stats)
 {
 	int i;
 
 	for (i = 0; i < BRW_RW_STATS_NUM; i++)
-		spin_lock_init(&brw_stats->bs_hist[i].oh_lock);
+		lprocfs_oh_release_pcpu(&brw_stats->bs_hist[i]);
 }
-EXPORT_SYMBOL(lprocfs_init_brw_stats);
+EXPORT_SYMBOL(lprocfs_fini_brw_stats);
 
 void ldebugfs_register_osd_stats(struct dentry *parent,
 				 struct brw_stats *brw_stats,
diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c
index cdf0a37..55403b4 100644
--- a/lustre/osd-ldiskfs/osd_handler.c
+++ b/lustre/osd-ldiskfs/osd_handler.c
@@ -8182,14 +8182,16 @@ static int osd_device_init0(const struct lu_env *env,
 
 	INIT_LIST_HEAD(&o->od_ios_list);
 
-	lprocfs_init_brw_stats(&o->od_brw_stats);
+	rc = lprocfs_init_brw_stats(&o->od_brw_stats);
+	if (rc)
+		GOTO(out_brw_stats, rc);
 
 	/* setup scrub, including OI files initialization */
 	o->od_in_init = 1;
 	rc = osd_scrub_setup(env, o, restored);
 	o->od_in_init = 0;
 	if (rc < 0)
-		GOTO(out_site, rc);
+		GOTO(out_brw_stats, rc);
 
 	rc = osd_procfs_init(o, o->od_svname);
 	if (rc != 0) {
@@ -8240,6 +8242,8 @@ out_procfs:
 	osd_procfs_fini(o);
 out_scrub:
 	osd_scrub_cleanup(env, o);
+out_brw_stats:
+	lprocfs_fini_brw_stats(&o->od_brw_stats);
 out_site:
 	lu_site_fini(&o->od_site);
 out_compat:
diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index 1db5db3..95215d6 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -147,13 +147,15 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
 	int rw = iobuf->dr_rw;
 
 	if (iobuf->dr_elapsed_valid) {
+		struct brw_stats *h = &d->od_brw_stats;
+
 		iobuf->dr_elapsed_valid = 0;
 		LASSERT(iobuf->dr_dev == d);
 		LASSERT(iobuf->dr_frags > 0);
-		lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw],
-				 iobuf->dr_frags);
-		lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw],
-				      ktime_to_ms(iobuf->dr_elapsed));
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_DIO_FRAGS+rw],
+				      iobuf->dr_frags);
+		lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_IO_TIME+rw],
+					   ktime_to_ms(iobuf->dr_elapsed));
 	}
 }
 
@@ -230,21 +232,23 @@ static void dio_complete_routine(struct bio *bio, int error)
 static void record_start_io(struct osd_iobuf *iobuf, int size)
 {
 	struct osd_device *osd = iobuf->dr_dev;
-	struct obd_histogram *h = osd->od_brw_stats.bs_hist;
+	struct brw_stats *h = &osd->od_brw_stats;
 
 	iobuf->dr_frags++;
 	atomic_inc(&iobuf->dr_numreqs);
 
 	if (iobuf->dr_rw == 0) {
 		atomic_inc(&osd->od_r_in_flight);
-		lprocfs_oh_tally(&h[BRW_R_RPC_HIST],
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_RPC_HIST],
 				 atomic_read(&osd->od_r_in_flight));
-		lprocfs_oh_tally_log2(&h[BRW_R_DISK_IOSIZE], size);
+		lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_DISK_IOSIZE],
+					   size);
 	} else if (iobuf->dr_rw == 1) {
 		atomic_inc(&osd->od_w_in_flight);
-		lprocfs_oh_tally(&h[BRW_W_RPC_HIST],
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_RPC_HIST],
 				 atomic_read(&osd->od_w_in_flight));
-		lprocfs_oh_tally_log2(&h[BRW_W_DISK_IOSIZE], size);
+		lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_W_DISK_IOSIZE],
+					   size);
 	} else {
 		LBUG();
 	}
diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c
index e2cc801..01d5982 100644
--- a/lustre/osd-ldiskfs/osd_lproc.c
+++ b/lustre/osd-ldiskfs/osd_lproc.c
@@ -57,7 +57,7 @@ void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf)
 
 	blocks_per_page = PAGE_SIZE >> osd_sb(osd)->s_blocksize_bits;
 
-	lprocfs_oh_tally_log2(&bs->bs_hist[BRW_R_PAGES + rw], nr_pages);
+	lprocfs_oh_tally_log2_pcpu(&bs->bs_hist[BRW_R_PAGES + rw], nr_pages);
 
 	while (nr_pages-- > 0) {
 		if (last_page && (*pages)->index != (last_page->index + 1))
@@ -71,8 +71,10 @@ void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf)
 		}
 	}
 
-	lprocfs_oh_tally(&bs->bs_hist[BRW_R_DISCONT_PAGES+rw], discont_pages);
-	lprocfs_oh_tally(&bs->bs_hist[BRW_R_DISCONT_BLOCKS+rw], discont_blocks);
+	lprocfs_oh_tally_pcpu(&bs->bs_hist[BRW_R_DISCONT_PAGES+rw],
+			      discont_pages);
+	lprocfs_oh_tally_pcpu(&bs->bs_hist[BRW_R_DISCONT_BLOCKS+rw],
+			      discont_blocks);
 }
 
 static int osd_stats_init(struct osd_device *osd)
@@ -828,6 +830,8 @@ out:
 
 int osd_procfs_fini(struct osd_device *osd)
 {
+	lprocfs_fini_brw_stats(&osd->od_brw_stats);
+
 	if (osd->od_stats)
 		lprocfs_free_stats(&osd->od_stats);
 
diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c
index 9552fa9..8a9d4c2 100644
--- a/lustre/osd-zfs/osd_handler.c
+++ b/lustre/osd-zfs/osd_handler.c
@@ -1191,7 +1191,9 @@ static int osd_mount(const struct lu_env *env,
 	if (opts && strstr(opts, "resetoi"))
 		resetoi = true;
 
-	lprocfs_init_brw_stats(&o->od_brw_stats);
+	rc = lprocfs_init_brw_stats(&o->od_brw_stats);
+	if (rc)
+		GOTO(err, rc);
 
 	o->od_in_init = 1;
 	rc = osd_scrub_setup(env, o, interval, resetoi);
diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c
index 9460dc9..f267367 100644
--- a/lustre/osd-zfs/osd_io.c
+++ b/lustre/osd-zfs/osd_io.c
@@ -70,36 +70,40 @@ static void dbuf_set_pending_evict(dmu_buf_t *db)
 
 static void record_start_io(struct osd_device *osd, int rw, int discont_pages)
 {
-	struct obd_histogram *h = osd->od_brw_stats.bs_hist;
+	struct brw_stats *h = &osd->od_brw_stats;
 
 	if (rw == READ) {
 		atomic_inc(&osd->od_r_in_flight);
-		lprocfs_oh_tally(&h[BRW_R_RPC_HIST],
-				 atomic_read(&osd->od_r_in_flight));
-		lprocfs_oh_tally(&h[BRW_R_DISCONT_PAGES], discont_pages);
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_RPC_HIST],
+				      atomic_read(&osd->od_r_in_flight));
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_DISCONT_PAGES],
+				      discont_pages);
 	} else {
 		atomic_inc(&osd->od_w_in_flight);
-		lprocfs_oh_tally(&h[BRW_W_RPC_HIST],
-				 atomic_read(&osd->od_w_in_flight));
-		lprocfs_oh_tally(&h[BRW_W_DISCONT_PAGES], discont_pages);
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_RPC_HIST],
+				      atomic_read(&osd->od_w_in_flight));
+		lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_DISCONT_PAGES],
+				      discont_pages);
 	}
 }
 
 static void record_end_io(struct osd_device *osd, int rw,
 			  unsigned long elapsed, int disksize, int npages)
 {
-	struct obd_histogram *h = osd->od_brw_stats.bs_hist;
+	struct brw_stats *h = &osd->od_brw_stats;
 
 	if (rw == READ)
 		atomic_dec(&osd->od_r_in_flight);
 	else
 		atomic_dec(&osd->od_w_in_flight);
 
-	lprocfs_oh_tally_log2(&h[BRW_R_PAGES + rw], npages);
+	lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_PAGES + rw], npages);
 	if (disksize > 0)
-		lprocfs_oh_tally_log2(&h[BRW_R_DISK_IOSIZE + rw], disksize);
+		lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_DISK_IOSIZE + rw],
+					   disksize);
 	if (elapsed)
-		lprocfs_oh_tally_log2(&h[BRW_R_IO_TIME + rw], elapsed);
+		lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_IO_TIME + rw],
+					   elapsed);
 }
 
 static ssize_t __osd_read(const struct lu_env *env, struct dt_object *dt,
diff --git a/lustre/osd-zfs/osd_lproc.c b/lustre/osd-zfs/osd_lproc.c
index f865343..c199717 100644
--- a/lustre/osd-zfs/osd_lproc.c
+++ b/lustre/osd-zfs/osd_lproc.c
@@ -407,6 +407,8 @@ int osd_procfs_fini(struct osd_device *osd)
 {
 	ENTRY;
 
+	lprocfs_fini_brw_stats(&osd->od_brw_stats);
+
 	if (osd->od_stats)
 		lprocfs_free_stats(&osd->od_stats);
 