Whamcloud - gitweb
LU-13309 osd: use per-cpu counters for brw_stats 15/37915/14
author Andrew Perepechko <andrew.perepechko@hpe.com>
Thu, 2 Dec 2021 07:26:32 +0000 (10:26 +0300)
committer Oleg Drokin <green@whamcloud.com>
Tue, 11 Jan 2022 06:18:23 +0000 (06:18 +0000)
Based on perf reports, oh_lock is highly contended
when running IOR with NVMe storage, so we need to
move to per-cpu counters.

struct brw_stats becomes larger: from 3872 to 18208 bytes.
Also, 4 bytes are allocated on each cpu for every counter.
On an 8-cpu system, with 14 histograms of 32 4-byte
per-cpu counters each, there are 448 per-cpu counters,
or 1792 bytes per cpu.
These counters will either reuse already
allocated per-cpu pages or allocate a new page on each cpu
(8 pages total).

Change-Id: I24536a0138067fb868aaf962d9321dea7566d13f
Signed-off-by: Andrew Perepechko <andrew.perepechko@hpe.com>
HPE-bug-id: LUS-8007, LUS-8185
Reviewed-on: https://review.whamcloud.com/37915
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/lprocfs_status.h
lustre/obdclass/lprocfs_status.c
lustre/obdclass/lprocfs_status_server.c
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_io.c
lustre/osd-ldiskfs/osd_lproc.c
lustre/osd-zfs/osd_handler.c
lustre/osd-zfs/osd_io.c
lustre/osd-zfs/osd_lproc.c

index 9b2ba94..85fab90 100644 (file)
@@ -102,6 +102,11 @@ struct obd_histogram {
        unsigned long   oh_buckets[OBD_HIST_MAX];
 };
 
+/* Histogram whose buckets are kept in per-cpu counters. */
+struct obd_hist_pcpu {
+       /* one counter per histogram bucket */
+       struct percpu_counter   oh_pc_buckets[OBD_HIST_MAX];
+       /* set by lprocfs_oh_alloc_pcpu(), cleared by the release call */
+       bool                    oh_initialized;
+};
+
 enum {
         RENAME_SAMEDIR_SIZE = 0,
         RENAME_CROSSDIR_SRC_SIZE,
@@ -412,11 +417,12 @@ struct brw_stats_props {
 
 struct brw_stats {
        ktime_t                 bs_init;
-       struct obd_histogram    bs_hist[BRW_RW_STATS_NUM];
+       struct obd_hist_pcpu    bs_hist[BRW_RW_STATS_NUM];
        struct brw_stats_props  bs_props[BRW_RW_STATS_NUM / 2];
 };
 
-void lprocfs_init_brw_stats(struct brw_stats *brw_stats);
+int lprocfs_init_brw_stats(struct brw_stats *brw_stats);
+void lprocfs_fini_brw_stats(struct brw_stats *brw_stats);
 
 void ldebugfs_register_osd_stats(struct dentry *parent,
                                 struct brw_stats *brw_stats,
@@ -650,6 +656,15 @@ void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value);
 void lprocfs_oh_clear(struct obd_histogram *oh);
 unsigned long lprocfs_oh_sum(struct obd_histogram *oh);
 
+void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh, unsigned int value);
+void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh, unsigned int value);
+int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh);
+void lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh);
+void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh);
+unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh);
+unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh,
+                     unsigned int value);
+
 void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
                            struct lprocfs_counter *cnt);
 
index 2872e89..1d5b9b9 100644 (file)
@@ -1941,6 +1941,95 @@ void lprocfs_oh_clear(struct obd_histogram *oh)
 }
 EXPORT_SYMBOL(lprocfs_oh_clear);
 
+/**
+ * Count one sample in bucket \a value of the per-cpu histogram \a oh.
+ *
+ * Indices at or beyond OBD_HIST_MAX are accounted in the last bucket.
+ */
+void lprocfs_oh_tally_pcpu(struct obd_hist_pcpu *oh,
+                          unsigned int value)
+{
+       unsigned int bucket;
+
+       bucket = value < OBD_HIST_MAX ? value : OBD_HIST_MAX - 1;
+       percpu_counter_inc(&oh->oh_pc_buckets[bucket]);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_pcpu);
+
+/**
+ * Count one sample in the log2 bucket of \a value: bucket
+ * fls(value - 1), or bucket 0 when \a value is 0.
+ */
+void lprocfs_oh_tally_log2_pcpu(struct obd_hist_pcpu *oh,
+                               unsigned int value)
+{
+       unsigned int bucket;
+
+       if (unlikely(value == 0))
+               bucket = 0;
+       else
+               bucket = min(fls(value - 1), OBD_HIST_MAX);
+
+       /* an OBD_HIST_MAX result is clamped by lprocfs_oh_tally_pcpu() */
+       lprocfs_oh_tally_pcpu(oh, bucket);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2_pcpu);
+
+/**
+ * Read the sample count of bucket \a value of the histogram \a oh,
+ * summed over all cpus.
+ *
+ * \retval 0 if \a value does not name a bucket (>= OBD_HIST_MAX)
+ */
+unsigned long lprocfs_oh_counter_pcpu(struct obd_hist_pcpu *oh,
+                                     unsigned int value)
+{
+       /* no such bucket: report it empty rather than read past the array */
+       if (value >= OBD_HIST_MAX)
+               return 0;
+
+       return percpu_counter_sum(&oh->oh_pc_buckets[value]);
+}
+EXPORT_SYMBOL(lprocfs_oh_counter_pcpu);
+
+/**
+ * Total number of samples recorded in \a oh, summed over all buckets.
+ *
+ * \retval 0 if the counters of \a oh are not allocated
+ */
+unsigned long lprocfs_oh_sum_pcpu(struct obd_hist_pcpu *oh)
+{
+       unsigned long ret = 0;
+       int i;
+
+       /* percpu_counter_sum() must not be run on unallocated counters */
+       if (!oh->oh_initialized)
+               return 0;
+
+       for (i = 0; i < OBD_HIST_MAX; i++)
+               ret += percpu_counter_sum(&oh->oh_pc_buckets[i]);
+
+       return ret;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum_pcpu);
+
+/**
+ * Initialize every per-cpu bucket counter of \a oh to zero.
+ *
+ * Does nothing if \a oh is already marked initialized.  If a counter
+ * cannot be set up, the ones initialized so far are destroyed again
+ * and \a oh stays marked uninitialized.
+ *
+ * \retval 0   on success
+ * \retval rc  the non-zero value returned by percpu_counter_init()
+ */
+int lprocfs_oh_alloc_pcpu(struct obd_hist_pcpu *oh)
+{
+       int rc = 0;
+       int nr;
+
+       if (oh->oh_initialized)
+               return 0;
+
+       for (nr = 0; nr < OBD_HIST_MAX; nr++) {
+               rc = percpu_counter_init(&oh->oh_pc_buckets[nr], 0,
+                                        GFP_KERNEL);
+               if (rc)
+                       break;
+       }
+
+       if (rc) {
+               /* roll back buckets [0, nr) that were set up */
+               while (--nr >= 0)
+                       percpu_counter_destroy(&oh->oh_pc_buckets[nr]);
+               return rc;
+       }
+
+       oh->oh_initialized = true;
+       return 0;
+}
+EXPORT_SYMBOL(lprocfs_oh_alloc_pcpu);
+
+/**
+ * Reset every bucket of \a oh to zero.
+ *
+ * Does nothing if the counters of \a oh are not allocated, the same
+ * way lprocfs_oh_release_pcpu() does.
+ */
+void lprocfs_oh_clear_pcpu(struct obd_hist_pcpu *oh)
+{
+       int i;
+
+       /* percpu_counter_set() must not be run on unallocated counters */
+       if (!oh->oh_initialized)
+               return;
+
+       for (i = 0; i < OBD_HIST_MAX; i++)
+               percpu_counter_set(&oh->oh_pc_buckets[i], 0);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear_pcpu);
+
+/**
+ * Destroy the per-cpu bucket counters of \a oh and mark it
+ * uninitialized.  Does nothing if \a oh is not marked initialized,
+ * so calling it more than once is harmless.
+ */
+void lprocfs_oh_release_pcpu(struct obd_hist_pcpu *oh)
+{
+       struct percpu_counter *pc;
+
+       if (!oh->oh_initialized)
+               return;
+
+       for (pc = oh->oh_pc_buckets;
+            pc < oh->oh_pc_buckets + OBD_HIST_MAX; pc++)
+               percpu_counter_destroy(pc);
+
+       oh->oh_initialized = false;
+}
+EXPORT_SYMBOL(lprocfs_oh_release_pcpu);
+
 ssize_t lustre_attr_show(struct kobject *kobj,
                         struct attribute *attr, char *buf)
 {
index a3ddaa9..9a30639 100644 (file)
@@ -702,8 +702,8 @@ void lprocfs_free_obd_stats(struct obd_device *obd)
 EXPORT_SYMBOL(lprocfs_free_obd_stats);
 
 static void display_brw_stats(struct seq_file *seq, const char *name,
-                             const char *units, struct obd_histogram *read,
-                             struct obd_histogram *write, bool scale)
+                             const char *units, struct obd_hist_pcpu *read,
+                             struct obd_hist_pcpu *write, bool scale)
 {
        unsigned long read_tot, write_tot, r, w, read_cum = 0, write_cum = 0;
        unsigned int i;
@@ -712,15 +712,15 @@ static void display_brw_stats(struct seq_file *seq, const char *name,
        seq_printf(seq, "%-22s %-5s %% cum %% |  %-11s %% cum %%\n",
                   name, units, units);
 
-       read_tot = lprocfs_oh_sum(read);
-       write_tot = lprocfs_oh_sum(write);
+       read_tot = lprocfs_oh_sum_pcpu(read);
+       write_tot = lprocfs_oh_sum_pcpu(write);
 
        if (!read_tot && !write_tot)
                return;
 
        for (i = 0; i < OBD_HIST_MAX; i++) {
-               r = read->oh_buckets[i];
-               w = write->oh_buckets[i];
+               r = lprocfs_oh_counter_pcpu(read, i);
+               w = lprocfs_oh_counter_pcpu(write, i);
                read_cum += r;
                write_cum += w;
                if (read_cum == 0 && write_cum == 0)
@@ -799,21 +799,35 @@ static ssize_t brw_stats_seq_write(struct file *file,
        int i;
 
        for (i = 0; i < BRW_RW_STATS_NUM; i++)
-               lprocfs_oh_clear(&brw_stats->bs_hist[i]);
+               lprocfs_oh_clear_pcpu(&brw_stats->bs_hist[i]);
 
        return len;
 }
 
 LDEBUGFS_SEQ_FOPS(brw_stats);
 
-void lprocfs_init_brw_stats(struct brw_stats *brw_stats)
+/**
+ * Allocate the per-cpu histograms of \a brw_stats.
+ *
+ * If one histogram cannot be allocated, the histograms before it are
+ * released again, so nothing stays allocated on failure.  Running
+ * lprocfs_fini_brw_stats() afterwards is still harmless, because
+ * lprocfs_oh_release_pcpu() skips histograms that are not initialized.
+ *
+ * \retval 0   on success
+ * \retval rc  the non-zero value returned by lprocfs_oh_alloc_pcpu()
+ */
+int lprocfs_init_brw_stats(struct brw_stats *brw_stats)
+{
+       int i, result = 0;
+
+       for (i = 0; i < BRW_RW_STATS_NUM; i++) {
+               result = lprocfs_oh_alloc_pcpu(&brw_stats->bs_hist[i]);
+               if (result)
+                       break;
+       }
+
+       if (result) {
+               /* bs_hist[i] undid its own counters; release the rest */
+               while (--i >= 0)
+                       lprocfs_oh_release_pcpu(&brw_stats->bs_hist[i]);
+       }
+
+       return result;
+}
+EXPORT_SYMBOL(lprocfs_init_brw_stats);
+
+void lprocfs_fini_brw_stats(struct brw_stats *brw_stats)
 {
        int i;
 
        for (i = 0; i < BRW_RW_STATS_NUM; i++)
-               spin_lock_init(&brw_stats->bs_hist[i].oh_lock);
+               lprocfs_oh_release_pcpu(&brw_stats->bs_hist[i]);
 }
-EXPORT_SYMBOL(lprocfs_init_brw_stats);
+EXPORT_SYMBOL(lprocfs_fini_brw_stats);
 
 void ldebugfs_register_osd_stats(struct dentry *parent,
                                 struct brw_stats *brw_stats,
index cdf0a37..55403b4 100644 (file)
@@ -8182,14 +8182,16 @@ static int osd_device_init0(const struct lu_env *env,
 
        INIT_LIST_HEAD(&o->od_ios_list);
 
-       lprocfs_init_brw_stats(&o->od_brw_stats);
+       rc = lprocfs_init_brw_stats(&o->od_brw_stats);
+       if (rc)
+               GOTO(out_brw_stats, rc);
 
        /* setup scrub, including OI files initialization */
        o->od_in_init = 1;
        rc = osd_scrub_setup(env, o, restored);
        o->od_in_init = 0;
        if (rc < 0)
-               GOTO(out_site, rc);
+               GOTO(out_brw_stats, rc);
 
        rc = osd_procfs_init(o, o->od_svname);
        if (rc != 0) {
@@ -8240,6 +8242,8 @@ out_procfs:
        osd_procfs_fini(o);
 out_scrub:
        osd_scrub_cleanup(env, o);
+out_brw_stats:
+       lprocfs_fini_brw_stats(&o->od_brw_stats);
 out_site:
        lu_site_fini(&o->od_site);
 out_compat:
index 1db5db3..95215d6 100644 (file)
@@ -147,13 +147,15 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
        int rw = iobuf->dr_rw;
 
        if (iobuf->dr_elapsed_valid) {
+               struct brw_stats *h = &d->od_brw_stats;
+
                iobuf->dr_elapsed_valid = 0;
                LASSERT(iobuf->dr_dev == d);
                LASSERT(iobuf->dr_frags > 0);
-               lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw],
-                                iobuf->dr_frags);
-               lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw],
-                                     ktime_to_ms(iobuf->dr_elapsed));
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_DIO_FRAGS+rw],
+                                     iobuf->dr_frags);
+               lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_IO_TIME+rw],
+                                          ktime_to_ms(iobuf->dr_elapsed));
        }
 }
 
@@ -230,21 +232,23 @@ static void dio_complete_routine(struct bio *bio, int error)
 static void record_start_io(struct osd_iobuf *iobuf, int size)
 {
        struct osd_device *osd = iobuf->dr_dev;
-       struct obd_histogram *h = osd->od_brw_stats.bs_hist;
+       struct brw_stats *h = &osd->od_brw_stats;
 
        iobuf->dr_frags++;
        atomic_inc(&iobuf->dr_numreqs);
 
        if (iobuf->dr_rw == 0) {
                atomic_inc(&osd->od_r_in_flight);
-               lprocfs_oh_tally(&h[BRW_R_RPC_HIST],
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_RPC_HIST],
                                 atomic_read(&osd->od_r_in_flight));
-               lprocfs_oh_tally_log2(&h[BRW_R_DISK_IOSIZE], size);
+               lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_DISK_IOSIZE],
+                                          size);
        } else if (iobuf->dr_rw == 1) {
                atomic_inc(&osd->od_w_in_flight);
-               lprocfs_oh_tally(&h[BRW_W_RPC_HIST],
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_RPC_HIST],
                                 atomic_read(&osd->od_w_in_flight));
-               lprocfs_oh_tally_log2(&h[BRW_W_DISK_IOSIZE], size);
+               lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_W_DISK_IOSIZE],
+                                          size);
        } else {
                LBUG();
        }
index e2cc801..01d5982 100644 (file)
@@ -57,7 +57,7 @@ void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf)
 
        blocks_per_page = PAGE_SIZE >> osd_sb(osd)->s_blocksize_bits;
 
-       lprocfs_oh_tally_log2(&bs->bs_hist[BRW_R_PAGES + rw], nr_pages);
+       lprocfs_oh_tally_log2_pcpu(&bs->bs_hist[BRW_R_PAGES + rw], nr_pages);
 
        while (nr_pages-- > 0) {
                if (last_page && (*pages)->index != (last_page->index + 1))
@@ -71,8 +71,10 @@ void osd_brw_stats_update(struct osd_device *osd, struct osd_iobuf *iobuf)
                }
        }
 
-       lprocfs_oh_tally(&bs->bs_hist[BRW_R_DISCONT_PAGES+rw], discont_pages);
-       lprocfs_oh_tally(&bs->bs_hist[BRW_R_DISCONT_BLOCKS+rw], discont_blocks);
+       lprocfs_oh_tally_pcpu(&bs->bs_hist[BRW_R_DISCONT_PAGES+rw],
+                             discont_pages);
+       lprocfs_oh_tally_pcpu(&bs->bs_hist[BRW_R_DISCONT_BLOCKS+rw],
+                             discont_blocks);
 }
 
 static int osd_stats_init(struct osd_device *osd)
@@ -828,6 +830,8 @@ out:
 
 int osd_procfs_fini(struct osd_device *osd)
 {
+       lprocfs_fini_brw_stats(&osd->od_brw_stats);
+
        if (osd->od_stats)
                lprocfs_free_stats(&osd->od_stats);
 
index 9552fa9..8a9d4c2 100644 (file)
@@ -1191,7 +1191,9 @@ static int osd_mount(const struct lu_env *env,
        if (opts && strstr(opts, "resetoi"))
                resetoi = true;
 
-       lprocfs_init_brw_stats(&o->od_brw_stats);
+       rc = lprocfs_init_brw_stats(&o->od_brw_stats);
+       if (rc)
+               GOTO(err, rc);
 
        o->od_in_init = 1;
        rc = osd_scrub_setup(env, o, interval, resetoi);
index 9460dc9..f267367 100644 (file)
@@ -70,36 +70,40 @@ static void dbuf_set_pending_evict(dmu_buf_t *db)
 
 static void record_start_io(struct osd_device *osd, int rw, int discont_pages)
 {
-       struct obd_histogram *h = osd->od_brw_stats.bs_hist;
+       struct brw_stats *h = &osd->od_brw_stats;
 
        if (rw == READ) {
                atomic_inc(&osd->od_r_in_flight);
-               lprocfs_oh_tally(&h[BRW_R_RPC_HIST],
-                                atomic_read(&osd->od_r_in_flight));
-               lprocfs_oh_tally(&h[BRW_R_DISCONT_PAGES], discont_pages);
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_RPC_HIST],
+                                     atomic_read(&osd->od_r_in_flight));
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_R_DISCONT_PAGES],
+                                     discont_pages);
        } else {
                atomic_inc(&osd->od_w_in_flight);
-               lprocfs_oh_tally(&h[BRW_W_RPC_HIST],
-                                atomic_read(&osd->od_w_in_flight));
-               lprocfs_oh_tally(&h[BRW_W_DISCONT_PAGES], discont_pages);
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_RPC_HIST],
+                                     atomic_read(&osd->od_w_in_flight));
+               lprocfs_oh_tally_pcpu(&h->bs_hist[BRW_W_DISCONT_PAGES],
+                                     discont_pages);
        }
 }
 
 static void record_end_io(struct osd_device *osd, int rw,
                          unsigned long elapsed, int disksize, int npages)
 {
-       struct obd_histogram *h = osd->od_brw_stats.bs_hist;
+       struct brw_stats *h = &osd->od_brw_stats;
 
        if (rw == READ)
                atomic_dec(&osd->od_r_in_flight);
        else
                atomic_dec(&osd->od_w_in_flight);
 
-       lprocfs_oh_tally_log2(&h[BRW_R_PAGES + rw], npages);
+       lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_PAGES + rw], npages);
        if (disksize > 0)
-               lprocfs_oh_tally_log2(&h[BRW_R_DISK_IOSIZE + rw], disksize);
+               lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_DISK_IOSIZE + rw],
+                                          disksize);
        if (elapsed)
-               lprocfs_oh_tally_log2(&h[BRW_R_IO_TIME + rw], elapsed);
+               lprocfs_oh_tally_log2_pcpu(&h->bs_hist[BRW_R_IO_TIME + rw],
+                                           elapsed);
 }
 
 static ssize_t __osd_read(const struct lu_env *env, struct dt_object *dt,
index f865343..c199717 100644 (file)
@@ -407,6 +407,8 @@ int osd_procfs_fini(struct osd_device *osd)
 {
        ENTRY;
 
+       lprocfs_fini_brw_stats(&osd->od_brw_stats);
+
        if (osd->od_stats)
                lprocfs_free_stats(&osd->od_stats);