From: Nathaniel Clark Date: Tue, 16 Dec 2014 15:11:09 +0000 (-0500) Subject: LU-4259 osd-zfs: Add brw_stats collection and display X-Git-Tag: 2.6.93~97 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=efdacb72afa55fc8b5ad155c479aa34ef0af118a;p=fs%2Flustre-release.git LU-4259 osd-zfs: Add brw_stats collection and display Collect stats consistant with brw_stats where possible for osd-zfs. The supported stats are: (prefixed with BRW_R and BRW_W) * PAGES * RPC_HIST * IO_TIME (reads only) * DISK_IOSIZE * DISCONTINOUS PAGES (only meaningful for writes, always single buffer for reads) Signed-off-by: Nathaniel Clark Change-Id: Icd0d703d3cf31487cdfa524613cfc550db228dba Reviewed-on: http://review.whamcloud.com/11467 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index 248dd93..536da7e 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -284,6 +284,10 @@ struct osd_device { /* quota slave instance */ struct qsd_instance *od_quota_slave; + struct brw_stats od_brw_stats; + atomic_t od_r_in_flight; + atomic_t od_w_in_flight; + /* used to debug zerocopy logic: the fields track all * allocated, loaned and referenced buffers in use. * to be removed once the change is tested well. */ diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index a289ae4..47058d2 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -68,6 +68,50 @@ static char *osd_zerocopy_tag = "zerocopy"; + +static void record_start_io(struct osd_device *osd, int rw, int npages, + int discont_pages) +{ + struct obd_histogram *h = osd->od_brw_stats.hist; + + if (rw == READ) { + atomic_inc(&osd->od_r_in_flight); + lprocfs_oh_tally(&h[BRW_R_RPC_HIST], + atomic_read(&osd->od_r_in_flight)); + lprocfs_oh_tally_log2(&h[BRW_R_PAGES], npages); + lprocfs_oh_tally(&h[BRW_R_DISCONT_PAGES], discont_pages); + + } else { + atomic_inc(&osd->od_w_in_flight); + lprocfs_oh_tally(&h[BRW_W_RPC_HIST], + atomic_read(&osd->od_w_in_flight)); + lprocfs_oh_tally_log2(&h[BRW_W_PAGES], npages); + lprocfs_oh_tally(&h[BRW_W_DISCONT_PAGES], discont_pages); + + } +} + +static void record_end_io(struct osd_device *osd, int rw, + unsigned long elapsed, int disksize) +{ + struct obd_histogram *h = osd->od_brw_stats.hist; + + if (rw == READ) { + atomic_dec(&osd->od_r_in_flight); + if (disksize > 0) + lprocfs_oh_tally_log2(&h[BRW_R_DISK_IOSIZE], disksize); + if (elapsed) + lprocfs_oh_tally_log2(&h[BRW_R_IO_TIME], elapsed); + + } else { + atomic_dec(&osd->od_w_in_flight); + if (disksize > 0) + lprocfs_oh_tally_log2(&h[BRW_W_DISK_IOSIZE], disksize); + if (elapsed) + lprocfs_oh_tally_log2(&h[BRW_W_IO_TIME], elapsed); + } +} + static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, struct lu_buf *buf, loff_t *pos, struct lustre_capa *capa) @@ -77,10 +121,13 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, uint64_t old_size; int size = buf->lb_len; int rc; + unsigned long start; LASSERT(dt_object_exists(dt)); LASSERT(obj->oo_db); + start = cfs_time_current(); + read_lock(&obj->oo_attr_lock); old_size = obj->oo_attr.la_size; read_unlock(&obj->oo_attr_lock); @@ -92,16 +139,15 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, size = old_size - *pos; } + record_start_io(osd, READ, (size >> PAGE_CACHE_SHIFT), 0); + rc = -dmu_read(osd->od_os, obj->oo_db->db_object, *pos, size, buf->lb_buf, DMU_READ_PREFETCH); + + record_end_io(osd, READ, cfs_time_current() - start, size); if (rc == 0) { rc = size; *pos += size; - - /* XXX: workaround for bug in HEAD: fsfilt_ldiskfs_read() returns - * requested number of bytes, not actually read ones */ - if (S_ISLNK(obj->oo_dt.do_lu.lo_header->loh_attr)) - rc = buf->lb_len; } return rc; } @@ -162,6 +208,7 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, struct osd_thandle *oh; uint64_t offset = *pos; int rc; + ENTRY; LASSERT(dt_object_exists(dt)); @@ -170,6 +217,8 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); + record_start_io(osd, WRITE, (buf->lb_len >> PAGE_CACHE_SHIFT), 0); + dmu_write(osd->od_os, obj->oo_db->db_object, offset, (uint64_t)buf->lb_len, buf->lb_buf, oh->ot_tx); write_lock(&obj->oo_attr_lock); @@ -191,6 +240,8 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, rc = buf->lb_len; out: + record_end_io(osd, WRITE, 0, buf->lb_len); + RETURN(rc); } @@ -540,6 +591,8 @@ static int osd_declare_write_commit(const struct lu_env *env, int i, rc, flags = 0; bool ignore_quota = false, synced = false; long long space = 0; + struct page *last_page = NULL; + unsigned long discont_pages = 0; ENTRY; LASSERT(dt_object_exists(dt)); @@ -551,6 +604,9 @@ static int osd_declare_write_commit(const struct lu_env *env, oh = container_of0(th, struct osd_thandle, ot_super); for (i = 0; i < npages; i++) { + if (last_page && lnb[i].lnb_page->index != (last_page->index + 1)) + ++discont_pages; + last_page = lnb[i].lnb_page; if (lnb[i].lnb_rc) /* ENOSPC, network RPC error, etc. * We don't want to book space for pages which will be @@ -611,6 +667,7 @@ static int osd_declare_write_commit(const struct lu_env *env, CDEBUG(D_QUOTA, "writting %d pages, reserving "LPD64"K of quota " "space\n", npages, space); + record_start_io(osd, WRITE, npages, discont_pages); retry: /* acquire quota space if needed */ rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid, @@ -645,6 +702,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, struct osd_thandle *oh; uint64_t new_size = 0; int i, rc = 0; + unsigned long iosize = 0; ENTRY; LASSERT(dt_object_exists(dt)); @@ -689,6 +747,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, if (new_size < lnb[i].lnb_file_offset + lnb[i].lnb_len) new_size = lnb[i].lnb_file_offset + lnb[i].lnb_len; + iosize += lnb[i].lnb_len; } if (unlikely(new_size == 0)) { @@ -696,6 +755,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, th->th_local = 1; /* it is important to return 0 even when all lnb_rc == -ENOSPC * since ofd_commitrw_write() retries several times on ENOSPC */ + record_end_io(osd, WRITE, 0, 0); RETURN(0); } @@ -712,6 +772,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, write_unlock(&obj->oo_attr_lock); } + record_end_io(osd, WRITE, 0, iosize); + RETURN(rc); } @@ -719,13 +781,20 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, struct niobuf_local *lnb, int npages) { struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *osd = osd_obj2dev(obj); struct lu_buf buf; loff_t offset; int i; + unsigned long start; + unsigned long size = 0; LASSERT(dt_object_exists(dt)); LASSERT(obj->oo_db); + start = cfs_time_current(); + + record_start_io(osd, READ, npages, 0); + for (i = 0; i < npages; i++) { buf.lb_buf = kmap(lnb[i].lnb_page); buf.lb_len = lnb[i].lnb_len; @@ -737,6 +806,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, lnb[i].lnb_rc = osd_read(env, dt, &buf, &offset, NULL); kunmap(lnb[i].lnb_page); + size += lnb[i].lnb_rc; + if (lnb[i].lnb_rc < buf.lb_len) { /* all subsequent rc should be 0 */ while (++i < npages) @@ -745,6 +816,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, } } + record_end_io(osd, READ, cfs_time_current() - start, size); + return 0; } diff --git a/lustre/osd-zfs/osd_lproc.c b/lustre/osd-zfs/osd_lproc.c index 622bc5b..8146c16 100644 --- a/lustre/osd-zfs/osd_lproc.c +++ b/lustre/osd-zfs/osd_lproc.c @@ -50,6 +50,110 @@ #ifdef LPROCFS +#define pct(a, b) (b ? a * 100 / b : 0) + +static void display_brw_stats(struct seq_file *seq, char *name, char *units, + struct obd_histogram *read, + struct obd_histogram *write, int scale) +{ + unsigned long read_tot, write_tot, r, w, read_cum = 0, write_cum = 0; + int i; + + seq_printf(seq, "\n%26s read | write\n", " "); + seq_printf(seq, "%-22s %-5s %% cum %% | %-11s %% cum %%\n", + name, units, units); + + read_tot = lprocfs_oh_sum(read); + write_tot = lprocfs_oh_sum(write); + for (i = 0; i < OBD_HIST_MAX; i++) { + r = read->oh_buckets[i]; + w = write->oh_buckets[i]; + read_cum += r; + write_cum += w; + if (read_cum == 0 && write_cum == 0) + continue; + + if (!scale) + seq_printf(seq, "%u", i); + else if (i < 10) + seq_printf(seq, "%u", scale << i); + else if (i < 20) + seq_printf(seq, "%uK", scale << (i-10)); + else + seq_printf(seq, "%uM", scale << (i-20)); + + seq_printf(seq, ":\t\t%10lu %3lu %3lu | %4lu %3lu %3lu\n", + r, pct(r, read_tot), pct(read_cum, read_tot), + w, pct(w, write_tot), pct(write_cum, write_tot)); + + if (read_cum == read_tot && write_cum == write_tot) + break; + } +} + +static void brw_stats_show(struct seq_file *seq, struct brw_stats *brw_stats) +{ + struct timeval now; + + /* this sampling races with updates */ + do_gettimeofday(&now); + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + + display_brw_stats(seq, "pages per bulk r/w", "rpcs", + &brw_stats->hist[BRW_R_PAGES], + &brw_stats->hist[BRW_W_PAGES], 1); + display_brw_stats(seq, "discontiguous pages", "rpcs", + &brw_stats->hist[BRW_R_DISCONT_PAGES], + &brw_stats->hist[BRW_W_DISCONT_PAGES], 0); +#if 0 + display_brw_stats(seq, "discontiguous blocks", "rpcs", + &brw_stats->hist[BRW_R_DISCONT_BLOCKS], + &brw_stats->hist[BRW_W_DISCONT_BLOCKS], 0); + + display_brw_stats(seq, "disk fragmented I/Os", "ios", + &brw_stats->hist[BRW_R_DIO_FRAGS], + &brw_stats->hist[BRW_W_DIO_FRAGS], 0); +#endif + display_brw_stats(seq, "disk I/Os in flight", "ios", + &brw_stats->hist[BRW_R_RPC_HIST], + &brw_stats->hist[BRW_W_RPC_HIST], 0); + + display_brw_stats(seq, "I/O time (1/1000s)", "ios", + &brw_stats->hist[BRW_R_IO_TIME], + &brw_stats->hist[BRW_W_IO_TIME], 1000 / HZ); + + display_brw_stats(seq, "disk I/O size", "ios", + &brw_stats->hist[BRW_R_DISK_IOSIZE], + &brw_stats->hist[BRW_W_DISK_IOSIZE], 1); +} + +#undef pct + +static int osd_brw_stats_seq_show(struct seq_file *seq, void *v) +{ + struct osd_device *osd = seq->private; + + brw_stats_show(seq, &osd->od_brw_stats); + + return 0; +} + +static ssize_t osd_brw_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct osd_device *osd = seq->private; + int i; + + for (i = 0; i < BRW_LAST; i++) + lprocfs_oh_clear(&osd->od_brw_stats.hist[i]); + + return len; +} + +LPROC_SEQ_FOPS(osd_brw_stats); + static int osd_stats_init(struct osd_device *osd) { int result; @@ -97,6 +201,8 @@ static int osd_stats_init(struct osd_device *osd) LPROCFS_CNTR_AVGMINMAX, "thandle_closing", "usec"); #endif + result = lprocfs_seq_create(osd->od_proc_entry, "brw_stats", + 0644, &osd_brw_stats_fops, osd); } else { result = -ENOMEM; }