Whamcloud - gitweb
LU-4259 osd-zfs: Add brw_stats collection and display 67/11467/4
authorNathaniel Clark <nathaniel.l.clark@intel.com>
Tue, 16 Dec 2014 15:11:09 +0000 (10:11 -0500)
committerOleg Drokin <oleg.drokin@intel.com>
Sat, 3 Jan 2015 03:22:29 +0000 (03:22 +0000)
Collect stats consistant with brw_stats where possible for osd-zfs.
The supported stats are: (prefixed with BRW_R and BRW_W)
 * PAGES
 * RPC_HIST
 * IO_TIME (reads only)
 * DISK_IOSIZE
 * DISCONTINOUS PAGES (only meaningful for writes, always single
buffer for reads)

Signed-off-by: Nathaniel Clark <nathaniel.l.clark@intel.com>
Change-Id: Icd0d703d3cf31487cdfa524613cfc550db228dba
Reviewed-on: http://review.whamcloud.com/11467
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/osd-zfs/osd_internal.h
lustre/osd-zfs/osd_io.c
lustre/osd-zfs/osd_lproc.c

index 248dd93..536da7e 100644 (file)
@@ -284,6 +284,10 @@ struct osd_device {
        /* quota slave instance */
        struct qsd_instance     *od_quota_slave;
 
+       struct brw_stats        od_brw_stats;
+       atomic_t                od_r_in_flight;
+       atomic_t                od_w_in_flight;
+
        /* used to debug zerocopy logic: the fields track all
         * allocated, loaned and referenced buffers in use.
         * to be removed once the change is tested well. */
index a289ae4..47058d2 100644 (file)
 
 static char *osd_zerocopy_tag = "zerocopy";
 
+
+static void record_start_io(struct osd_device *osd, int rw, int npages,
+                           int discont_pages)
+{
+       struct obd_histogram *h = osd->od_brw_stats.hist;
+
+       if (rw == READ) {
+               atomic_inc(&osd->od_r_in_flight);
+               lprocfs_oh_tally(&h[BRW_R_RPC_HIST],
+                                atomic_read(&osd->od_r_in_flight));
+               lprocfs_oh_tally_log2(&h[BRW_R_PAGES], npages);
+               lprocfs_oh_tally(&h[BRW_R_DISCONT_PAGES], discont_pages);
+
+       } else {
+               atomic_inc(&osd->od_w_in_flight);
+               lprocfs_oh_tally(&h[BRW_W_RPC_HIST],
+                                atomic_read(&osd->od_w_in_flight));
+               lprocfs_oh_tally_log2(&h[BRW_W_PAGES], npages);
+               lprocfs_oh_tally(&h[BRW_W_DISCONT_PAGES], discont_pages);
+
+       }
+}
+
+static void record_end_io(struct osd_device *osd, int rw,
+                         unsigned long elapsed, int disksize)
+{
+       struct obd_histogram *h = osd->od_brw_stats.hist;
+
+       if (rw == READ) {
+               atomic_dec(&osd->od_r_in_flight);
+               if (disksize > 0)
+                       lprocfs_oh_tally_log2(&h[BRW_R_DISK_IOSIZE], disksize);
+               if (elapsed)
+                       lprocfs_oh_tally_log2(&h[BRW_R_IO_TIME], elapsed);
+
+       } else {
+               atomic_dec(&osd->od_w_in_flight);
+               if (disksize > 0)
+                       lprocfs_oh_tally_log2(&h[BRW_W_DISK_IOSIZE], disksize);
+               if (elapsed)
+                       lprocfs_oh_tally_log2(&h[BRW_W_IO_TIME], elapsed);
+       }
+}
+
 static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
                        struct lu_buf *buf, loff_t *pos,
                        struct lustre_capa *capa)
@@ -77,10 +121,13 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
        uint64_t           old_size;
        int                size = buf->lb_len;
        int                rc;
+       unsigned long      start;
 
        LASSERT(dt_object_exists(dt));
        LASSERT(obj->oo_db);
 
+       start = cfs_time_current();
+
        read_lock(&obj->oo_attr_lock);
        old_size = obj->oo_attr.la_size;
        read_unlock(&obj->oo_attr_lock);
@@ -92,16 +139,15 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt,
                        size = old_size - *pos;
        }
 
+       record_start_io(osd, READ, (size >> PAGE_CACHE_SHIFT), 0);
+
        rc = -dmu_read(osd->od_os, obj->oo_db->db_object, *pos, size,
                        buf->lb_buf, DMU_READ_PREFETCH);
+
+       record_end_io(osd, READ, cfs_time_current() - start, size);
        if (rc == 0) {
                rc = size;
                *pos += size;
-
-               /* XXX: workaround for bug in HEAD: fsfilt_ldiskfs_read() returns
-                * requested number of bytes, not actually read ones */
-               if (S_ISLNK(obj->oo_dt.do_lu.lo_header->loh_attr))
-                       rc = buf->lb_len;
        }
        return rc;
 }
@@ -162,6 +208,7 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
        struct osd_thandle *oh;
        uint64_t            offset = *pos;
        int                 rc;
+
        ENTRY;
 
        LASSERT(dt_object_exists(dt));
@@ -170,6 +217,8 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
        LASSERT(th != NULL);
        oh = container_of0(th, struct osd_thandle, ot_super);
 
+       record_start_io(osd, WRITE, (buf->lb_len >> PAGE_CACHE_SHIFT), 0);
+
        dmu_write(osd->od_os, obj->oo_db->db_object, offset,
                (uint64_t)buf->lb_len, buf->lb_buf, oh->ot_tx);
        write_lock(&obj->oo_attr_lock);
@@ -191,6 +240,8 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
        rc = buf->lb_len;
 
 out:
+       record_end_io(osd, WRITE, 0, buf->lb_len);
+
        RETURN(rc);
 }
 
@@ -540,6 +591,8 @@ static int osd_declare_write_commit(const struct lu_env *env,
        int                 i, rc, flags = 0;
        bool                ignore_quota = false, synced = false;
        long long           space = 0;
+       struct page        *last_page = NULL;
+       unsigned long       discont_pages = 0;
        ENTRY;
 
        LASSERT(dt_object_exists(dt));
@@ -551,6 +604,9 @@ static int osd_declare_write_commit(const struct lu_env *env,
        oh = container_of0(th, struct osd_thandle, ot_super);
 
        for (i = 0; i < npages; i++) {
+               if (last_page && lnb[i].lnb_page->index != (last_page->index + 1))
+                       ++discont_pages;
+               last_page = lnb[i].lnb_page;
                if (lnb[i].lnb_rc)
                        /* ENOSPC, network RPC error, etc.
                         * We don't want to book space for pages which will be
@@ -611,6 +667,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
        CDEBUG(D_QUOTA, "writting %d pages, reserving "LPD64"K of quota "
               "space\n", npages, space);
 
+       record_start_io(osd, WRITE, npages, discont_pages);
 retry:
        /* acquire quota space if needed */
        rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
@@ -645,6 +702,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
        struct osd_thandle *oh;
        uint64_t            new_size = 0;
        int                 i, rc = 0;
+       unsigned long      iosize = 0;
        ENTRY;
 
        LASSERT(dt_object_exists(dt));
@@ -689,6 +747,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
 
                if (new_size < lnb[i].lnb_file_offset + lnb[i].lnb_len)
                        new_size = lnb[i].lnb_file_offset + lnb[i].lnb_len;
+               iosize += lnb[i].lnb_len;
        }
 
        if (unlikely(new_size == 0)) {
@@ -696,6 +755,7 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                th->th_local = 1;
                /* it is important to return 0 even when all lnb_rc == -ENOSPC
                 * since ofd_commitrw_write() retries several times on ENOSPC */
+               record_end_io(osd, WRITE, 0, 0);
                RETURN(0);
        }
 
@@ -712,6 +772,8 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
                write_unlock(&obj->oo_attr_lock);
        }
 
+       record_end_io(osd, WRITE, 0, iosize);
+
        RETURN(rc);
 }
 
@@ -719,13 +781,20 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
                        struct niobuf_local *lnb, int npages)
 {
        struct osd_object *obj  = osd_dt_obj(dt);
+       struct osd_device  *osd = osd_obj2dev(obj);
        struct lu_buf      buf;
        loff_t             offset;
        int                i;
+       unsigned long      start;
+       unsigned long      size = 0;
 
        LASSERT(dt_object_exists(dt));
        LASSERT(obj->oo_db);
 
+       start = cfs_time_current();
+
+       record_start_io(osd, READ, npages, 0);
+
        for (i = 0; i < npages; i++) {
                buf.lb_buf = kmap(lnb[i].lnb_page);
                buf.lb_len = lnb[i].lnb_len;
@@ -737,6 +806,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
                lnb[i].lnb_rc = osd_read(env, dt, &buf, &offset, NULL);
                kunmap(lnb[i].lnb_page);
 
+               size += lnb[i].lnb_rc;
+
                if (lnb[i].lnb_rc < buf.lb_len) {
                        /* all subsequent rc should be 0 */
                        while (++i < npages)
@@ -745,6 +816,8 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt,
                }
        }
 
+       record_end_io(osd, READ, cfs_time_current() - start, size);
+
        return 0;
 }
 
index 622bc5b..8146c16 100644 (file)
 
 #ifdef LPROCFS
 
+#define pct(a, b) (b ? a * 100 / b : 0)
+
+static void display_brw_stats(struct seq_file *seq, char *name, char *units,
+                             struct obd_histogram *read,
+                             struct obd_histogram *write, int scale)
+{
+       unsigned long read_tot, write_tot, r, w, read_cum = 0, write_cum = 0;
+       int i;
+
+       seq_printf(seq, "\n%26s read      |     write\n", " ");
+       seq_printf(seq, "%-22s %-5s %% cum %% |  %-11s %% cum %%\n",
+                  name, units, units);
+
+       read_tot = lprocfs_oh_sum(read);
+       write_tot = lprocfs_oh_sum(write);
+       for (i = 0; i < OBD_HIST_MAX; i++) {
+               r = read->oh_buckets[i];
+               w = write->oh_buckets[i];
+               read_cum += r;
+               write_cum += w;
+               if (read_cum == 0 && write_cum == 0)
+                       continue;
+
+               if (!scale)
+                       seq_printf(seq, "%u", i);
+               else if (i < 10)
+                       seq_printf(seq, "%u", scale << i);
+               else if (i < 20)
+                       seq_printf(seq, "%uK", scale << (i-10));
+               else
+                       seq_printf(seq, "%uM", scale << (i-20));
+
+               seq_printf(seq, ":\t\t%10lu %3lu %3lu   | %4lu %3lu %3lu\n",
+                          r, pct(r, read_tot), pct(read_cum, read_tot),
+                          w, pct(w, write_tot), pct(write_cum, write_tot));
+
+               if (read_cum == read_tot && write_cum == write_tot)
+                       break;
+       }
+}
+
+static void brw_stats_show(struct seq_file *seq, struct brw_stats *brw_stats)
+{
+       struct timeval now;
+
+       /* this sampling races with updates */
+       do_gettimeofday(&now);
+       seq_printf(seq, "snapshot_time:         %lu.%lu (secs.usecs)\n",
+                  now.tv_sec, now.tv_usec);
+
+       display_brw_stats(seq, "pages per bulk r/w", "rpcs",
+                         &brw_stats->hist[BRW_R_PAGES],
+                         &brw_stats->hist[BRW_W_PAGES], 1);
+       display_brw_stats(seq, "discontiguous pages", "rpcs",
+                         &brw_stats->hist[BRW_R_DISCONT_PAGES],
+                         &brw_stats->hist[BRW_W_DISCONT_PAGES], 0);
+#if 0
+       display_brw_stats(seq, "discontiguous blocks", "rpcs",
+                         &brw_stats->hist[BRW_R_DISCONT_BLOCKS],
+                         &brw_stats->hist[BRW_W_DISCONT_BLOCKS], 0);
+
+       display_brw_stats(seq, "disk fragmented I/Os", "ios",
+                         &brw_stats->hist[BRW_R_DIO_FRAGS],
+                         &brw_stats->hist[BRW_W_DIO_FRAGS], 0);
+#endif
+       display_brw_stats(seq, "disk I/Os in flight", "ios",
+                         &brw_stats->hist[BRW_R_RPC_HIST],
+                         &brw_stats->hist[BRW_W_RPC_HIST], 0);
+
+       display_brw_stats(seq, "I/O time (1/1000s)", "ios",
+                         &brw_stats->hist[BRW_R_IO_TIME],
+                         &brw_stats->hist[BRW_W_IO_TIME], 1000 / HZ);
+
+       display_brw_stats(seq, "disk I/O size", "ios",
+                         &brw_stats->hist[BRW_R_DISK_IOSIZE],
+                         &brw_stats->hist[BRW_W_DISK_IOSIZE], 1);
+}
+
+#undef pct
+
+static int osd_brw_stats_seq_show(struct seq_file *seq, void *v)
+{
+       struct osd_device *osd = seq->private;
+
+       brw_stats_show(seq, &osd->od_brw_stats);
+
+       return 0;
+}
+
+static ssize_t osd_brw_stats_seq_write(struct file *file, const char *buf,
+                                      size_t len, loff_t *off)
+{
+       struct seq_file *seq = file->private_data;
+       struct osd_device *osd = seq->private;
+       int i;
+
+       for (i = 0; i < BRW_LAST; i++)
+               lprocfs_oh_clear(&osd->od_brw_stats.hist[i]);
+
+       return len;
+}
+
+LPROC_SEQ_FOPS(osd_brw_stats);
+
 static int osd_stats_init(struct osd_device *osd)
 {
        int result;
@@ -97,6 +201,8 @@ static int osd_stats_init(struct osd_device *osd)
                                LPROCFS_CNTR_AVGMINMAX,
                                "thandle_closing", "usec");
 #endif
+               result = lprocfs_seq_create(osd->od_proc_entry, "brw_stats",
+                                           0644, &osd_brw_stats_fops, osd);
        } else {
                result = -ENOMEM;
        }