From 16849a94d4375344fbe6af35868402effb485f68 Mon Sep 17 00:00:00 2001 From: alex Date: Thu, 8 Nov 2007 10:56:57 +0000 Subject: [PATCH] b=12797 i=adilger i=eeb new data sd_iostats collects are: * time requests spend in elevator's queue * time requests are served by disk * seek stats * queue depth stats --- .../patches/sd_iostats-2.6-rhel4.patch | 766 +++++++++++++-------- .../patches/sd_iostats-2.6-rhel5.patch | 650 +++++++++++++++++ lustre/kernel_patches/series/2.6-rhel5.series | 2 +- 3 files changed, 1111 insertions(+), 307 deletions(-) create mode 100644 lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch index 4e06c09..33160d9 100644 --- a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch +++ b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel4.patch @@ -1,9 +1,9 @@ Index: linux-2.6.9-5.0.3.EL/drivers/scsi/Kconfig =================================================================== -Index: linux+rhel4+chaos/drivers/scsi/Kconfig +Index: linux-2.6.9/drivers/scsi/Kconfig =================================================================== ---- linux+rhel4+chaos.orig/drivers/scsi/Kconfig -+++ linux+rhel4+chaos/drivers/scsi/Kconfig +--- linux-2.6.9.orig/drivers/scsi/Kconfig 2007-07-23 14:19:13.000000000 +0400 ++++ linux-2.6.9/drivers/scsi/Kconfig 2007-07-26 14:16:36.000000000 +0400 @@ -61,6 +61,14 @@ config SCSI_DUMP help SCSI dump support @@ -19,11 +19,25 @@ Index: linux+rhel4+chaos/drivers/scsi/Kconfig config CHR_DEV_ST tristate "SCSI tape support" depends on SCSI -Index: linux+rhel4+chaos/drivers/scsi/sd.c +Index: linux-2.6.9/drivers/scsi/scsi_proc.c =================================================================== ---- linux+rhel4+chaos.orig/drivers/scsi/sd.c -+++ linux+rhel4+chaos/drivers/scsi/sd.c -@@ -63,6 +63,38 @@ +--- linux-2.6.9.orig/drivers/scsi/scsi_proc.c 2007-03-13 02:47:28.000000000 +0300 ++++ 
linux-2.6.9/drivers/scsi/scsi_proc.c 2007-07-26 14:16:36.000000000 +0400 +@@ -38,7 +38,8 @@ + /* 4K page size, but our output routines, use some slack for overruns */ + #define PROC_BLOCK_SIZE (3*1024) + +-static struct proc_dir_entry *proc_scsi; ++struct proc_dir_entry *proc_scsi; ++EXPORT_SYMBOL(proc_scsi); + + /* Protect sht->present and sht->proc_dir */ + static DECLARE_MUTEX(global_host_template_sem); +Index: linux-2.6.9/drivers/scsi/sd.c +=================================================================== +--- linux-2.6.9.orig/drivers/scsi/sd.c 2007-03-13 02:47:27.000000000 +0300 ++++ linux-2.6.9/drivers/scsi/sd.c 2007-07-28 14:55:56.000000000 +0400 +@@ -63,6 +63,67 @@ #include "scsi_logging.h" @@ -41,28 +55,57 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c + iostat_counter_t iostat_read_histogram[IOSTAT_NCOUNTERS]; + iostat_counter_t iostat_write_histogram[IOSTAT_NCOUNTERS]; + struct timeval iostat_timeval; -+} iostat_stats_t; ++ ++ /* queue depth: how well the pipe is filled up */ ++ unsigned long long iostat_queue_ticks[IOSTAT_NCOUNTERS]; ++ unsigned long long iostat_queue_ticks_sum; ++ unsigned long iostat_queue_depth; ++ unsigned long iostat_queue_stamp; ++ ++ /* seeks: how linear the traffic is */ ++ unsigned long long iostat_next_sector; ++ unsigned long long iostat_seek_sectors; ++ unsigned long long iostat_seeks; ++ unsigned long long iostat_sectors; ++ unsigned long long iostat_reqs; ++ unsigned long iostat_read_reqs; ++ unsigned long iostat_write_reqs; ++ ++ /* process time: how long it takes to process requests */ ++ unsigned long iostat_rtime[IOSTAT_NCOUNTERS]; ++ unsigned long iostat_wtime[IOSTAT_NCOUNTERS]; ++ ++ /* queue time: how long process spent in elevator's queue */ ++ unsigned long iostat_rtime_in_queue[IOSTAT_NCOUNTERS]; ++ unsigned long iostat_wtime_in_queue[IOSTAT_NCOUNTERS]; ++ ++ char iostat_name[32]; ++ ++ /* must be the last field, as it's used to know size to be memset'ed */ ++ spinlock_t iostat_lock; ++} 
____cacheline_aligned_in_smp iostat_stats_t; + +iostat_stats_t **sd_iostats; -+spinlock_t sd_iostats_lock; +struct proc_dir_entry *sd_iostats_procdir; +char sd_iostats_procdir_name[] = "sd_iostats"; + +extern void sd_iostats_init(void); +extern void sd_iostats_init_disk(struct gendisk *); +extern void sd_iostats_fini(void); -+extern void sd_iostats_bump(int disk, unsigned int nsect, int iswrite); ++void sd_iostats_start_req(struct scsi_cmnd *SCpnt); ++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt); +#else +static inline void sd_iostats_init(void) {} +static inline void sd_iostats_init_disk(struct gendisk *disk) {} +static inline void sd_iostats_fini(void) {} -+static inline void sd_iostats_bump(int disk, unsigned int nsect, int iswrite) {} ++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {} ++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {} +#endif + /* * More than enough for everybody ;) The huge number of majors * is a leftover from 16bit dev_t days, we don't really need that -@@ -76,6 +108,7 @@ +@@ -76,6 +137,7 @@ */ #define SD_MAX_DISKS (((26 * 26) + 26 + 1) * 26) @@ -70,17 +113,16 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c /* * Time out in seconds for disks and Magneto-opticals (which are slower). */ -@@ -278,6 +311,9 @@ static int sd_init_command(struct scsi_c +@@ -278,6 +340,8 @@ static int sd_init_command(struct scsi_c SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu\n", disk->disk_name, (unsigned long long)block)); -+ sd_iostats_bump(scsi_disk(disk)->index, this_count, -+ rq_data_dir(SCpnt->request) == WRITE); ++ sd_iostats_start_req(SCpnt); + /* * If we have a 1K hardware sectorsize, prevent access to single * 512 byte sectors. 
In theory we could handle this - in fact -@@ -474,6 +510,7 @@ static int sd_open(struct inode *inode, +@@ -474,6 +538,7 @@ static int sd_open(struct inode *inode, scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } @@ -88,30 +130,17 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c return 0; error_out: -@@ -500,8 +537,20 @@ static int sd_release(struct inode *inod - - SCSI_LOG_HLQUEUE(3, printk("sd_release: disk=%s\n", disk->disk_name)); - -- if (!--sdkp->openers && sdev->removable) { -- if (scsi_block_when_processing_errors(sdev)) -+ if (!--sdkp->openers) { -+ /* -+ * Remove sd_iostats information about this disk -+ */ -+ if (sd_iostats_procdir != NULL) { -+ remove_proc_entry(disk->disk_name, sd_iostats_procdir); -+ } -+ if (sd_iostats != NULL) { -+ if (sd_iostats[sdkp->index] != NULL) { -+ kfree (sd_iostats[sdkp->index]); -+ sd_iostats[sdkp->index] = NULL; -+ } -+ } -+ if (sdev->removable && scsi_block_when_processing_errors(sdev)) - scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); +@@ -849,6 +914,9 @@ static void sd_rw_intr(struct scsi_cmnd + break; + } } - -@@ -1575,6 +1624,342 @@ static void sd_shutdown(struct device *d ++ ++ sd_iostats_finish_req(SCpnt); ++ + /* + * This calls the generic completion function, now that we know + * how many actual sectors finished, and how many sectors we need +@@ -1575,6 +1643,481 @@ static void sd_shutdown(struct device *d sd_sync_cache(sdp); } @@ -119,101 +148,155 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c +static int +sd_iostats_seq_show(struct seq_file *seq, void *v) +{ -+ struct timeval now; -+ struct gendisk *disk; -+ iostat_stats_t *stats; -+ unsigned long long read_len; -+ unsigned long long read_len_tot; -+ unsigned long read_num; -+ unsigned long read_num_tot; -+ unsigned long long write_len; -+ unsigned long long write_len_tot; -+ unsigned long write_num; -+ unsigned long write_num_tot; -+ int i; -+ int maxi; -+ -+ if (seq == NULL || seq->private == NULL) { -+ printk(KERN_ERR "sd_iostats_seq_show: NULL disk\n"); 
++ struct timeval now; ++ struct gendisk *disk = seq->private; ++ iostat_stats_t *stats; ++ unsigned long long read_len; ++ unsigned long long read_len_tot; ++ unsigned long read_num; ++ unsigned long read_num_tot; ++ unsigned long long write_len; ++ unsigned long long write_len_tot; ++ unsigned long write_num; ++ unsigned long write_num_tot; ++ int i; ++ int maxi; ++ ++ if (sd_iostats == NULL) { ++ printk(KERN_ERR "sd_iostats_seq_show: NULL stats array\n"); + BUG(); + } + -+ disk = seq->private; ++ stats = sd_iostats[scsi_disk(disk)->index]; ++ if (stats == NULL) { ++ printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n"); ++ BUG(); ++ } + -+ if (scsi_disk(disk) == NULL || (disk->flags & GENHD_FL_UP) == 0) { -+ seq_printf(seq, "sd_iostats_seq_show: Device %s " -+ "does not exist\n", disk->disk_name); -+ return 0; ++ do_gettimeofday(&now); ++ now.tv_sec -= stats->iostat_timeval.tv_sec; ++ now.tv_usec -= stats->iostat_timeval.tv_usec; ++ if (now.tv_usec < 0) { ++ now.tv_usec += 1000000; ++ now.tv_sec--; + } + -+ if (sd_iostats == NULL) { -+ printk(KERN_ERR "sd_iostats_seq_show: NULL stats array\n"); -+ BUG(); -+ } ++ /* this sampling races with updates */ ++ seq_printf(seq, "index: %lu snapshot_time: %lu.%06lu\n", ++ (unsigned long) scsi_disk(disk)->index, ++ now.tv_sec, now.tv_usec); ++ ++ for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--) ++ if (stats->iostat_read_histogram[i].iostat_count != 0 || ++ stats->iostat_write_histogram[i].iostat_count != 0) ++ break; ++ maxi = i; ++ ++ seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", ++ "reads", "total", "writes", "total"); ++ ++ read_len_tot = write_len_tot = 0; ++ read_num_tot = write_num_tot = 0; ++ for (i = 0; i <= maxi; i++) { ++ read_len = stats->iostat_read_histogram[i].iostat_size; ++ read_len_tot += read_len; ++ read_num = stats->iostat_read_histogram[i].iostat_count; ++ read_num_tot += read_num; ++ ++ write_len = stats->iostat_write_histogram[i].iostat_size; ++ write_len_tot += write_len; ++ write_num = 
stats->iostat_write_histogram[i].iostat_count; ++ write_num_tot += write_num; ++ ++ seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", ++ 512<index]; -+ if (stats == NULL) { -+ seq_printf(seq, "sd_iostats_seq_show: sd_iostats " -+ "entry %d does not exist\n", -+ scsi_disk(disk)->index); -+ return 0; -+ } ++ seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total", ++ read_num_tot, read_len_tot, ++ write_num_tot, write_len_tot); ++ ++ seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%"); ++ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { ++ unsigned long long ticks, percent; ++ ticks = stats->iostat_queue_ticks[i]; ++ if (ticks == 0) ++ continue; ++ percent = stats->iostat_queue_ticks[i] * 100; ++ do_div(percent, stats->iostat_queue_ticks_sum); ++ seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent); ++ } + -+ do_gettimeofday(&now); -+ now.tv_sec -= stats->iostat_timeval.tv_sec; -+ now.tv_usec -= stats->iostat_timeval.tv_usec; -+ if (now.tv_usec < 0) { -+ now.tv_usec += 1000000; -+ now.tv_sec--; -+ } ++ if (stats->iostat_reqs != 0) { ++ unsigned long long aveseek = 0, percent = 0; + -+ /* this sampling races with updates */ -+ seq_printf(seq, "index: %lu snapshot_time: %lu.%06lu\n", -+ scsi_disk(disk)->index, now.tv_sec, now.tv_usec); -+ -+ for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--) -+ if (stats->iostat_read_histogram[i].iostat_count != 0 || -+ stats->iostat_write_histogram[i].iostat_count != 0) -+ break; -+ maxi = i; -+ -+ seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", -+ "reads", "total", "writes", "total"); -+ -+ read_len_tot = write_len_tot = 0; -+ read_num_tot = write_num_tot = 0; -+ for (i = 0; i <= maxi; i++) { -+ read_len = stats->iostat_read_histogram[i].iostat_size; -+ read_len_tot += read_len; -+ read_num = stats->iostat_read_histogram[i].iostat_count; -+ read_num_tot += read_num; -+ -+ write_len = stats->iostat_write_histogram[i].iostat_size; -+ write_len_tot += write_len; -+ write_num = stats->iostat_write_histogram[i].iostat_count; -+ 
write_num_tot += write_num; -+ -+ seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", -+ 512<iostat_seeks) { ++ aveseek = stats->iostat_seek_sectors; ++ do_div(aveseek, stats->iostat_seeks); ++ percent = stats->iostat_seeks * 100; ++ do_div(percent, stats->iostat_reqs); ++ } ++ ++ seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over " ++ "%llu sectors in ave, %llu%% of all reqs\n", ++ stats->iostat_sectors, stats->iostat_reqs, ++ stats->iostat_seeks, aveseek, percent); ++ } ++ ++ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads", ++ "%%", "writes", "%%"); ++ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { ++ unsigned long read_percent = 0, write_percent = 0; ++ if (stats->iostat_wtime[i] == 0 && ++ stats->iostat_rtime[i] == 0) ++ continue; ++ if (stats->iostat_read_reqs) ++ read_percent = stats->iostat_rtime[i] * 100 / ++ stats->iostat_read_reqs; ++ if (stats->iostat_write_reqs) ++ write_percent = stats->iostat_wtime[i] * 100 / ++ stats->iostat_write_reqs; ++ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n", ++ jiffies_to_msecs(((1UL << i) >> 1) << 1), ++ stats->iostat_rtime[i], read_percent, ++ stats->iostat_wtime[i], write_percent); ++ } ++ ++ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads", ++ "%%", "writes", "%%"); ++ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { ++ unsigned long read_percent = 0, write_percent = 0; ++ if (stats->iostat_wtime_in_queue[i] == 0 && ++ stats->iostat_rtime_in_queue[i] == 0) ++ continue; ++ if (stats->iostat_read_reqs) ++ read_percent = stats->iostat_rtime_in_queue[i] * 100 / ++ stats->iostat_read_reqs; ++ if (stats->iostat_write_reqs) ++ write_percent = stats->iostat_wtime_in_queue[i] * 100 / ++ stats->iostat_write_reqs; ++ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n", ++ jiffies_to_msecs(((1UL << i) >> 1) << 1), ++ stats->iostat_rtime_in_queue[i], ++ read_percent, ++ stats->iostat_wtime_in_queue[i], ++ write_percent); ++ } ++ ++ return 0; +} + +static void * +sd_iostats_seq_start(struct seq_file *p, 
loff_t *pos) +{ -+ return (*pos == 0) ? (void *)1 : NULL; ++ return (*pos == 0) ? (void *)1 : NULL; +} + +static void * +sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ -+ ++*pos; -+ return NULL; ++ ++*pos; ++ return NULL; +} + +static void @@ -222,50 +305,54 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c +} + +static struct seq_operations sd_iostats_seqops = { -+ .start = sd_iostats_seq_start, -+ .stop = sd_iostats_seq_stop, -+ .next = sd_iostats_seq_next, -+ .show = sd_iostats_seq_show, ++ .start = sd_iostats_seq_start, ++ .stop = sd_iostats_seq_stop, ++ .next = sd_iostats_seq_next, ++ .show = sd_iostats_seq_show, +}; + +static int +sd_iostats_seq_open (struct inode *inode, struct file *file) +{ -+ int rc; ++ int rc; + -+ rc = seq_open(file, &sd_iostats_seqops); -+ if (rc != 0) -+ return rc; ++ rc = seq_open(file, &sd_iostats_seqops); ++ if (rc != 0) ++ return rc; + -+ ((struct seq_file *)file->private_data)->private = PDE(inode)->data; -+ return 0; ++ ((struct seq_file *)file->private_data)->private = PDE(inode)->data; ++ return 0; +} + +static ssize_t +sd_iostats_seq_write(struct file *file, const char *buffer, + size_t len, loff_t *off) +{ -+ struct seq_file *seq = file->private_data; -+ struct gendisk *disk = seq->private; -+ iostat_stats_t *stats = sd_iostats[scsi_disk(disk)->index]; -+ unsigned long flags; -+ -+ -+ spin_lock_irqsave (&sd_iostats_lock, flags); -+ memset (stats, 0, sizeof(*stats)); -+ do_gettimeofday(&stats->iostat_timeval); -+ spin_unlock_irqrestore (&sd_iostats_lock, flags); -+ -+ return len; ++ struct seq_file *seq = file->private_data; ++ struct gendisk *disk = seq->private; ++ iostat_stats_t *stats = sd_iostats[scsi_disk(disk)->index]; ++ unsigned long flags; ++ unsigned long qdepth; ++ ++ ++ spin_lock_irqsave (&stats->iostat_lock, flags); ++ qdepth = stats->iostat_queue_depth; ++ memset (stats, 0, offsetof(iostat_stats_t, iostat_lock)); ++ do_gettimeofday(&stats->iostat_timeval); ++ stats->iostat_queue_stamp = jiffies; 
++ stats->iostat_queue_depth = qdepth; ++ spin_unlock_irqrestore (&stats->iostat_lock, flags); ++ ++ return len; +} + +static struct file_operations sd_iostats_proc_fops = { -+ .owner = THIS_MODULE, -+ .open = sd_iostats_seq_open, -+ .read = seq_read, -+ .write = sd_iostats_seq_write, -+ .llseek = seq_lseek, -+ .release = seq_release, ++ .owner = THIS_MODULE, ++ .open = sd_iostats_seq_open, ++ .read = seq_read, ++ .write = sd_iostats_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, +}; + +extern struct proc_dir_entry *proc_scsi; @@ -273,188 +360,269 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c +void +sd_iostats_init(void) +{ -+ int i; -+ -+ spin_lock_init(&sd_iostats_lock); -+ -+ sd_iostats = kmalloc(SD_STATS * sizeof(iostat_stats_t *), GFP_KERNEL); -+ if (sd_iostats == NULL) { -+ printk(KERN_WARNING "Can't keep sd iostats: " -+ "ENOMEM allocating stats array size %ld\n", -+ SD_STATS * sizeof(iostat_stats_t *)); -+ return; -+ } ++ int i; ++ ++ sd_iostats = kmalloc(SD_STATS * sizeof(iostat_stats_t *), GFP_KERNEL); ++ if (sd_iostats == NULL) { ++ printk(KERN_WARNING "Can't keep sd iostats: " ++ "ENOMEM allocating stats array size %d\n", ++ SD_STATS * sizeof(iostat_stats_t *)); ++ return; ++ } + -+ for (i = 0; i < SD_STATS; i++) -+ sd_iostats[i] = NULL; ++ for (i = 0; i < SD_STATS; i++) ++ sd_iostats[i] = NULL; + -+ if (proc_scsi == NULL) { -+ printk(KERN_WARNING "No access to sd iostats: " -+ "proc_scsi is NULL\n"); -+ return; -+ } ++ if (proc_scsi == NULL) { ++ printk(KERN_WARNING "No access to sd iostats: " ++ "proc_scsi is NULL\n"); ++ return; ++ } + -+ sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name, -+ S_IFDIR | S_IRUGO | S_IXUGO, -+ proc_scsi); -+ if (sd_iostats_procdir == NULL) { -+ printk(KERN_WARNING "No access to sd iostats: " -+ "can't create /proc/scsi/%s\n", sd_iostats_procdir_name); -+ return; ++ sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name, ++ S_IFDIR | S_IRUGO | S_IXUGO, ++ proc_scsi); ++ if 
(sd_iostats_procdir == NULL) { ++ printk(KERN_WARNING "No access to sd iostats: " ++ "can't create /proc/scsi/%s\n", sd_iostats_procdir_name); ++ return; + } +} + +void +sd_iostats_init_disk(struct gendisk *disk) +{ -+ struct proc_dir_entry *pde; -+ unsigned long flags; -+ iostat_stats_t *stats; -+ -+ if (sd_iostats == NULL || -+ sd_iostats_procdir == NULL) -+ return; -+ -+ if (scsi_disk(disk)->index > SD_STATS) { -+ printk(KERN_ERR "sd_iostats_init_disk: " -+ "unexpected disk index %d(%d)\n", -+ scsi_disk(disk)->index, SD_STATS); -+ return; -+ } ++ struct proc_dir_entry *pde; ++ unsigned long flags; ++ iostat_stats_t *stats; ++ ++ if (sd_iostats == NULL || sd_iostats_procdir == NULL) ++ return; ++ ++ if (scsi_disk(disk)->index > SD_STATS) { ++ printk(KERN_ERR "sd_iostats_init_disk: " ++ "unexpected disk index %d(%d)\n", ++ scsi_disk(disk)->index, SD_STATS); ++ return; ++ } + -+ if (sd_iostats[scsi_disk(disk)->index] != NULL) -+ return; ++ if (sd_iostats[scsi_disk(disk)->index] != NULL) ++ return; + -+ stats = kmalloc(sizeof(*stats), GFP_KERNEL); -+ if (stats == NULL) { -+ printk(KERN_WARNING "Can't keep %s iostats: " -+ "ENOMEM allocating stats size %ld\n", -+ disk->disk_name, sizeof(*stats)); -+ return; -+ } ++ stats = kmalloc(sizeof(*stats), GFP_KERNEL); ++ if (stats == NULL) { ++ printk(KERN_WARNING "Can't keep %s iostats: " ++ "ENOMEM allocating stats size %d\n", ++ disk->disk_name, sizeof(*stats)); ++ return; ++ } + -+ memset (stats, 0, sizeof(*stats)); -+ do_gettimeofday(&stats->iostat_timeval); ++ memset (stats, 0, sizeof(*stats)); ++ do_gettimeofday(&stats->iostat_timeval); ++ stats->iostat_queue_stamp = jiffies; ++ spin_lock_init(&stats->iostat_lock); + -+ spin_lock_irqsave(&sd_iostats_lock, flags); + -+ if (sd_iostats[scsi_disk(disk)->index] != NULL) { -+ spin_unlock_irqrestore(&sd_iostats_lock, flags); -+ kfree (stats); -+ return; -+ } ++ spin_lock_irqsave(&stats->iostat_lock, flags); + -+ sd_iostats[scsi_disk(disk)->index] = stats; -+ -+ 
spin_unlock_irqrestore(&sd_iostats_lock, flags); -+ -+ pde = create_proc_entry(disk->disk_name, S_IRUGO | S_IWUSR, -+ sd_iostats_procdir); -+ if (pde == NULL) { -+ printk(KERN_WARNING "Can't create /proc/scsi/%s/%s\n", -+ sd_iostats_procdir_name, disk->disk_name); -+ } else { -+ pde->proc_fops = &sd_iostats_proc_fops; -+ pde->data = disk; -+ } ++ if (sd_iostats[scsi_disk(disk)->index] != NULL) { ++ spin_unlock_irqrestore(&stats->iostat_lock, flags); ++ kfree (stats); ++ return; ++ } ++ ++ sd_iostats[scsi_disk(disk)->index] = stats; ++ ++ spin_unlock_irqrestore(&stats->iostat_lock, flags); ++ ++ strncpy(stats->iostat_name, disk->disk_name, ++ sizeof(stats->iostat_name)-1); ++ ++ pde = create_proc_entry(stats->iostat_name, S_IRUGO | S_IWUSR, ++ sd_iostats_procdir); ++ if (pde == NULL) { ++ printk(KERN_WARNING "Can't create /proc/scsi/%s/%s\n", ++ sd_iostats_procdir_name, disk->disk_name); ++ } else { ++ pde->proc_fops = &sd_iostats_proc_fops; ++ pde->data = disk; ++ } +} + -+static void sd_devname(unsigned int disknum, char *buffer) ++void sd_iostats_fini(void) +{ -+ if (disknum < 26) -+ sprintf(buffer, "sd%c", 'a' + disknum); -+ else { -+ unsigned int min1; -+ unsigned int min2; -+ /* -+ * For larger numbers of disks, we need to go to a new -+ * naming scheme. 
-+ */ -+ min1 = disknum / 26; -+ min2 = disknum % 26; -+ sprintf(buffer, "sd%c%c", 'a' + min1 - 1, 'a' + min2); -+ } ++ int i; ++ ++ if (sd_iostats == NULL) ++ return; ++ ++ for (i = 0; i < SD_STATS; i++) { ++ if (sd_iostats[i] == NULL) ++ continue; ++ if (sd_iostats_procdir != NULL) ++ remove_proc_entry(sd_iostats[i]->iostat_name, ++ sd_iostats_procdir); ++ kfree(sd_iostats[i]); ++ } ++ ++ if (proc_scsi != NULL && sd_iostats_procdir != NULL) ++ remove_proc_entry(sd_iostats_procdir_name, proc_scsi); ++ ++ sd_iostats_procdir = NULL; ++ kfree(sd_iostats); ++ sd_iostats = NULL; +} + -+void -+sd_iostats_fini(void) ++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) +{ -+ char name[6]; -+ int i; -+ -+ if (sd_iostats_procdir != NULL) { -+ for (i = 0; i < SD_STATS; i++) { -+ sd_devname(i, name); -+ remove_proc_entry(name, sd_iostats_procdir); -+ } -+ -+ if (proc_scsi == NULL) { -+ printk(KERN_ERR "sd_iostats_fini: proc_scsi NULL\n"); -+ BUG(); -+ } -+ remove_proc_entry(sd_iostats_procdir_name, -+ proc_scsi); -+ -+ sd_iostats_procdir = NULL; -+ } -+ -+ if (sd_iostats != NULL) { -+ for (i = 0; i < SD_STATS; i++) { -+ if (sd_iostats[i] != NULL) -+ kfree (sd_iostats[i]); -+ } -+ -+ kfree(sd_iostats); -+ sd_iostats = NULL; -+ } ++ struct request *rq = SCpnt->request; ++ iostat_stats_t *stats; ++ unsigned long *tcounter; ++ int tbucket; ++ int tmp; ++ unsigned long irqflags; ++ int disk, i; ++ ++ disk = scsi_disk(rq->rq_disk)->index; ++ ++ if (sd_iostats == NULL) ++ return; ++ ++ if (disk < 0 || disk >= SD_STATS) { ++ printk(KERN_ERR "sd_iostats_bump: unexpected disk index " ++ "%d([0-%d])\n", disk, SD_STATS); ++ BUG(); ++ } ++ ++ stats = sd_iostats[disk]; ++ if (stats == NULL) ++ return; ++ ++ tmp = jiffies - rq->start_time; ++ for (tbucket = 0; tmp > 1; tbucket++) ++ tmp >>= 1; ++ if (tbucket >= IOSTAT_NCOUNTERS) ++ tbucket = IOSTAT_NCOUNTERS - 1; ++ //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket); ++ ++ tcounter = rq_data_dir(rq) == WRITE ? 
++ &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket]; ++ ++ spin_lock_irqsave(&stats->iostat_lock, irqflags); ++ ++ /* update delay stats */ ++ (*tcounter)++; ++ ++ /* update queue depth stats */ ++ i = stats->iostat_queue_depth; ++ if (i >= IOSTAT_NCOUNTERS) ++ i = IOSTAT_NCOUNTERS - 1; ++ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_depth--; ++ ++ /* update seek stats. XXX: not sure about nr_sectors */ ++ stats->iostat_sectors += rq->nr_sectors; ++ stats->iostat_reqs++; ++ if (rq->sector != stats->iostat_next_sector) { ++ stats->iostat_seek_sectors += ++ rq->sector > stats->iostat_next_sector ? ++ rq->sector - stats->iostat_next_sector : ++ stats->iostat_next_sector - rq->sector; ++ stats->iostat_seeks++; ++ } ++ stats->iostat_next_sector = rq->sector + rq->nr_sectors; ++ ++ stats->iostat_queue_stamp = jiffies; ++ ++ spin_unlock_irqrestore(&stats->iostat_lock, irqflags); +} + -+void -+sd_iostats_bump(int disk, unsigned int nsect, int iswrite) ++void sd_iostats_start_req(struct scsi_cmnd *SCpnt) +{ -+ iostat_stats_t *stats; -+ iostat_counter_t *counter; -+ int bucket; -+ int tmp; -+ unsigned long irqflags; -+ -+ if (sd_iostats == NULL) -+ return; -+ -+ if (disk < 0 || disk >= SD_STATS) { -+ printk(KERN_ERR "sd_iostats_bump: unexpected disk index %d([0-%d])\n", -+ disk, SD_STATS); -+ BUG(); -+ } ++ struct request *rq = SCpnt->request; ++ iostat_stats_t *stats; ++ iostat_counter_t *counter; ++ int bucket; ++ int tbucket; ++ int tmp; ++ unsigned long irqflags; ++ int disk, i; ++ int nsect; ++ ++ disk = scsi_disk(rq->rq_disk)->index; ++ ++ if (sd_iostats == NULL) ++ return; ++ ++ if (disk < 0 || disk >= SD_STATS) { ++ printk(KERN_ERR "sd_iostats_bump: unexpected disk index %d([0-%d])\n", ++ disk, SD_STATS); ++ BUG(); ++ } + -+ for (bucket = 0, tmp = nsect; tmp > 1; bucket++) -+ tmp /= 2; ++ stats = sd_iostats[disk]; ++ if (stats == NULL) 
++ return; + -+ if (bucket >= IOSTAT_NCOUNTERS) { -+ printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect); -+ BUG(); -+ } ++ nsect = SCpnt->request_bufflen >> 9; ++ for (bucket = 0, tmp = nsect; tmp > 1; bucket++) ++ tmp >>= 1; + -+ spin_lock_irqsave(&sd_iostats_lock, irqflags); -+ -+ stats = sd_iostats[disk]; -+ if (stats != NULL) { -+ counter = iswrite ? -+ &stats->iostat_write_histogram[bucket] : -+ &stats->iostat_read_histogram[bucket]; ++ if (bucket >= IOSTAT_NCOUNTERS) { ++ printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect); ++ BUG(); ++ } + -+ counter->iostat_size += nsect; -+ counter->iostat_count++; -+ } ++ counter = rq_data_dir(rq) == WRITE ? ++ &stats->iostat_write_histogram[bucket] : ++ &stats->iostat_read_histogram[bucket]; ++ ++ tmp = jiffies - rq->start_time; ++ for (tbucket = 0; tmp > 1; tbucket++) ++ tmp >>= 1; ++ if (tbucket >= IOSTAT_NCOUNTERS) ++ tbucket = IOSTAT_NCOUNTERS - 1; ++ //printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket); ++ ++ /* an ugly hack to know exact processing time. the right ++ * solution is to add one more field to struct request ++ * hopefully it will break nothing ... 
*/ ++ rq->start_time = jiffies; ++ ++ spin_lock_irqsave(&stats->iostat_lock, irqflags); ++ ++ /* update queue depth stats */ ++ i = stats->iostat_queue_depth; ++ if (i >= IOSTAT_NCOUNTERS) ++ i = IOSTAT_NCOUNTERS - 1; ++ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_depth++; ++ ++ /* update delay stats */ ++ if (rq_data_dir(rq) == WRITE) { ++ stats->iostat_wtime_in_queue[tbucket]++; ++ stats->iostat_write_reqs++; ++ } else { ++ stats->iostat_rtime_in_queue[tbucket]++; ++ stats->iostat_read_reqs++; ++ } ++ ++ /* update size stats */ ++ counter->iostat_size += nsect; ++ counter->iostat_count++; ++ ++ stats->iostat_queue_stamp = jiffies; + -+ spin_unlock_irqrestore(&sd_iostats_lock, irqflags); ++ spin_unlock_irqrestore(&stats->iostat_lock, irqflags); +} +#endif + /** * init_sd - entry point for this driver (both when built in or when * a module). -@@ -1584,6 +1969,7 @@ static void sd_shutdown(struct device *d +@@ -1584,6 +2127,7 @@ static void sd_shutdown(struct device *d static int __init init_sd(void) { int majors = 0, i; @@ -462,7 +630,7 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); -@@ -1594,7 +1980,10 @@ static int __init init_sd(void) +@@ -1594,7 +2138,10 @@ static int __init init_sd(void) if (!majors) return -ENODEV; @@ -474,7 +642,7 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c } /** -@@ -1608,6 +1997,7 @@ static void __exit exit_sd(void) +@@ -1608,6 +2155,7 @@ static void __exit exit_sd(void) SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); @@ -482,17 +650,3 @@ Index: linux+rhel4+chaos/drivers/scsi/sd.c scsi_unregister_driver(&sd_template.gendrv); for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); -Index: linux+rhel4+chaos/drivers/scsi/scsi_proc.c -=================================================================== ---- 
linux+rhel4+chaos.orig/drivers/scsi/scsi_proc.c -+++ linux+rhel4+chaos/drivers/scsi/scsi_proc.c -@@ -38,7 +38,8 @@ - /* 4K page size, but our output routines, use some slack for overruns */ - #define PROC_BLOCK_SIZE (3*1024) - --static struct proc_dir_entry *proc_scsi; -+struct proc_dir_entry *proc_scsi; -+EXPORT_SYMBOL(proc_scsi); - - /* Protect sht->present and sht->proc_dir */ - static DECLARE_MUTEX(global_host_template_sem); diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch new file mode 100644 index 0000000..e38e22a --- /dev/null +++ b/lustre/kernel_patches/patches/sd_iostats-2.6-rhel5.patch @@ -0,0 +1,650 @@ +Index: linux-2.6.9-5.0.3.EL/drivers/scsi/Kconfig +=================================================================== +Index: linux-2.6.9/drivers/scsi/Kconfig +=================================================================== +--- linux-2.6.9.orig/drivers/scsi/Kconfig 2007-07-23 14:19:13.000000000 +0400 ++++ linux-2.6.9/drivers/scsi/Kconfig 2007-07-26 14:16:36.000000000 +0400 +@@ -61,6 +61,14 @@ config SCSI_DUMP + help + SCSI dump support + ++config SD_IOSTATS ++ bool "Enable SCSI disk I/O stats" ++ depends on BLK_DEV_SD ++ default y ++ ---help--- ++ This enables SCSI disk I/O stats collection. You must also enable ++ /proc file system support if you want this feature. 
++
+ config CHR_DEV_ST
+ 	tristate "SCSI tape support"
+ 	depends on SCSI
+Index: linux-2.6.9/drivers/scsi/scsi_proc.c
+===================================================================
+--- linux-2.6.9.orig/drivers/scsi/scsi_proc.c	2007-03-13 02:47:28.000000000 +0300
++++ linux-2.6.9/drivers/scsi/scsi_proc.c	2007-07-26 14:16:36.000000000 +0400
+@@ -38,7 +38,8 @@
+ /* 4K page size, but our output routines, use some slack for overruns */
+ #define PROC_BLOCK_SIZE (3*1024)
+ 
+-static struct proc_dir_entry *proc_scsi;
++struct proc_dir_entry *proc_scsi;
++EXPORT_SYMBOL(proc_scsi);
+ 
+ /* Protect sht->present and sht->proc_dir */
+ static DECLARE_MUTEX(global_host_template_sem);
+Index: linux-2.6.9/drivers/scsi/sd.c
+===================================================================
+--- linux-2.6.9.orig/drivers/scsi/sd.c	2007-03-13 02:47:27.000000000 +0300
++++ linux-2.6.9/drivers/scsi/sd.c	2007-07-28 14:55:56.000000000 +0400
+@@ -63,6 +63,67 @@
+ 
+ #include "scsi_logging.h"
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++# include <linux/proc_fs.h>
++# include <linux/seq_file.h>
++
++typedef struct {
++	unsigned long long iostat_size;		/* sectors summed in this bucket */
++	unsigned long long iostat_count;	/* requests counted in this bucket */
++} iostat_counter_t;
++
++#define IOSTAT_NCOUNTERS 16
++typedef struct {
++	iostat_counter_t	iostat_read_histogram[IOSTAT_NCOUNTERS];
++	iostat_counter_t	iostat_write_histogram[IOSTAT_NCOUNTERS];
++	struct timeval		iostat_timeval;	/* time of creation / last reset */
++
++	/* queue depth: how well the pipe is filled up */
++	unsigned long long	iostat_queue_ticks[IOSTAT_NCOUNTERS];
++	unsigned long long	iostat_queue_ticks_sum;
++	unsigned long		iostat_queue_depth;
++	unsigned long		iostat_queue_stamp;
++
++	/* seeks: how linear the traffic is */
++	unsigned long long	iostat_next_sector;
++	unsigned long long	iostat_seek_sectors;
++	unsigned long long	iostat_seeks;
++	unsigned long long	iostat_sectors;
++	unsigned long long	iostat_reqs;
++	unsigned long		iostat_read_reqs;
++	unsigned long		iostat_write_reqs;
++
++	/* process time: how long it takes to process requests */
++	unsigned long		iostat_rtime[IOSTAT_NCOUNTERS];
++	unsigned long		iostat_wtime[IOSTAT_NCOUNTERS];
++
++	/* queue time: how long process spent in elevator's queue */
++	unsigned long		iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
++	unsigned long		iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
++
++	char			iostat_name[32];
++
++	/* must be the last field, as it's used to know size to be memset'ed */
++	spinlock_t		iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
++
++iostat_stats_t **sd_iostats;	/* SD_STATS slots, indexed by scsi_disk index */
++struct proc_dir_entry *sd_iostats_procdir;
++char sd_iostats_procdir_name[] = "sd_iostats";
++
++extern void sd_iostats_init(void);
++extern void sd_iostats_init_disk(struct gendisk *);
++extern void sd_iostats_fini(void);
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
++#else
++static inline void sd_iostats_init(void) {}
++static inline void sd_iostats_init_disk(struct gendisk *disk) {}
++static inline void sd_iostats_fini(void) {}
++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
++#endif
++
+ /*
+  * More than enough for everybody ;)  The huge number of majors
+  * is a leftover from 16bit dev_t days, we don't really need that
+@@ -76,6 +137,7 @@
+  */
+ #define SD_MAX_DISKS	(((26 * 26) + 26 + 1) * 26)
+ 
++#define SD_STATS 256
+ /*
+  * Time out in seconds for disks and Magneto-opticals (which are slower).
+  */
+@@ -278,6 +340,8 @@ static int sd_init_command(struct scsi_c
+ 	SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu\n",
+ 			    disk->disk_name, (unsigned long long)block));
+ 
++	sd_iostats_start_req(SCpnt);
++
+ 	/*
+ 	 * If we have a 1K hardware sectorsize, prevent access to single
+ 	 * 512 byte sectors.  In theory we could handle this - in fact
+@@ -474,6 +538,7 @@ static int sd_open(struct inode *inode, 
+ 		scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
+ 	}
+ 
++	sd_iostats_init_disk(disk);
+ 	return 0;
+ 
+ error_out:
+@@ -849,6 +914,7 @@ static void sd_rw_intr(struct scsi_cmnd 
+ 			break;
+ 		}
+ out:
++	sd_iostats_finish_req(SCpnt);
+ 	scsi_io_completion(SCpnt, good_bytes);
+ }
+ 
+@@ -1575,6 +1643,481 @@ static void sd_shutdown(struct device *d
+ 	sd_sync_cache(sdp);
+ }
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++static int
++sd_iostats_seq_show(struct seq_file *seq, void *v)
++{
++	struct timeval     now;
++	struct gendisk    *disk = seq->private;
++	iostat_stats_t    *stats;
++	unsigned long long read_len;
++	unsigned long long read_len_tot;
++	unsigned long      read_num;
++	unsigned long      read_num_tot;
++	unsigned long long write_len;
++	unsigned long long write_len_tot;
++	unsigned long      write_num;
++	unsigned long      write_num_tot;
++	int                i;
++	int                maxi;
++
++	if (sd_iostats == NULL) {
++		printk(KERN_ERR "sd_iostats_seq_show: NULL stats array\n");
++		BUG();
++	}
++
++	stats = sd_iostats[scsi_disk(disk)->index];
++	if (stats == NULL) {
++		printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
++		BUG();
++	}
++
++	do_gettimeofday(&now);
++	now.tv_sec -= stats->iostat_timeval.tv_sec;
++	now.tv_usec -= stats->iostat_timeval.tv_usec;
++	if (now.tv_usec < 0) {
++		now.tv_usec += 1000000;
++		now.tv_sec--;
++	}
++
++	/* iostat_lock is not taken here: this sampling races with updates */
++	seq_printf(seq, "index:        %lu   snapshot_time:         %lu.%06lu\n",
++		   (unsigned long) scsi_disk(disk)->index,
++		   now.tv_sec, now.tv_usec);
++
++	for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)	/* last used size bucket */
++		if (stats->iostat_read_histogram[i].iostat_count != 0 ||
++		    stats->iostat_write_histogram[i].iostat_count != 0)
++			break;
++	maxi = i;
++
++	seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size",
++		   "reads", "total", "writes", "total");
++
++	read_len_tot = write_len_tot = 0;
++	read_num_tot = write_num_tot = 0;
++	for (i = 0; i <= maxi; i++) {
++		read_len = stats->iostat_read_histogram[i].iostat_size;
++		read_len_tot += read_len;
++		read_num = stats->iostat_read_histogram[i].iostat_count;
++		read_num_tot += read_num;
++
++		write_len = stats->iostat_write_histogram[i].iostat_size;
++		write_len_tot += write_len;
++		write_num = stats->iostat_write_histogram[i].iostat_count;
++		write_num_tot += write_num;
++
++		seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n",
++			    512<<i, read_num, read_len, write_num, write_len);
++	}
++
++	seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total",
++		   read_num_tot, read_len_tot,
++		   write_num_tot, write_len_tot);
++
++	seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%");
++	for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++		unsigned long long ticks, percent;
++		ticks = stats->iostat_queue_ticks[i];
++		if (ticks == 0)
++			continue;
++		percent = stats->iostat_queue_ticks[i] * 100;
++		do_div(percent, stats->iostat_queue_ticks_sum);
++		seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent);
++	}
++
++	if (stats->iostat_reqs != 0) {
++		unsigned long long aveseek = 0, percent = 0;
++
++		if (stats->iostat_seeks) {
++			aveseek = stats->iostat_seek_sectors;
++			do_div(aveseek, stats->iostat_seeks);
++			percent = stats->iostat_seeks * 100;
++			do_div(percent, stats->iostat_reqs);
++		}
++
++		seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over "
++			   "%llu sectors in ave, %llu%% of all reqs\n",
++			   stats->iostat_sectors, stats->iostat_reqs,
++			   stats->iostat_seeks, aveseek, percent);
++	}
++
++	seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads",
++		   "%", "writes", "%");	/* passed via %s, so a single '%' */
++	for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++		unsigned long read_percent = 0, write_percent = 0;
++		if (stats->iostat_wtime[i] == 0 &&
++		    stats->iostat_rtime[i] == 0)
++			continue;
++		if (stats->iostat_read_reqs)
++			read_percent = stats->iostat_rtime[i] * 100 /
++						stats->iostat_read_reqs;
++		if (stats->iostat_write_reqs)
++			write_percent = stats->iostat_wtime[i] * 100 /
++						stats->iostat_write_reqs;
++		seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++			   jiffies_to_msecs(((1UL << i) >> 1) << 1),
++			   stats->iostat_rtime[i], read_percent,
++			   stats->iostat_wtime[i], write_percent);
++	}
++
++	seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads",
++		   "%", "writes", "%");	/* passed via %s, so a single '%' */
++	for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++		unsigned long read_percent = 0, write_percent = 0;
++		if (stats->iostat_wtime_in_queue[i] == 0 &&
++		    stats->iostat_rtime_in_queue[i] == 0)
++			continue;
++		if (stats->iostat_read_reqs)
++			read_percent = stats->iostat_rtime_in_queue[i] * 100 /
++						stats->iostat_read_reqs;
++		if (stats->iostat_write_reqs)
++			write_percent = stats->iostat_wtime_in_queue[i] * 100 /
++						stats->iostat_write_reqs;
++		seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++			   jiffies_to_msecs(((1UL << i) >> 1) << 1),
++			   stats->iostat_rtime_in_queue[i],
++			   read_percent,
++			   stats->iostat_wtime_in_queue[i],
++			   write_percent);
++	}
++
++	return 0;
++}
++
++static void *
++sd_iostats_seq_start(struct seq_file *p, loff_t *pos)
++{
++	return (*pos == 0) ? (void *)1 : NULL;	/* single-record file */
++}
++
++static void *
++sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos)
++{
++	++*pos;
++	return NULL;
++}
++
++static void
++sd_iostats_seq_stop(struct seq_file *p, void *v)
++{
++}
++
++static struct seq_operations sd_iostats_seqops = {
++	.start = sd_iostats_seq_start,
++	.stop  = sd_iostats_seq_stop,
++	.next  = sd_iostats_seq_next,
++	.show  = sd_iostats_seq_show,
++};
++
++static int
++sd_iostats_seq_open (struct inode *inode, struct file *file)
++{
++	int rc;
++
++	rc = seq_open(file, &sd_iostats_seqops);
++	if (rc != 0)
++		return rc;
++
++	((struct seq_file *)file->private_data)->private = PDE(inode)->data;
++	return 0;
++}
++
++static ssize_t
++sd_iostats_seq_write(struct file *file, const char *buffer,
++		     size_t len, loff_t *off)
++{
++	struct seq_file   *seq = file->private_data;
++	struct gendisk    *disk = seq->private;
++	iostat_stats_t    *stats = sd_iostats[scsi_disk(disk)->index];
++	unsigned long      flags;
++	unsigned long      qdepth;
++
++
++	spin_lock_irqsave (&stats->iostat_lock, flags);
++	qdepth = stats->iostat_queue_depth;	/* in-flight count survives reset */
++	memset (stats, 0, offsetof(iostat_stats_t, iostat_lock));
++	do_gettimeofday(&stats->iostat_timeval);
++	stats->iostat_queue_stamp = jiffies;
++	stats->iostat_queue_depth = qdepth;
++	spin_unlock_irqrestore (&stats->iostat_lock, flags);
++
++	return len;	/* any write resets the counters; buffer is not read */
++}
++
++static struct file_operations sd_iostats_proc_fops = {
++	.owner   = THIS_MODULE,
++	.open    = sd_iostats_seq_open,
++	.read    = seq_read,
++	.write   = sd_iostats_seq_write,
++	.llseek  = seq_lseek,
++	.release = seq_release,
++};
++
++extern struct proc_dir_entry *proc_scsi;
++
++void
++sd_iostats_init(void)
++{
++	int    i;
++
++	sd_iostats = kmalloc(SD_STATS * sizeof(iostat_stats_t *), GFP_KERNEL);
++	if (sd_iostats == NULL) {
++		printk(KERN_WARNING "Can't keep sd iostats: "
++		       "ENOMEM allocating stats array size %zu\n",
++		       SD_STATS * sizeof(iostat_stats_t *));
++		return;
++	}
++
++	for (i = 0; i < SD_STATS; i++)
++		sd_iostats[i] = NULL;
++
++	if (proc_scsi == NULL) {
++		printk(KERN_WARNING "No access to sd iostats: "
++		       "proc_scsi is NULL\n");
++		return;
++	}
++
++	sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
++					       S_IFDIR | S_IRUGO | S_IXUGO,
++					       proc_scsi);
++	if (sd_iostats_procdir == NULL) {
++		printk(KERN_WARNING "No access to sd iostats: "
++		       "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
++		return;
++	}
++}
++
++void
++sd_iostats_init_disk(struct gendisk *disk)
++{
++	struct proc_dir_entry *pde;
++	unsigned long          flags;
++	iostat_stats_t        *stats;
++
++	if (sd_iostats == NULL || sd_iostats_procdir == NULL)
++		return;
++
++	if (scsi_disk(disk)->index >= SD_STATS) {	/* valid slots: 0..SD_STATS-1 */
++		printk(KERN_ERR "sd_iostats_init_disk: "
++		       "unexpected disk index %d(%d)\n",
++		       scsi_disk(disk)->index, SD_STATS);
++		return;
++	}
++
++	if (sd_iostats[scsi_disk(disk)->index] != NULL)
++		return;
++
++	stats = kmalloc(sizeof(*stats), GFP_KERNEL);
++	if (stats == NULL) {
++		printk(KERN_WARNING "Can't keep %s iostats: "
++		       "ENOMEM allocating stats size %zu\n",
++		       disk->disk_name, sizeof(*stats));
++		return;
++	}
++
++	memset (stats, 0, sizeof(*stats));
++	do_gettimeofday(&stats->iostat_timeval);
++	stats->iostat_queue_stamp = jiffies;
++	spin_lock_init(&stats->iostat_lock);
++
++
++	spin_lock_irqsave(&stats->iostat_lock, flags);
++
++	if (sd_iostats[scsi_disk(disk)->index] != NULL) {
++		spin_unlock_irqrestore(&stats->iostat_lock, flags);
++		kfree (stats);
++		return;
++	}
++
++	sd_iostats[scsi_disk(disk)->index] = stats;
++
++	spin_unlock_irqrestore(&stats->iostat_lock, flags);
++
++	strncpy(stats->iostat_name, disk->disk_name,	/* stats is zeroed, so */
++		sizeof(stats->iostat_name)-1);		/* the name stays NUL-ended */
++
++	pde = create_proc_entry(stats->iostat_name, S_IRUGO | S_IWUSR,
++				sd_iostats_procdir);
++	if (pde == NULL) {
++		printk(KERN_WARNING "Can't create /proc/scsi/%s/%s\n",
++		       sd_iostats_procdir_name, disk->disk_name);
++	} else {
++		pde->proc_fops = &sd_iostats_proc_fops;
++		pde->data = disk;
++	}
++}
++
++void sd_iostats_fini(void)
++{
++	int i;
++
++	if (sd_iostats == NULL)
++		return;
++
++	for (i = 0; i < SD_STATS; i++) {
++		if (sd_iostats[i] == NULL)
++			continue;
++		if (sd_iostats_procdir != NULL)
++			remove_proc_entry(sd_iostats[i]->iostat_name,
++					  sd_iostats_procdir);
++		kfree(sd_iostats[i]);
++	}
++
++	if (proc_scsi != NULL && sd_iostats_procdir != NULL)
++		remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
++
++	sd_iostats_procdir = NULL;
++	kfree(sd_iostats);
++	sd_iostats = NULL;
++}
++
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
++{
++	struct request		*rq = SCpnt->request;
++	iostat_stats_t		*stats;
++	unsigned long		*tcounter;
++	int			tbucket;
++	int			tmp;
++	unsigned long		irqflags;
++	int			disk, i;
++
++	disk = scsi_disk(rq->rq_disk)->index;
++
++	if (sd_iostats == NULL)
++		return;
++
++	if (disk < 0 || disk >= SD_STATS) {
++		printk(KERN_ERR "sd_iostats_bump: unexpected disk index "
++		       "%d([0-%d])\n", disk, SD_STATS);
++		BUG();
++	}
++
++	stats = sd_iostats[disk];
++	if (stats == NULL)
++		return;
++
++	tmp = jiffies - rq->start_time;	/* start_time was reset in start_req */
++	for (tbucket = 0; tmp > 1; tbucket++)
++		tmp >>= 1;
++	if (tbucket >= IOSTAT_NCOUNTERS)
++		tbucket = IOSTAT_NCOUNTERS - 1;
++
++	tcounter = rq_data_dir(rq) == WRITE ?
++		&stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
++
++	spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++	/* update delay stats */
++	(*tcounter)++;
++
++	/* update queue depth stats; never let the depth wrap below zero */
++	i = stats->iostat_queue_depth;
++	if (i >= IOSTAT_NCOUNTERS)
++		i = IOSTAT_NCOUNTERS - 1;
++	stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++	stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++	if (stats->iostat_queue_depth > 0)	/* req may predate these stats */
++		stats->iostat_queue_depth--;
++
++	/* update seek stats. XXX: not sure about nr_sectors */
++	stats->iostat_sectors += rq->nr_sectors;
++	stats->iostat_reqs++;
++	if (rq->sector != stats->iostat_next_sector) {
++		stats->iostat_seek_sectors +=
++			rq->sector > stats->iostat_next_sector ?
++			rq->sector - stats->iostat_next_sector :
++			stats->iostat_next_sector - rq->sector;
++		stats->iostat_seeks++;
++	}
++	stats->iostat_next_sector = rq->sector + rq->nr_sectors;
++
++	stats->iostat_queue_stamp = jiffies;
++
++	spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt)
++{
++	struct request		*rq = SCpnt->request;
++	iostat_stats_t		*stats;
++	iostat_counter_t	*counter;
++	int			bucket;
++	int			tbucket;
++	int			tmp;
++	unsigned long		irqflags;
++	int			disk, i;
++	int			nsect;
++
++	disk = scsi_disk(rq->rq_disk)->index;
++
++	if (sd_iostats == NULL)
++		return;
++
++	if (disk < 0 || disk >= SD_STATS) {
++		printk(KERN_ERR "sd_iostats_bump: unexpected disk index %d([0-%d])\n",
++		       disk, SD_STATS);
++		BUG();
++	}
++
++	stats = sd_iostats[disk];
++	if (stats == NULL)
++		return;
++
++	nsect = SCpnt->request_bufflen >> 9;	/* bytes -> 512-byte sectors */
++	for (bucket = 0, tmp = nsect; tmp > 1; bucket++)
++		tmp >>= 1;
++
++	if (bucket >= IOSTAT_NCOUNTERS) {
++		printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
++		BUG();
++	}
++
++	counter = rq_data_dir(rq) == WRITE ?
++		&stats->iostat_write_histogram[bucket] :
++		&stats->iostat_read_histogram[bucket];
++
++	tmp = jiffies - rq->start_time;
++	for (tbucket = 0; tmp > 1; tbucket++)
++		tmp >>= 1;
++	if (tbucket >= IOSTAT_NCOUNTERS)
++		tbucket = IOSTAT_NCOUNTERS - 1;
++	/* tbucket: log2 of the ticks elapsed since rq->start_time, clamped */
++
++	/* an ugly hack to know exact processing time. the right
++	 * solution is to add one more field to struct request
++	 * hopefully it will break nothing ... */
++	rq->start_time = jiffies;
++
++	spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++	/* update queue depth stats */
++	i = stats->iostat_queue_depth;
++	if (i >= IOSTAT_NCOUNTERS)
++		i = IOSTAT_NCOUNTERS - 1;
++	stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++	stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++	stats->iostat_queue_depth++;
++
++	/* update delay stats */
++	if (rq_data_dir(rq) == WRITE) {
++		stats->iostat_wtime_in_queue[tbucket]++;
++		stats->iostat_write_reqs++;
++	} else {
++		stats->iostat_rtime_in_queue[tbucket]++;
++		stats->iostat_read_reqs++;
++	}
++
++	/* update size stats */
++	counter->iostat_size += nsect;
++	counter->iostat_count++;
++
++	stats->iostat_queue_stamp = jiffies;
++
++	spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++#endif
++
+ /**
+  *	init_sd - entry point for this driver (both when built in or when
+  *	a module).
+@@ -1584,6 +2127,7 @@ static void sd_shutdown(struct device *d
+ static int __init init_sd(void)
+ {
+ 	int majors = 0, i;
++	int rc = 0;
+ 
+ 	SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
+ 
+@@ -1594,7 +2138,10 @@ static int __init init_sd(void)
+ 	if (!majors)
+ 		return -ENODEV;
+ 
+-	return scsi_register_driver(&sd_template.gendrv);
++	rc = scsi_register_driver(&sd_template.gendrv);
++	if (rc == 0)
++		sd_iostats_init();
++	return rc;
+ }
+ 
+ /**
+@@ -1608,6 +2155,7 @@ static void __exit exit_sd(void)
+ 
+ 	SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));
+ 
++	sd_iostats_fini();
+ 	scsi_unregister_driver(&sd_template.gendrv);
+ 	for (i = 0; i < SD_MAJORS; i++)
+ 		unregister_blkdev(sd_major(i), "sd");
diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series
index 1d46583..421296f 100644
--- a/lustre/kernel_patches/series/2.6-rhel5.series
+++ b/lustre/kernel_patches/series/2.6-rhel5.series
@@ -7,6 +7,6 @@ dev_read_only-2.6.18-vanilla.patch
 export-2.6.18-vanilla.patch
 8kstack-2.6.12.patch
 export-show_task-2.6.18-vanilla.patch
-sd_iostats-2.6-rhel4.patch
+sd_iostats-2.6-rhel5.patch
 export_symbol_numa-2.6-fc5.patch
 jbd-stats-2.6-rhel5.patch
-- 
1.8.3.1