From 67d2886bd7f120e6fdce90cbae11a05be6ac9bb4 Mon Sep 17 00:00:00 2001
From: johann
Date: Thu, 31 Jul 2008 23:30:25 +0000
Subject: [PATCH 1/1] Branch b1_8_gate b=12755,16494,16404 i=bzzz i=adilger cvs add missing patch (for sles9).

---
 .../patches/sd_iostats-2.6-suse.patch | 579 +++++++++++++++++++++
 1 file changed, 579 insertions(+)
 create mode 100644 lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch

diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch b/lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch
new file mode 100644
index 0000000..6d00acd
--- /dev/null
+++ b/lustre/kernel_patches/patches/sd_iostats-2.6-suse.patch
@@ -0,0 +1,579 @@
+Index: linux-2.6.5-7.311/drivers/scsi/Kconfig
+===================================================================
+--- linux-2.6.5-7.311.orig/drivers/scsi/Kconfig
++++ linux-2.6.5-7.311/drivers/scsi/Kconfig
+@@ -67,6 +67,14 @@ config SCSI_DUMP
+ 	  polling I/O. If it doesn't, LKCD will fall back to ordinary
+ 	  interrupt-driven I/O.
+ 
++config SD_IOSTATS
++	bool "Enable SCSI disk I/O stats"
++	depends on BLK_DEV_SD
++	default y
++	---help---
++	  This enables SCSI disk I/O stats collection. You must also enable
++	  /proc file system support if you want this feature.
++
+ config CHR_DEV_ST
+ 	tristate "SCSI tape support"
+ 	depends on SCSI
+Index: linux-2.6.5-7.311/drivers/scsi/scsi_proc.c
+===================================================================
+--- linux-2.6.5-7.311.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.5-7.311/drivers/scsi/scsi_proc.c
+@@ -38,7 +38,8 @@
+ /* 4K page size, but our output routines, use some slack for overruns */
+ #define PROC_BLOCK_SIZE (3*1024)
+ 
+-static struct proc_dir_entry *proc_scsi;
++struct proc_dir_entry *proc_scsi;	/* no longer static: sd.c creates its sd_iostats directory under it */
++EXPORT_SYMBOL(proc_scsi);
+ 
+ /* Protect sht->present and sht->proc_dir */
+ static DECLARE_MUTEX(global_host_template_sem);
+Index: linux-2.6.5-7.311/drivers/scsi/sd.c
+===================================================================
+--- linux-2.6.5-7.311.orig/drivers/scsi/sd.c
++++ linux-2.6.5-7.311/drivers/scsi/sd.c
+@@ -66,6 +66,63 @@
+ #define SD_MAJORS	16
+ #define SD_DISKS	32768	/* anything between 256 and 262144 */
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++# include <linux/proc_fs.h>
++# include <linux/seq_file.h>
++
++typedef struct {
++	unsigned long long	iostat_size;	/* sectors summed over the requests of this bucket */
++	unsigned long long	iostat_count;	/* number of requests counted in this bucket */
++} iostat_counter_t;
++
++#define IOSTAT_NCOUNTERS 16	/* number of buckets in every histogram below */
++typedef struct {
++	iostat_counter_t	iostat_read_histogram[IOSTAT_NCOUNTERS];
++	iostat_counter_t	iostat_write_histogram[IOSTAT_NCOUNTERS];
++	struct timeval		iostat_timeval;	/* set at allocation and on every reset */
++
++	/* queue depth: how well the pipe is filled up */
++	unsigned long long	iostat_queue_ticks[IOSTAT_NCOUNTERS];
++	unsigned long long	iostat_queue_ticks_sum;
++	unsigned long		iostat_queue_depth;	/* requests started but not yet finished */
++	unsigned long		iostat_queue_stamp;	/* jiffies of the last depth accounting */
++
++	/* seeks: how linear the traffic is */
++	unsigned long long	iostat_next_sector;	/* sector following the last finished request */
++	unsigned long long	iostat_seek_sectors;
++	unsigned long long	iostat_seeks;
++	unsigned long long	iostat_sectors;
++	unsigned long long	iostat_reqs;
++	unsigned long		iostat_read_reqs;
++	unsigned long		iostat_write_reqs;
++
++	/* process time: how long it takes to process requests */
++	unsigned long		iostat_rtime[IOSTAT_NCOUNTERS];
++	unsigned long		iostat_wtime[IOSTAT_NCOUNTERS];
++
++	/* queue time: how long process spent in elevator's queue */
++	unsigned long		iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
++	unsigned long		iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
++
++	/* must be the last field, as it's used to know size to be memset'ed */
++	spinlock_t		iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
++
++struct proc_dir_entry *sd_iostats_procdir = NULL;	/* stays NULL when the directory could not be created */
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;	/* tentative definition; initialised further down */
++
++extern void sd_iostats_init(void);
++extern void sd_iostats_fini(void);
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
++#else	/* statistics compiled out: every hook becomes an empty inline */
++static inline void sd_iostats_init(void) {}
++static inline void sd_iostats_fini(void) {}
++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
++#endif
++
+ /*
+  * Time out in seconds for disks and Magneto-opticals (which are slower).
+  */
+@@ -96,6 +153,9 @@ struct scsi_disk {
+ 	u8		device_ready;
+ 	unsigned	WCE : 1;	/* state of disk WCE bit */
+ 	unsigned	RCD : 1;	/* state of disk RCD bit, unused */
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++	iostat_stats_t	*stats;		/* scsi disk statistics; NULL when collection is off for this disk */
++#endif
+ };
+ 
+ 
+@@ -384,6 +444,8 @@ queue:
+ 	SCpnt->allowed = SD_MAX_RETRIES;
+ 	SCpnt->timeout_per_command = timeout;
+ 
++	sd_iostats_start_req(SCpnt);
++
+ 	/*
+ 	 * This is the completion routine we use.  This is matched in terms
+ 	 * of capability to this function.
+@@ -884,6 +946,9 @@ static void sd_rw_intr(struct scsi_cmnd
+ 			break;
+ 		}
+ 	}
++
++	sd_iostats_finish_req(SCpnt);
++
+ 	/*
+ 	 * This calls the generic completion function, now that we know
+ 	 * how many actual sectors finished, and how many sectors we need
+@@ -1527,6 +1592,36 @@ static int sd_probe(struct device *dev)
+ 	if (!sdkp->device_ready || sdp->host->no_partition_check)
+ 		gd->flags |= GENHD_FL_NO_PARTITION_CHECK;
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++	sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);	/* failure only disables stats for this disk */
++	if (!sdkp->stats) {
++		printk(KERN_WARNING "cannot allocate iostat structure for "
++				    "%s\n", gd->disk_name);
++	} else {
++		do_gettimeofday(&sdkp->stats->iostat_timeval);
++		sdkp->stats->iostat_queue_stamp = jiffies;
++		spin_lock_init(&sdkp->stats->iostat_lock);
++		if (sd_iostats_procdir) {
++			struct proc_dir_entry *pde;
++			pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
++						sd_iostats_procdir);
++			if (!pde) {
++				printk(KERN_WARNING "Can't create /proc/scsi/"
++						    "%s/%s\n",
++						    sd_iostats_procdir_name,
++						    gd->disk_name);
++				kfree(sdkp->stats);
++				sdkp->stats = NULL;
++			} else {
++				pde->proc_fops = &sd_iostats_proc_fops;
++				pde->data = gd;	/* read back through PDE(inode)->data in the open handler */
++			}
++		} else {	/* no /proc/scsi/sd_iostats directory: nothing could read the counters */
++			kfree(sdkp->stats);
++			sdkp->stats = NULL;
++		}
++	}
++#endif
+ 	dev_set_drvdata(dev, sdkp);
+ 	add_disk(gd);
+ 
+@@ -1574,7 +1669,14 @@ static int sd_remove(struct device *dev)
+ static void scsi_disk_release(struct kobject *kobj)
+ {
+ 	struct scsi_disk *sdkp = to_scsi_disk(kobj);
+-	
++
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++	if (sdkp->stats) {	/* non-NULL only when sd_probe() also created the proc entry */
++		remove_proc_entry(sdkp->disk->disk_name, sd_iostats_procdir);
++		kfree(sdkp->stats);
++		sdkp->stats = NULL;
++	}
++#endif
+ 	put_disk(sdkp->disk);
+ 
+ 	spin_lock(&sd_index_lock);
+@@ -1605,6 +1707,366 @@ static void sd_shutdown(struct device *d
+ 	sd_sync_cache(sdp);
+ }
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++static int
++sd_iostats_seq_show(struct seq_file *seq, void *v)	/* prints every table of one disk; v is unused */
++{
++	struct timeval now;
++	struct gendisk *disk = seq->private;
++	iostat_stats_t *stats;
++	unsigned long long read_len;
++	unsigned long long read_len_tot;
++	unsigned long read_num;
++	unsigned long read_num_tot;
++	unsigned long long write_len;
++	unsigned long long write_len_tot;
++	unsigned long write_num;
++	unsigned long write_num_tot;
++	int i;
++	int maxi;
++
++	stats = scsi_disk(disk)->stats;
++	if (stats == NULL) {
++		printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
++		BUG();
++	}
++
++	do_gettimeofday(&now);	/* now becomes the time elapsed since iostat_timeval */
++	now.tv_sec -= stats->iostat_timeval.tv_sec;
++	now.tv_usec -= stats->iostat_timeval.tv_usec;
++	if (now.tv_usec < 0) {
++		now.tv_usec += 1000000;
++		now.tv_sec--;
++	}
++
++	/* this sampling races with updates: counters are read without iostat_lock */
++	seq_printf(seq, "index:        %lu   snapshot_time:         %lu.%06lu\n",
++		   (unsigned long) scsi_disk(disk)->index,
++		   now.tv_sec, now.tv_usec);
++
++	for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)	/* find the highest size bucket in use */
++		if (stats->iostat_read_histogram[i].iostat_count != 0 ||
++		    stats->iostat_write_histogram[i].iostat_count != 0)
++			break;
++	maxi = i;
++
++	seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size",
++		   "reads", "total", "writes", "total");
++
++	read_len_tot = write_len_tot = 0;
++	read_num_tot = write_num_tot = 0;
++	for (i = 0; i <= maxi; i++) {
++		read_len = stats->iostat_read_histogram[i].iostat_size;
++		read_len_tot += read_len;
++		read_num = stats->iostat_read_histogram[i].iostat_count;
++		read_num_tot += read_num;
++
++		write_len = stats->iostat_write_histogram[i].iostat_size;
++		write_len_tot += write_len;
++		write_num = stats->iostat_write_histogram[i].iostat_count;
++		write_num_tot += write_num;
++
++		seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n",
++			    512<<i, read_num, read_len, write_num, write_len);
++	}
++
++	seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total",
++		   read_num_tot, read_len_tot,
++		   write_num_tot, write_len_tot);
++
++	seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%");
++	for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++		unsigned long long ticks, percent;
++		ticks = stats->iostat_queue_ticks[i];
++		if (ticks == 0)
++			continue;
++		percent = stats->iostat_queue_ticks[i] * 100;
++		do_div(percent, stats->iostat_queue_ticks_sum);
++		seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent);
++	}
++
++	if (stats->iostat_reqs != 0) {
++		unsigned long long aveseek = 0, percent = 0;
++
++		if (stats->iostat_seeks) {
++			aveseek = stats->iostat_seek_sectors;
++			do_div(aveseek, stats->iostat_seeks);
++			percent = stats->iostat_seeks * 100;
++			do_div(percent, stats->iostat_reqs);
++		}
++
++		seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over "
++			   "%llu sectors in ave, %llu%% of all reqs\n",
++			   stats->iostat_sectors, stats->iostat_reqs,
++			   stats->iostat_seeks, aveseek, percent);
++	}
++
++	seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads",
++		   "%", "writes", "%");	/* a %s argument is printed verbatim, so "%" must not be doubled */
++	for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++		unsigned long read_percent = 0, write_percent = 0;
++		if (stats->iostat_wtime[i] == 0 &&
++		    stats->iostat_rtime[i] == 0)
++			continue;
++		if (stats->iostat_read_reqs)
++			read_percent = stats->iostat_rtime[i] * 100 /
++						stats->iostat_read_reqs;
++		if (stats->iostat_write_reqs)
++			write_percent = stats->iostat_wtime[i] * 100 /
++						stats->iostat_write_reqs;
++		seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++			   jiffies_to_msecs(((1UL << i) >> 1) << 1),
++			   stats->iostat_rtime[i], read_percent,
++			   stats->iostat_wtime[i], write_percent);
++	}
++
++	seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads",
++		   "%", "writes", "%");	/* same as above: literal "%" passed through %s */
++	for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++		unsigned long read_percent = 0, write_percent = 0;
++		if (stats->iostat_wtime_in_queue[i] == 0 &&
++		    stats->iostat_rtime_in_queue[i] == 0)
++			continue;
++		if (stats->iostat_read_reqs)
++			read_percent = stats->iostat_rtime_in_queue[i] * 100 /
++						stats->iostat_read_reqs;
++		if (stats->iostat_write_reqs)
++			write_percent = stats->iostat_wtime_in_queue[i] * 100 /
++						stats->iostat_write_reqs;
++		seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++			   jiffies_to_msecs(((1UL << i) >> 1) << 1),
++			   stats->iostat_rtime_in_queue[i],
++			   read_percent,
++			   stats->iostat_wtime_in_queue[i],
++			   write_percent);
++	}
++
++	return 0;
++}
++
++static void *
++sd_iostats_seq_start(struct seq_file *p, loff_t *pos)	/* only position 0 yields a (dummy) record */
++{
++	return (*pos == 0) ? (void *)1 : NULL;
++}
++
++static void *
++sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos)	/* there is never a second record */
++{
++	++*pos;
++	return NULL;
++}
++
++static void
++sd_iostats_seq_stop(struct seq_file *p, void *v)	/* start() acquires nothing, so nothing to release */
++{
++}
++
++static struct seq_operations sd_iostats_seqops = {
++	.start = sd_iostats_seq_start,
++	.stop  = sd_iostats_seq_stop,
++	.next  = sd_iostats_seq_next,
++	.show  = sd_iostats_seq_show,
++};
++
++static int
++sd_iostats_seq_open (struct inode *inode, struct file *file)
++{
++	int rc;
++
++	rc = seq_open(file, &sd_iostats_seqops);
++	if (rc != 0)
++		return rc;
++
++	((struct seq_file *)file->private_data)->private = PDE(inode)->data;	/* the gendisk stored by sd_probe() */
++	return 0;
++}
++
++static ssize_t
++sd_iostats_seq_write(struct file *file, const char *buffer,	/* any write resets the counters; buffer is never read */
++		     size_t len, loff_t *off)
++{
++	struct seq_file *seq = file->private_data;
++	struct gendisk *disk = seq->private;
++	iostat_stats_t *stats = scsi_disk(disk)->stats;
++	unsigned long flags;
++	unsigned long qdepth;
++
++
++	spin_lock_irqsave (&stats->iostat_lock, flags);
++	qdepth = stats->iostat_queue_depth;	/* in-flight count is saved here and restored after the memset */
++	memset (stats, 0, offsetof(iostat_stats_t, iostat_lock));
++	do_gettimeofday(&stats->iostat_timeval);
++	stats->iostat_queue_stamp = jiffies;
++	stats->iostat_queue_depth = qdepth;
++	spin_unlock_irqrestore (&stats->iostat_lock, flags);
++
++	return len;
++}
++
++static struct file_operations sd_iostats_proc_fops = {
++	.owner   = THIS_MODULE,
++	.open    = sd_iostats_seq_open,
++	.read    = seq_read,
++	.write   = sd_iostats_seq_write,
++	.llseek  = seq_lseek,
++	.release = seq_release,
++};
++
++extern struct proc_dir_entry *proc_scsi;
++
++void
++sd_iostats_init(void)	/* creates /proc/scsi/sd_iostats; on failure only warns and leaves sd_iostats_procdir NULL */
++{
++	if (proc_scsi == NULL) {
++		printk(KERN_WARNING "No access to sd iostats: "
++		       "proc_scsi is NULL\n");
++		return;
++	}
++
++	sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
++					       S_IFDIR | S_IRUGO | S_IXUGO,
++					       proc_scsi);
++	if (sd_iostats_procdir == NULL) {
++		printk(KERN_WARNING "No access to sd iostats: "
++		       "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
++		return;
++	}
++}
++
++void sd_iostats_fini(void)	/* removes the directory if it exists, then clears the pointer */
++{
++	if (proc_scsi != NULL && sd_iostats_procdir != NULL)
++		remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
++
++	sd_iostats_procdir = NULL;
++}
++
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)	/* completion hook, called from sd_rw_intr() */
++{
++	struct request		*rq = SCpnt->request;
++	iostat_stats_t		*stats;
++	unsigned long		*tcounter;
++	int			tbucket;
++	int			tmp;
++	unsigned long		irqflags;
++	unsigned long		i;
++
++	stats = scsi_disk(rq->rq_disk)->stats;
++	if (stats == NULL)
++		return;
++
++	tmp = jiffies - rq->start_time;	/* ticks since sd_iostats_start_req() restamped the request */
++	for (tbucket = 0; tmp > 1; tbucket++)	/* tbucket = floor(log2(tmp)), 0 when tmp <= 1 */
++		tmp >>= 1;
++	if (tbucket >= IOSTAT_NCOUNTERS)
++		tbucket = IOSTAT_NCOUNTERS - 1;
++	//printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket);
++
++	tcounter = rq_data_dir(rq) == WRITE ?
++		&stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
++
++	spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++	/* update delay stats */
++	(*tcounter)++;
++
++	/* update queue depth stats */
++	i = stats->iostat_queue_depth;
++	if (i >= IOSTAT_NCOUNTERS)
++		i = IOSTAT_NCOUNTERS - 1;
++	stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++	stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++	BUG_ON(stats->iostat_queue_depth == 0);
++	stats->iostat_queue_depth--;
++
++	/* update seek stats. XXX: not sure about nr_sectors */
++	stats->iostat_sectors += rq->nr_sectors;
++	stats->iostat_reqs++;
++	if (rq->sector != stats->iostat_next_sector) {
++		stats->iostat_seek_sectors +=
++			rq->sector > stats->iostat_next_sector ?
++			rq->sector - stats->iostat_next_sector :
++			stats->iostat_next_sector - rq->sector;
++		stats->iostat_seeks++;
++	}
++	stats->iostat_next_sector = rq->sector + rq->nr_sectors;
++
++	stats->iostat_queue_stamp = jiffies;
++
++	spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt)	/* submission hook, called once the command is set up */
++{
++	struct request		*rq = SCpnt->request;
++	iostat_stats_t		*stats;
++	iostat_counter_t	*counter;
++	int			bucket;
++	int			tbucket;
++	int			tmp;
++	unsigned long		irqflags;
++	unsigned long		i;
++	int			nsect;
++
++	stats = scsi_disk(rq->rq_disk)->stats;
++	if (stats == NULL)
++		return;
++
++	nsect = SCpnt->request_bufflen >> 9;	/* bytes to 512-byte sectors */
++	for (bucket = 0, tmp = nsect; tmp > 1; bucket++)	/* bucket = floor(log2(nsect)), 0 when nsect <= 1 */
++		tmp >>= 1;
++
++	if (bucket >= IOSTAT_NCOUNTERS) {
++		printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
++		BUG();
++	}
++
++	counter = rq_data_dir(rq) == WRITE ?
++		&stats->iostat_write_histogram[bucket] :
++		&stats->iostat_read_histogram[bucket];
++
++	tmp = jiffies - rq->start_time;	/* ticks elapsed since the start_time already stored in rq */
++	for (tbucket = 0; tmp > 1; tbucket++)
++		tmp >>= 1;
++	if (tbucket >= IOSTAT_NCOUNTERS)
++		tbucket = IOSTAT_NCOUNTERS - 1;
++	//printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket);
++
++	/* an ugly hack to know exact processing time. the right
++	 * solution is to add one more field to struct request
++	 * hopefully it will break nothing ... */
++	rq->start_time = jiffies;
++
++	spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++	/* update queue depth stats */
++	i = stats->iostat_queue_depth;
++	if (i >= IOSTAT_NCOUNTERS)
++		i = IOSTAT_NCOUNTERS - 1;
++	stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp;
++	stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp;
++	stats->iostat_queue_depth++;
++
++	/* update delay stats */
++	if (rq_data_dir(rq) == WRITE) {
++		stats->iostat_wtime_in_queue[tbucket]++;
++		stats->iostat_write_reqs++;
++	} else {
++		stats->iostat_rtime_in_queue[tbucket]++;
++		stats->iostat_read_reqs++;
++	}
++
++	/* update size stats */
++	counter->iostat_size += nsect;
++	counter->iostat_count++;
++
++	stats->iostat_queue_stamp = jiffies;
++
++	spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++#endif
++
+ /**
+  *	init_sd - entry point for this driver (both when built in or when
+  *	a module).
+@@ -1614,6 +2076,7 @@ static void sd_shutdown(struct device *d
+ static int __init init_sd(void)
+ {
+ 	int majors = 0, i;
++	int rc = 0;
+ 
+ 	SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
+ 
+@@ -1624,7 +2087,11 @@ static int __init init_sd(void)
+ 	if (!majors)
+ 		return -ENODEV;
+ 
+-	return scsi_register_driver(&sd_template.gendrv);
++	sd_iostats_init();	/* called before registration; sd_probe() reads sd_iostats_procdir */
++	rc = scsi_register_driver(&sd_template.gendrv);
++	if (rc)
++		sd_iostats_fini();
++	return rc;
+ }
+ 
+ /**
+@@ -1641,6 +2108,7 @@ static void __exit exit_sd(void)
+ 	scsi_unregister_driver(&sd_template.gendrv);
+ 	for (i = 0; i < SD_MAJORS; i++)
+ 		unregister_blkdev(sd_major(i), "sd");
++	sd_iostats_fini();
+ }
+ 
+ MODULE_LICENSE("GPL");
-- 
1.8.3.1