From 314b7595afcea60905037b585bc2fdcdecca05a0 Mon Sep 17 00:00:00 2001 From: yangsheng Date: Fri, 22 May 2009 05:20:49 +0000 Subject: [PATCH] Branch b1_8 b=18668 i=shadow, johann SLES11 kernel patches. --- .../patches/dev_read_only-2.6.27-vanilla.patch | 145 ++++++ .../patches/export-2.6.27-vanilla.patch | 37 ++ .../patches/export-show_task-2.6.27-vanilla.patch | 12 + .../patches/md-mmp-unplug-dev-sles11.patch | 22 + .../quota-support-64-bit-quota-format.patch | 282 ++++++++++ .../patches/sd_iostats-2.6.27-vanilla.patch | 579 +++++++++++++++++++++ 6 files changed, 1077 insertions(+) create mode 100644 lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch create mode 100644 lustre/kernel_patches/patches/export-2.6.27-vanilla.patch create mode 100644 lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch create mode 100644 lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch create mode 100644 lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch create mode 100644 lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch new file mode 100644 index 0000000..0ea4b21 --- /dev/null +++ b/lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch @@ -0,0 +1,145 @@ +Index: linux-2.6.22.5/block/blk-core.c +=================================================================== +--- linux-2.6.22.5.orig/block/blk-core.c ++++ linux-2.6.22.5/block/blk-core.c +@@ -3101,6 +3101,8 @@ static inline int should_fail_request(st + + #endif /* CONFIG_FAIL_MAKE_REQUEST */ + ++int dev_check_rdonly(struct block_device *bdev); ++ + /* + * Check whether this bio extends beyond the end of the device. + */ +@@ -3185,6 +3187,12 @@ end_io: + + if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) + goto end_io; ++ /* this is cfs's dev_rdonly check */ ++ if (bio->bi_rw == WRITE && ++ dev_check_rdonly(bio->bi_bdev)) { ++ bio_endio(bio, 0); ++ break; ++ } + + if (should_fail_request(bio)) + goto end_io; +@@ -3850,6 +3858,91 @@ void swap_io_context(struct io_context * + } + EXPORT_SYMBOL(kblockd_flush_work); + ++ /* ++ * Debug code for turning block devices "read-only" (will discard writes ++ * silently). This is for filesystem crash/recovery testing. ++ */ ++struct deventry { ++ dev_t dev; ++ struct deventry *next; ++}; ++ ++static struct deventry *devlist = NULL; ++static spinlock_t devlock = SPIN_LOCK_UNLOCKED; ++ ++int dev_check_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur; ++ if (!bdev) return 0; ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ return 1; ++ } ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++ return 0; ++} ++ ++void dev_set_rdonly(struct block_device *bdev) ++{ ++ struct deventry *newdev, *cur; ++ ++ if (!bdev) ++ return; ++ newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL); ++ if (!newdev) ++ return; ++ ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ spin_unlock(&devlock); ++ kfree(newdev); ++ return; ++ } ++ cur = cur->next; ++ } ++ newdev->dev = bdev->bd_dev; ++ newdev->next = devlist; ++ devlist = newdev; ++ spin_unlock(&devlock); ++ printk(KERN_WARNING "Turning device %s (%#x) read-only\n", ++ bdev->bd_disk ? 
bdev->bd_disk->disk_name : "", bdev->bd_dev); ++} ++ ++void dev_clear_rdonly(struct block_device *bdev) ++{ ++ struct deventry *cur, *last = NULL; ++ if (!bdev) return; ++ spin_lock(&devlock); ++ cur = devlist; ++ while(cur) { ++ if (bdev->bd_dev == cur->dev) { ++ if (last) ++ last->next = cur->next; ++ else ++ devlist = cur->next; ++ spin_unlock(&devlock); ++ kfree(cur); ++ printk(KERN_WARNING "Removing read-only on %s (%#x)\n", ++ bdev->bd_disk ? bdev->bd_disk->disk_name : ++ "unknown block", bdev->bd_dev); ++ return; ++ } ++ last = cur; ++ cur = cur->next; ++ } ++ spin_unlock(&devlock); ++} ++ ++EXPORT_SYMBOL(dev_set_rdonly); ++EXPORT_SYMBOL(dev_clear_rdonly); ++EXPORT_SYMBOL(dev_check_rdonly); + int __init blk_dev_init(void) + { + int i; +Index: linux-2.6.22.5/fs/block_dev.c +=================================================================== +--- linux-2.6.22.5.orig/fs/block_dev.c ++++ linux-2.6.22.5/fs/block_dev.c +@@ -1294,6 +1294,7 @@ static int __blkdev_put(struct block_dev + if (bdev != bdev->bd_contains) + victim = bdev->bd_contains; + bdev->bd_contains = NULL; ++ dev_clear_rdonly(bdev); + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); +Index: linux-2.6.22.5/include/linux/fs.h +=================================================================== +--- linux-2.6.22.5.orig/include/linux/fs.h ++++ linux-2.6.22.5/include/linux/fs.h +@@ -1744,6 +1744,10 @@ struct bio; + extern void submit_bio(int, struct bio *); + extern int bdev_read_only(struct block_device *); + #endif ++#define HAVE_CLEAR_RDONLY_ON_PUT ++extern void dev_set_rdonly(struct block_device *bdev); ++extern int dev_check_rdonly(struct block_device *bdev); ++extern void dev_clear_rdonly(struct block_device *bdev); + extern int set_blocksize(struct block_device *, int); + extern int sb_set_blocksize(struct super_block *, int); + extern int sb_min_blocksize(struct super_block *, int); diff --git a/lustre/kernel_patches/patches/export-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/export-2.6.27-vanilla.patch new file mode 100644 index 0000000..0a979c3 --- /dev/null +++ b/lustre/kernel_patches/patches/export-2.6.27-vanilla.patch @@ -0,0 +1,37 @@ +Index: linux-2.6/fs/jbd/journal.c +=================================================================== +--- linux-2.6.orig/fs/jbd2/journal.c 2006-07-15 16:13:50.000000000 +0800 ++++ linux-2.6/fs/jbd2/journal.c 2006-07-15 16:22:04.000000000 +0800 +@@ -74,6 +74,7 @@ EXPORT_SYMBOL(journal_abort); + spin_unlock(&journal->j_state_lock); + return ret; + } ++EXPORT_SYMBOL(jbd2_log_start_commit); + + /* + * Force and wait upon a commit if the calling process is not within +Index: linux-2.6/kernel/sys.c +=================================================================== +--- linux-2.6.orig/kernel/sys.c 2006-07-15 16:13:50.000000000 +0800 ++++ linux-2.6/kernel/sys.c 2006-07-15 16:22:04.000000000 +0800 +@@ -74,6 +74,8 @@ + EXPORT_SYMBOL(in_egroup_p); + + DECLARE_RWSEM(uts_sem); ++ ++EXPORT_SYMBOL(uts_sem); + + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) + { +Index: linux-2.6/kernel/sys.c +=================================================================== +--- linux-2.6.orig/security/security.c ++++ linux-2.6/security/security.c +@@ -74,6 +74,7 @@ + return 0; + return security_ops->inode_unlink(dir, dentry, mnt); + } ++EXPORT_SYMBOL(security_inode_unlink); + + int security_inode_symlink(struct inode *dir, struct dentry *dentry, + struct vfsmount *mnt, const char *old_name) diff --git a/lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch 
b/lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch new file mode 100644 index 0000000..d9b287e --- /dev/null +++ b/lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch @@ -0,0 +1,12 @@ +Index: linux-2.6/kernel/sched.c +=================================================================== +--- linux-2.6.orig/kernel/sched.c 2006-07-15 11:51:46.000000000 +0800 ++++ linux-2.6/kernel/sched.c 2006-07-15 16:24:35.000000000 +0800 +@@ -4652,6 +4652,7 @@ static inline struct task_struct *younge + + show_stack(p, NULL); + } ++EXPORT_SYMBOL(sched_show_task); + + void show_state_filter(unsigned long state_filter) + { diff --git a/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch new file mode 100644 index 0000000..f9f84b1 --- /dev/null +++ b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch @@ -0,0 +1,22 @@ +Index: linux-2.6.16.60-0.33/drivers/md/raid5.c +=================================================================== +--- linux-2.6.16.60-0.33.orig/drivers/md/raid5.c ++++ linux-2.6.16.60-0.33/drivers/md/raid5.c +@@ -900,6 +900,8 @@ static int add_stripe_bio(struct stripe_ + bi->bi_next = *bip; + *bip = bi; + bi->bi_phys_segments++; ++ if (bio_sync(bi) && !forwrite) ++ clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */ + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + +@@ -1617,6 +1619,8 @@ static int make_request (request_queue_t + + bi->bi_end_io(bi, bytes, 0); + } ++ if (bio_sync(bi)) ++ raid5_unplug_device(q); + return 0; + } + diff --git a/lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch b/lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch new file mode 100644 index 0000000..14fe9a8 --- /dev/null +++ b/lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch @@ -0,0 +1,282 @@ +From: Jan Kara + +Implement conversion functions for new version (version 1) of quota format +which supports 64-bit block and inode limits and 64-bit inode usage. The +original implementation has been written by Andrew Perepechko. 
+ +Signed-off-by: Andrew Perepechko +Signed-off-by: Jan Kara +Signed-off-by: Andrew Morton +--- + + fs/quota_v2.c | 140 ++++++++++++++++++++++++++++++++++++---------- + fs/quotaio_v2.h | 26 ++++++-- + 2 files changed, 132 insertions(+), 34 deletions(-) + +diff -puN fs/quota_v2.c~quota-support-64-bit-quota-format fs/quota_v2.c +--- a/fs/quota_v2.c~quota-support-64-bit-quota-format ++++ a/fs/quota_v2.c +@@ -23,14 +23,24 @@ MODULE_LICENSE("GPL"); + + #define __QUOTA_V2_PARANOIA + +-static void v2_mem2diskdqb(void *dp, struct dquot *dquot); +-static void v2_disk2memdqb(struct dquot *dquot, void *dp); +-static int v2_is_id(void *dp, struct dquot *dquot); +- +-static struct qtree_fmt_operations v2_qtree_ops = { +- .mem2disk_dqblk = v2_mem2diskdqb, +- .disk2mem_dqblk = v2_disk2memdqb, +- .is_id = v2_is_id, ++static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot); ++static void v2r0_disk2memdqb(struct dquot *dquot, void *dp); ++static int v2r0_is_id(void *dp, struct dquot *dquot); ++ ++static struct qtree_fmt_operations v2r0_qtree_ops = { ++ .mem2disk_dqblk = v2r0_mem2diskdqb, ++ .disk2mem_dqblk = v2r0_disk2memdqb, ++ .is_id = v2r0_is_id, ++}; ++ ++static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); ++static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); ++static int v2r1_is_id(void *dp, struct dquot *dquot); ++ ++static struct qtree_fmt_operations v2r1_qtree_ops = { ++ .mem2disk_dqblk = v2r1_mem2diskdqb, ++ .disk2mem_dqblk = v2r1_disk2memdqb, ++ .is_id = v2r1_is_id, + }; + + #define QUOTABLOCK_BITS 10 +@@ -46,8 +56,7 @@ static inline qsize_t v2_qbtos(qsize_t b + return blocks << QUOTABLOCK_BITS; + } + +-/* Check whether given file is really vfsv0 quotafile */ +-static int v2_check_quota_file(struct super_block *sb, int type) ++static int v2_check_quota_file_header(struct super_block *sb, int type) + { + struct v2_disk_dqheader dqhead; + ssize_t size; +@@ -58,12 +67,20 @@ static int v2_check_quota_file(struct su + if (size != sizeof(struct v2_disk_dqheader)) { + printk("quota_v2: failed read expected=%zd got=%zd\n", + sizeof(struct v2_disk_dqheader), size); +- return 0; ++ return -EIO; + } +- if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || +- le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) +- return 0; +- return 1; ++ if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type]) ++ return -ENOENT; ++ if (le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) ++ return -EOPNOTSUPP; ++ return le32_to_cpu(dqhead.dqh_version); ++} ++ ++ ++/* Check whether given file is really vfsv0 quotafile */ ++static int v2_check_quota_file(struct super_block *sb, int type) ++{ ++ return v2_check_quota_file_header(sb, type) >= 0; + } + + /* Read information header from quota file */ +@@ -73,7 +90,13 @@ static int v2_read_file_info(struct supe + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo; + ssize_t size; ++ int version = v2_check_quota_file_header(sb, type); + ++ if (version < 0) { ++ printk(KERN_WARNING "Cannot identify quota file version on " ++ "device %s: %d\n", sb->s_id, version); ++ return -1; ++ } + size = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + if (size != sizeof(struct v2_disk_dqinfo)) { +@@ -88,9 +111,14 @@ static int v2_read_file_info(struct supe + return -1; + } + qinfo = info->dqi_priv; +- /* limits are stored as unsigned 32-bit data */ +- info->dqi_maxblimit = 0xffffffff; +- info->dqi_maxilimit = 0xffffffff; ++ if (version == 0) { ++ /* limits are stored as unsigned 32-bit 
data */ ++ info->dqi_maxblimit = 0xffffffff; ++ info->dqi_maxilimit = 0xffffffff; ++ } else { ++ info->dqi_maxblimit = 0x7fffffffffffffffULL; ++ info->dqi_maxilimit = 0x7fffffffffffffffULL; ++ } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); +@@ -102,8 +130,13 @@ static int v2_read_file_info(struct supe + qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; + qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; + qinfo->dqi_qtree_depth = qtree_depth(qinfo); +- qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); +- qinfo->dqi_ops = &v2_qtree_ops; ++ if (version == 0) { ++ qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk); ++ qinfo->dqi_ops = &v2r0_qtree_ops; ++ } else { ++ qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); ++ qinfo->dqi_ops = &v2r1_qtree_ops; ++ } + return 0; + } + +@@ -134,9 +167,9 @@ static int v2_write_file_info(struct sup + return 0; + } + +-static void v2_disk2memdqb(struct dquot *dquot, void *dp) ++static void v2r0_disk2memdqb(struct dquot *dquot, void *dp) + { +- struct v2_disk_dqblk *d = dp, empty; ++ struct v2r0_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); +@@ -148,15 +181,15 @@ static void v2_disk2memdqb(struct dquot + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ +- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); ++ memset(&empty, 0, sizeof(struct v2r0_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); +- if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) ++ if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk))) + m->dqb_itime = 0; + } + +-static void v2_mem2diskdqb(void *dp, struct dquot *dquot) ++static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot) + { +- struct v2_disk_dqblk *d = dp; ++ struct v2r0_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; +@@ -174,9 +207,60 @@ static void v2_mem2diskdqb(void *dp, str + d->dqb_itime = cpu_to_le64(1); + } + +-static int v2_is_id(void *dp, struct dquot *dquot) ++static int v2r0_is_id(void *dp, struct dquot *dquot) ++{ ++ struct v2r0_disk_dqblk *d = dp; ++ struct qtree_mem_dqinfo *info = ++ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; ++ ++ if (qtree_entry_unused(info, dp)) ++ return 0; ++ return le32_to_cpu(d->dqb_id) == dquot->dq_id; ++} ++ ++static void v2r1_disk2memdqb(struct dquot *dquot, void *dp) ++{ ++ struct v2r1_disk_dqblk *d = dp, empty; ++ struct mem_dqblk *m = &dquot->dq_dqb; ++ ++ m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); ++ m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); ++ m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); ++ m->dqb_itime = le64_to_cpu(d->dqb_itime); ++ m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit)); ++ m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit)); ++ m->dqb_curspace = le64_to_cpu(d->dqb_curspace); ++ m->dqb_btime = le64_to_cpu(d->dqb_btime); ++ /* We need to escape back all-zero structure */ ++ memset(&empty, 0, sizeof(struct v2r1_disk_dqblk)); ++ empty.dqb_itime = cpu_to_le64(1); ++ if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk))) ++ m->dqb_itime = 0; ++} ++ ++static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot) ++{ ++ struct v2r1_disk_dqblk *d = dp; ++ struct mem_dqblk *m = &dquot->dq_dqb; ++ struct qtree_mem_dqinfo *info = ++ 
sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; ++ ++ d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); ++ d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); ++ d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); ++ d->dqb_itime = cpu_to_le64(m->dqb_itime); ++ d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit)); ++ d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit)); ++ d->dqb_curspace = cpu_to_le64(m->dqb_curspace); ++ d->dqb_btime = cpu_to_le64(m->dqb_btime); ++ d->dqb_id = cpu_to_le32(dquot->dq_id); ++ if (qtree_entry_unused(info, dp)) ++ d->dqb_itime = cpu_to_le64(1); ++} ++ ++static int v2r1_is_id(void *dp, struct dquot *dquot) + { +- struct v2_disk_dqblk *d = dp; ++ struct v2r1_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + +diff -puN fs/quotaio_v2.h~quota-support-64-bit-quota-format fs/quotaio_v2.h +--- a/fs/quotaio_v2.h~quota-support-64-bit-quota-format ++++ a/fs/quotaio_v2.h +@@ -17,8 +17,8 @@ + } + + #define V2_INITQVERSIONS {\ +- 0, /* USRQUOTA */\ +- 0 /* GRPQUOTA */\ ++ 1, /* USRQUOTA */\ ++ 1 /* GRPQUOTA */\ + } + + /* First generic header */ +@@ -28,11 +28,11 @@ struct v2_disk_dqheader { + }; + + /* +- * The following structure defines the format of the disk quota file +- * (as it appears on disk) - the file is a radix tree whose leaves point +- * to blocks of these structures. ++ * The following structure defines the format of the disk quota file in version ++ * 0 - the file is a radix tree whose leaves point to blocks of these ++ * structures. + */ +-struct v2_disk_dqblk { ++struct v2r0_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ +@@ -44,6 +44,20 @@ struct v2_disk_dqblk { + __le64 dqb_itime; /* time limit for excessive inode use */ + }; + ++/* The same structure in quota file version 1 */ ++struct v2r1_disk_dqblk { ++ __le32 dqb_id; /* id this quota applies to */ ++ __le32 dqb_padding; /* padding field */ ++ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ ++ __le64 dqb_isoftlimit; /* preferred inode limit */ ++ __le64 dqb_curinodes; /* current # allocated inodes */ ++ __le64 dqb_bhardlimit; /* absolute limit on disk space */ ++ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ ++ __le64 dqb_curspace; /* current space occupied (in bytes) */ ++ __le64 dqb_btime; /* time limit for excessive disk use */ ++ __le64 dqb_itime; /* time limit for excessive inode use */ ++}; ++ + /* Header with type and version specific information */ + struct v2_disk_dqinfo { + __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ +_ diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch new file mode 100644 index 0000000..e1924a0 --- /dev/null +++ b/lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch @@ -0,0 +1,579 @@ +Index: linux-2.6.22.19/drivers/scsi/Kconfig +=================================================================== +--- linux-2.6.22.19.orig/drivers/scsi/Kconfig ++++ linux-2.6.22.19/drivers/scsi/Kconfig +@@ -76,6 +76,14 @@ config BLK_DEV_SD + In this case, do not compile the driver for your SCSI host adapter + (below) as a module either. + ++config SD_IOSTATS ++ bool "Enable SCSI disk I/O stats" ++ depends on BLK_DEV_SD ++ default y ++ ---help--- ++ This enables SCSI disk I/O stats collection. 
You must also enable
++	  /proc file system support if you want this feature.
++
+ config CHR_DEV_ST
+ 	tristate "SCSI tape support"
+ 	depends on SCSI
+Index: linux-2.6.22.19/drivers/scsi/scsi_proc.c
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.22.19/drivers/scsi/scsi_proc.c
+@@ -40,7 +40,8 @@
+ /* 4K page size, but our output routines, use some slack for overruns */
+ #define PROC_BLOCK_SIZE (3*1024)
+ 
+-static struct proc_dir_entry *proc_scsi;
++struct proc_dir_entry *proc_scsi;
++EXPORT_SYMBOL(proc_scsi);
+ 
+ /* Protect sht->present and sht->proc_dir */
+ static DEFINE_MUTEX(global_host_template_mutex);
+Index: linux-2.6.22.19/drivers/scsi/sd.c
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/sd.c
++++ linux-2.6.22.19/drivers/scsi/sd.c
+@@ -94,6 +94,24 @@ static DEFINE_SPINLOCK(sd_index_lock);
+  * object after last put) */
+ static DEFINE_MUTEX(sd_ref_mutex);
+ 
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++# include <linux/proc_fs.h>
++# include <linux/seq_file.h>
++struct proc_dir_entry *sd_iostats_procdir = NULL;
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;
++
++extern void sd_iostats_init(void);
++extern void sd_iostats_fini(void);
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
++#else
++static inline void sd_iostats_init(void) {}
++static inline void sd_iostats_fini(void) {}
++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
++#endif
++
+ static const char *sd_cache_types[] = {
+ 	"write through", "none", "write back",
+ 	"write back, no read (daft)"
+@@ -498,6 +516,8 @@ static int sd_init_command(struct scsi_c
+ 	 */
+ 	SCpnt->done = sd_rw_intr;
+ 
++	sd_iostats_start_req(SCpnt);
++
+ 	/*
+ 	 * This indicates that the command is ready from our end to be
+ 	 * queued.
+@@ -980,6 +1000,7 @@ static void sd_done(struct scsi_cmnd + break; + } + out: ++ sd_iostats_finish_req(SCpnt); + if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt)) + sd_dif_complete(SCpnt, good_bytes); + +@@ -1666,6 +1687,36 @@ static int sd_probe(struct device *dev) + if (sdp->removable) + gd->flags |= GENHD_FL_REMOVABLE; + ++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) ++ sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL); ++ if (!sdkp->stats) { ++ printk(KERN_WARNING "cannot allocate iostat structure for" ++ "%s\n", gd->disk_name); ++ } else { ++ do_gettimeofday(&sdkp->stats->iostat_timeval); ++ sdkp->stats->iostat_queue_stamp = jiffies; ++ spin_lock_init(&sdkp->stats->iostat_lock); ++ if (sd_iostats_procdir) { ++ struct proc_dir_entry *pde; ++ pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR, ++ sd_iostats_procdir); ++ if (!pde) { ++ printk(KERN_WARNING "Can't create /proc/scsi/" ++ "%s/%s\n", ++ sd_iostats_procdir_name, ++ gd->disk_name); ++ kfree(sdkp->stats); ++ sdkp->stats = NULL; ++ } else { ++ pde->proc_fops = &sd_iostats_proc_fops; ++ pde->data = gd; ++ } ++ } else { ++ kfree(sdkp->stats); ++ sdkp->stats = NULL; ++ } ++ } ++#endif + dev_set_drvdata(dev, sdkp); + add_disk(gd); + +@@ -1709,6 +1760,366 @@ static int sd_remove(struct device *dev) + return 0; + } + ++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) ++static int ++sd_iostats_seq_show(struct seq_file *seq, void *v) ++{ ++ struct timeval now; ++ struct gendisk *disk = seq->private; ++ iostat_stats_t *stats; ++ unsigned long long read_len; ++ unsigned long long read_len_tot; ++ unsigned long read_num; ++ unsigned long read_num_tot; ++ unsigned long long write_len; ++ unsigned long long write_len_tot; ++ unsigned long write_num; ++ unsigned long write_num_tot; ++ int i; ++ int maxi; ++ ++ stats = scsi_disk(disk)->stats; ++ if (stats == NULL) { ++ printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n"); ++ BUG(); ++ } ++ ++ do_gettimeofday(&now); ++ now.tv_sec -= stats->iostat_timeval.tv_sec; ++ now.tv_usec -= stats->iostat_timeval.tv_usec; ++ if (now.tv_usec < 0) { ++ now.tv_usec += 1000000; ++ now.tv_sec--; ++ } ++ ++ /* this sampling races with updates */ ++ seq_printf(seq, "index: %lu snapshot_time: %lu.%06lu\n", ++ (unsigned long) scsi_disk(disk)->index, ++ now.tv_sec, now.tv_usec); ++ ++ for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--) ++ if (stats->iostat_read_histogram[i].iostat_count != 0 || ++ stats->iostat_write_histogram[i].iostat_count != 0) ++ break; ++ maxi = i; ++ ++ seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", ++ "reads", "total", "writes", "total"); ++ ++ read_len_tot = write_len_tot = 0; ++ read_num_tot = write_num_tot = 0; ++ for (i = 0; i <= maxi; i++) { ++ read_len = stats->iostat_read_histogram[i].iostat_size; ++ read_len_tot += read_len; ++ read_num = stats->iostat_read_histogram[i].iostat_count; ++ read_num_tot += read_num; ++ ++ write_len = stats->iostat_write_histogram[i].iostat_size; ++ write_len_tot += write_len; ++ write_num = stats->iostat_write_histogram[i].iostat_count; ++ write_num_tot += write_num; ++ ++ seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", ++ 512<iostat_queue_ticks[i]; ++ if (ticks == 0) ++ continue; ++ percent = stats->iostat_queue_ticks[i] * 100; ++ do_div(percent, stats->iostat_queue_ticks_sum); ++ seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent); ++ } ++ ++ if (stats->iostat_reqs != 0) { ++ unsigned long long aveseek = 0, percent = 0; ++ ++ if (stats->iostat_seeks) { ++ aveseek = 
stats->iostat_seek_sectors; ++ do_div(aveseek, stats->iostat_seeks); ++ percent = stats->iostat_seeks * 100; ++ do_div(percent, stats->iostat_reqs); ++ } ++ ++ seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over " ++ "%llu sectors in ave, %llu%% of all reqs\n", ++ stats->iostat_sectors, stats->iostat_reqs, ++ stats->iostat_seeks, aveseek, percent); ++ } ++ ++ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads", ++ "%%", "writes", "%%"); ++ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { ++ unsigned long read_percent = 0, write_percent = 0; ++ if (stats->iostat_wtime[i] == 0 && ++ stats->iostat_rtime[i] == 0) ++ continue; ++ if (stats->iostat_read_reqs) ++ read_percent = stats->iostat_rtime[i] * 100 / ++ stats->iostat_read_reqs; ++ if (stats->iostat_write_reqs) ++ write_percent = stats->iostat_wtime[i] * 100 / ++ stats->iostat_write_reqs; ++ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n", ++ jiffies_to_msecs(((1UL << i) >> 1) << 1), ++ stats->iostat_rtime[i], read_percent, ++ stats->iostat_wtime[i], write_percent); ++ } ++ ++ seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads", ++ "%%", "writes", "%%"); ++ for (i = 0; i < IOSTAT_NCOUNTERS; i++) { ++ unsigned long read_percent = 0, write_percent = 0; ++ if (stats->iostat_wtime_in_queue[i] == 0 && ++ stats->iostat_rtime_in_queue[i] == 0) ++ continue; ++ if (stats->iostat_read_reqs) ++ read_percent = stats->iostat_rtime_in_queue[i] * 100 / ++ stats->iostat_read_reqs; ++ if (stats->iostat_write_reqs) ++ write_percent = stats->iostat_wtime_in_queue[i] * 100 / ++ stats->iostat_write_reqs; ++ seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n", ++ jiffies_to_msecs(((1UL << i) >> 1) << 1), ++ stats->iostat_rtime_in_queue[i], ++ read_percent, ++ stats->iostat_wtime_in_queue[i], ++ write_percent); ++ } ++ ++ return 0; ++} ++ ++static void * ++sd_iostats_seq_start(struct seq_file *p, loff_t *pos) ++{ ++ return (*pos == 0) ? 
(void *)1 : NULL; ++} ++ ++static void * ++sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return NULL; ++} ++ ++static void ++sd_iostats_seq_stop(struct seq_file *p, void *v) ++{ ++} ++ ++static struct seq_operations sd_iostats_seqops = { ++ .start = sd_iostats_seq_start, ++ .stop = sd_iostats_seq_stop, ++ .next = sd_iostats_seq_next, ++ .show = sd_iostats_seq_show, ++}; ++ ++static int ++sd_iostats_seq_open (struct inode *inode, struct file *file) ++{ ++ int rc; ++ ++ rc = seq_open(file, &sd_iostats_seqops); ++ if (rc != 0) ++ return rc; ++ ++ ((struct seq_file *)file->private_data)->private = PDE(inode)->data; ++ return 0; ++} ++ ++static ssize_t ++sd_iostats_seq_write(struct file *file, const char *buffer, ++ size_t len, loff_t *off) ++{ ++ struct seq_file *seq = file->private_data; ++ struct gendisk *disk = seq->private; ++ iostat_stats_t *stats = scsi_disk(disk)->stats; ++ unsigned long flags; ++ unsigned long qdepth; ++ ++ ++ spin_lock_irqsave (&stats->iostat_lock, flags); ++ qdepth = stats->iostat_queue_depth; ++ memset (stats, 0, offsetof(iostat_stats_t, iostat_lock)); ++ do_gettimeofday(&stats->iostat_timeval); ++ stats->iostat_queue_stamp = jiffies; ++ stats->iostat_queue_depth = qdepth; ++ spin_unlock_irqrestore (&stats->iostat_lock, flags); ++ ++ return len; ++} ++ ++static struct file_operations sd_iostats_proc_fops = { ++ .owner = THIS_MODULE, ++ .open = sd_iostats_seq_open, ++ .read = seq_read, ++ .write = sd_iostats_seq_write, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++extern struct proc_dir_entry *proc_scsi; ++ ++void ++sd_iostats_init(void) ++{ ++ if (proc_scsi == NULL) { ++ printk(KERN_WARNING "No access to sd iostats: " ++ "proc_scsi is NULL\n"); ++ return; ++ } ++ ++ sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name, ++ S_IFDIR | S_IRUGO | S_IXUGO, ++ proc_scsi); ++ if (sd_iostats_procdir == NULL) { ++ printk(KERN_WARNING "No access to sd iostats: " ++ "can't create /proc/scsi/%s\n", sd_iostats_procdir_name); ++ return; ++ } ++} ++ ++void sd_iostats_fini(void) ++{ ++ if (proc_scsi != NULL && sd_iostats_procdir != NULL) ++ remove_proc_entry(sd_iostats_procdir_name, proc_scsi); ++ ++ sd_iostats_procdir = NULL; ++} ++ ++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) ++{ ++ struct request *rq = SCpnt->request; ++ iostat_stats_t *stats; ++ unsigned long *tcounter; ++ int tbucket; ++ int tmp; ++ unsigned long irqflags; ++ unsigned long i; ++ ++ stats = scsi_disk(rq->rq_disk)->stats; ++ if (stats == NULL) ++ return; ++ ++ tmp = jiffies - rq->start_time; ++ for (tbucket = 0; tmp > 1; tbucket++) ++ tmp >>= 1; ++ if (tbucket >= IOSTAT_NCOUNTERS) ++ tbucket = IOSTAT_NCOUNTERS - 1; ++ //printk("%u ticks in D to %u\n", jiffies - rq->start_time, tbucket); ++ ++ tcounter = rq_data_dir(rq) == WRITE ? ++ &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket]; ++ ++ spin_lock_irqsave(&stats->iostat_lock, irqflags); ++ ++ /* update delay stats */ ++ (*tcounter)++; ++ ++ /* update queue depth stats */ ++ i = stats->iostat_queue_depth; ++ if (i >= IOSTAT_NCOUNTERS) ++ i = IOSTAT_NCOUNTERS - 1; ++ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp; ++ BUG_ON(stats->iostat_queue_depth == 0); ++ stats->iostat_queue_depth--; ++ ++ /* update seek stats. 
XXX: not sure about nr_sectors */ ++ stats->iostat_sectors += rq->nr_sectors; ++ stats->iostat_reqs++; ++ if (rq->sector != stats->iostat_next_sector) { ++ stats->iostat_seek_sectors += ++ rq->sector > stats->iostat_next_sector ? ++ rq->sector - stats->iostat_next_sector : ++ stats->iostat_next_sector - rq->sector; ++ stats->iostat_seeks++; ++ } ++ stats->iostat_next_sector = rq->sector + rq->nr_sectors; ++ ++ stats->iostat_queue_stamp = jiffies; ++ ++ spin_unlock_irqrestore(&stats->iostat_lock, irqflags); ++} ++ ++void sd_iostats_start_req(struct scsi_cmnd *SCpnt) ++{ ++ struct request *rq = SCpnt->request; ++ iostat_stats_t *stats; ++ iostat_counter_t *counter; ++ int bucket; ++ int tbucket; ++ int tmp; ++ unsigned long irqflags; ++ unsigned long i; ++ int nsect; ++ ++ stats = scsi_disk(rq->rq_disk)->stats; ++ if (stats == NULL) ++ return; ++ ++ nsect = scsi_bufflen(SCpnt) >> 9; ++ for (bucket = 0, tmp = nsect; tmp > 1; bucket++) ++ tmp >>= 1; ++ ++ if (bucket >= IOSTAT_NCOUNTERS) { ++ printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect); ++ BUG(); ++ } ++ ++ counter = rq_data_dir(rq) == WRITE ? ++ &stats->iostat_write_histogram[bucket] : ++ &stats->iostat_read_histogram[bucket]; ++ ++ tmp = jiffies - rq->start_time; ++ for (tbucket = 0; tmp > 1; tbucket++) ++ tmp >>= 1; ++ if (tbucket >= IOSTAT_NCOUNTERS) ++ tbucket = IOSTAT_NCOUNTERS - 1; ++ //printk("%u ticks in Q to %u\n", jiffies - rq->start_time, tbucket); ++ ++ /* an ugly hack to know exact processing time. the right ++ * solution is to add one more field to struct request ++ * hopefully it will break nothing ... */ ++ rq->start_time = jiffies; ++ ++ spin_lock_irqsave(&stats->iostat_lock, irqflags); ++ ++ /* update queue depth stats */ ++ i = stats->iostat_queue_depth; ++ if (i >= IOSTAT_NCOUNTERS) ++ i = IOSTAT_NCOUNTERS - 1; ++ stats->iostat_queue_ticks[i] += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_ticks_sum += jiffies - stats->iostat_queue_stamp; ++ stats->iostat_queue_depth++; ++ ++ /* update delay stats */ ++ if (rq_data_dir(rq) == WRITE) { ++ stats->iostat_wtime_in_queue[tbucket]++; ++ stats->iostat_write_reqs++; ++ } else { ++ stats->iostat_rtime_in_queue[tbucket]++; ++ stats->iostat_read_reqs++; ++ } ++ ++ /* update size stats */ ++ counter->iostat_size += nsect; ++ counter->iostat_count++; ++ ++ stats->iostat_queue_stamp = jiffies; ++ ++ spin_unlock_irqrestore(&stats->iostat_lock, irqflags); ++} ++#endif ++ + /** + * scsi_disk_release - Called to free the scsi_disk structure + * @cdev: pointer to embedded class device +@@ -1727,10 +2138,16 @@ static void scsi_disk_release(struct cla + idr_remove(&sd_index_idr, sdkp->index); + spin_unlock(&sd_index_lock); + ++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) ++ if (sdkp->stats) { ++ remove_proc_entry(disk->disk_name, sd_iostats_procdir); ++ kfree(sdkp->stats); ++ sdkp->stats = NULL; ++ } ++#endif + disk->private_data = NULL; + put_disk(disk); + put_device(&sdkp->device->sdev_gendev); +- + kfree(sdkp); + } + +@@ -1845,6 +2262,8 @@ static int __init init_sd(void) + if (!majors) + return -ENODEV; + ++ sd_iostats_init(); ++ + err = class_register(&sd_disk_class); + if (err) + goto err_out; +@@ -1860,6 +2279,7 @@ err_out_class: + err_out: + for (i = 0; i < SD_MAJORS; i++) + unregister_blkdev(sd_major(i), "sd"); ++ sd_iostats_fini(); + return err; + } + +Index: linux-2.6.22.19/include/scsi/sd.h +=================================================================== +--- linux-2.6.22.19.orig/drivers/scsi/sd.h ++++ 
linux-2.6.22.19/drivers/scsi/sd.h +@@ -31,6 +31,46 @@ + */ + #define SD_BUF_SIZE 512 + ++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) ++typedef struct { ++ unsigned long long iostat_size; ++ unsigned long long iostat_count; ++} iostat_counter_t; ++ ++#define IOSTAT_NCOUNTERS 16 ++typedef struct { ++ iostat_counter_t iostat_read_histogram[IOSTAT_NCOUNTERS]; ++ iostat_counter_t iostat_write_histogram[IOSTAT_NCOUNTERS]; ++ struct timeval iostat_timeval; ++ ++ /* queue depth: how well the pipe is filled up */ ++ unsigned long long iostat_queue_ticks[IOSTAT_NCOUNTERS]; ++ unsigned long long iostat_queue_ticks_sum; ++ unsigned long iostat_queue_depth; ++ unsigned long iostat_queue_stamp; ++ ++ /* seeks: how linear the traffic is */ ++ unsigned long long iostat_next_sector; ++ unsigned long long iostat_seek_sectors; ++ unsigned long long iostat_seeks; ++ unsigned long long iostat_sectors; ++ unsigned long long iostat_reqs; ++ unsigned long iostat_read_reqs; ++ unsigned long iostat_write_reqs; ++ ++ /* process time: how long it takes to process requests */ ++ unsigned long iostat_rtime[IOSTAT_NCOUNTERS]; ++ unsigned long iostat_wtime[IOSTAT_NCOUNTERS]; ++ ++ /* queue time: how long process spent in elevator's queue */ ++ unsigned long iostat_rtime_in_queue[IOSTAT_NCOUNTERS]; ++ unsigned long iostat_wtime_in_queue[IOSTAT_NCOUNTERS]; ++ ++ /* must be the last field, as it's used to know size to be memset'ed */ ++ spinlock_t iostat_lock; ++} ____cacheline_aligned_in_smp iostat_stats_t; ++#endif ++ + struct scsi_disk { + struct scsi_driver *driver; /* always &sd_template */ + struct scsi_device *device; +@@ -44,6 +84,9 @@ struct scsi_disk { + unsigned WCE : 1; /* state of disk WCE bit */ + unsigned RCD : 1; /* state of disk RCD bit, unused */ + unsigned DPOFUA : 1; /* state of disk DPOFUA bit */ ++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS)) ++ iostat_stats_t *stats; /* scsi disk statistics */ ++#endif + }; + #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev) + -- 1.8.3.1
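
For illustration only, and not part of the patch series above: a minimal user-space sketch of how the per-disk histograms added by the sd_iostats patch might be consumed. It assumes a kernel built with CONFIG_SD_IOSTATS=y and CONFIG_PROC_FS=y, so that /proc/scsi/sd_iostats/<disk> exists as created in sd_probe(); the disk name "sda" and the "reset" argument are made-up examples. Per sd_iostats_seq_write() above, writing anything to the proc file zeroes the counters.

/*
 * sd_iostats_dump.c - dump and optionally reset the per-disk I/O
 * histogram exposed by the sd_iostats patch.  Illustrative sketch,
 * not part of the patch series.
 */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	const char *disk = (argc > 1) ? argv[1] : "sda";	/* example disk */
	char path[128];
	char line[256];
	FILE *fp;

	snprintf(path, sizeof(path), "/proc/scsi/sd_iostats/%s", disk);

	/* First line looks like: "index: N snapshot_time: sec.usec",
	 * followed by the size, queue-depth, seek and latency tables. */
	fp = fopen(path, "r");
	if (fp == NULL) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), fp) != NULL)
		fputs(line, stdout);
	fclose(fp);

	if (argc > 2 && strcmp(argv[2], "reset") == 0) {
		/* Any write clears the counters (sd_iostats_seq_write). */
		fp = fopen(path, "w");
		if (fp == NULL) {
			perror(path);
			return 1;
		}
		fputs("0\n", fp);
		fclose(fp);
	}
	return 0;
}

Compile with any C compiler, e.g. "cc -o sd_iostats_dump sd_iostats_dump.c", then run "./sd_iostats_dump sda" to dump the histogram or "./sd_iostats_dump sda reset" to dump and clear it.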