Whamcloud - gitweb
Branch b1_8
author yangsheng <yangsheng>
Fri, 22 May 2009 05:20:49 +0000 (05:20 +0000)
committer yangsheng <yangsheng>
Fri, 22 May 2009 05:20:49 +0000 (05:20 +0000)
b=18668

i=shadow, johann

SLES11 kernel patches.

lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch [new file with mode: 0644]
lustre/kernel_patches/patches/export-2.6.27-vanilla.patch [new file with mode: 0644]
lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch [new file with mode: 0644]
lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch [new file with mode: 0644]
lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch [new file with mode: 0644]
lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch [new file with mode: 0644]

diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.27-vanilla.patch
new file mode 100644 (file)
index 0000000..0ea4b21
--- /dev/null
@@ -0,0 +1,145 @@
+Index: linux-2.6.22.5/block/blk-core.c
+===================================================================
+--- linux-2.6.22.5.orig/block/blk-core.c
++++ linux-2.6.22.5/block/blk-core.c
+@@ -3101,6 +3101,8 @@ static inline int should_fail_request(st
+ #endif /* CONFIG_FAIL_MAKE_REQUEST */
++int dev_check_rdonly(struct block_device *bdev);
++
+ /*
+  * Check whether this bio extends beyond the end of the device.
+  */
+@@ -3185,6 +3187,12 @@ end_io:
+               if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+                       goto end_io;
++              /* cfs dev_rdonly check: end writes (whatever flag bits) w/o I/O */
++              if (bio_data_dir(bio) == WRITE &&
++                              dev_check_rdonly(bio->bi_bdev)) {
++                      bio_endio(bio, 0);
++                      break;
++              }
+               if (should_fail_request(bio))
+                       goto end_io;
+@@ -3850,6 +3858,91 @@ void swap_io_context(struct io_context *
+ }
+ EXPORT_SYMBOL(kblockd_flush_work);
++ /*
++ * Debug code for turning block devices "read-only" (will discard writes
++ * silently).  This is for filesystem crash/recovery testing.
++ */
++struct deventry {
++      dev_t dev;
++      struct deventry *next;
++};
++
++static struct deventry *devlist = NULL;
++static DEFINE_SPINLOCK(devlock);
++
++int dev_check_rdonly(struct block_device *bdev) 
++{
++      struct deventry *cur;
++      if (!bdev) return 0;
++      spin_lock(&devlock);
++      cur = devlist;
++      while(cur) {
++              if (bdev->bd_dev == cur->dev) {
++                      spin_unlock(&devlock);
++                      return 1;
++      }
++              cur = cur->next;
++      }
++      spin_unlock(&devlock);
++      return 0;
++}
++
++void dev_set_rdonly(struct block_device *bdev)
++{
++      struct deventry *newdev, *cur;
++
++      if (!bdev) 
++              return;
++      newdev = kmalloc(sizeof(struct deventry), GFP_KERNEL);
++      if (!newdev) 
++              return;
++      
++      spin_lock(&devlock);
++      cur = devlist;
++      while(cur) {
++              if (bdev->bd_dev == cur->dev) {
++                      spin_unlock(&devlock);
++                      kfree(newdev);
++                      return;
++              }
++              cur = cur->next;
++      }
++      newdev->dev = bdev->bd_dev;
++      newdev->next = devlist;
++      devlist = newdev;
++      spin_unlock(&devlock);
++      printk(KERN_WARNING "Turning device %s (%#x) read-only\n",
++             bdev->bd_disk ? bdev->bd_disk->disk_name : "", bdev->bd_dev);
++}
++
++void dev_clear_rdonly(struct block_device *bdev) 
++{
++      struct deventry *cur, *last = NULL;
++      if (!bdev) return;
++      spin_lock(&devlock);
++      cur = devlist;
++      while(cur) {
++              if (bdev->bd_dev == cur->dev) {
++                      if (last) 
++                              last->next = cur->next;
++                      else
++                              devlist = cur->next;
++                      spin_unlock(&devlock);
++                      kfree(cur);
++                      printk(KERN_WARNING "Removing read-only on %s (%#x)\n",
++                             bdev->bd_disk ? bdev->bd_disk->disk_name :
++                                             "unknown block", bdev->bd_dev);
++                      return;
++              }
++              last = cur;
++              cur = cur->next;
++      }
++      spin_unlock(&devlock);
++}
++
++EXPORT_SYMBOL(dev_set_rdonly);
++EXPORT_SYMBOL(dev_clear_rdonly);
++EXPORT_SYMBOL(dev_check_rdonly);
+ int __init blk_dev_init(void)
+ {
+       int i;
+Index: linux-2.6.22.5/fs/block_dev.c
+===================================================================
+--- linux-2.6.22.5.orig/fs/block_dev.c
++++ linux-2.6.22.5/fs/block_dev.c
+@@ -1294,6 +1294,7 @@ static int __blkdev_put(struct block_dev
+               if (bdev != bdev->bd_contains)
+                       victim = bdev->bd_contains;
+               bdev->bd_contains = NULL;
++              dev_clear_rdonly(bdev);
+       }
+       unlock_kernel();
+       mutex_unlock(&bdev->bd_mutex);
+Index: linux-2.6.22.5/include/linux/fs.h
+===================================================================
+--- linux-2.6.22.5.orig/include/linux/fs.h
++++ linux-2.6.22.5/include/linux/fs.h
+@@ -1744,6 +1744,10 @@ struct bio;
+ extern void submit_bio(int, struct bio *);
+ extern int bdev_read_only(struct block_device *);
+ #endif
++#define HAVE_CLEAR_RDONLY_ON_PUT
++extern void dev_set_rdonly(struct block_device *bdev);
++extern int dev_check_rdonly(struct block_device *bdev);
++extern void dev_clear_rdonly(struct block_device *bdev);
+ extern int set_blocksize(struct block_device *, int);
+ extern int sb_set_blocksize(struct super_block *, int);
+ extern int sb_min_blocksize(struct super_block *, int);
diff --git a/lustre/kernel_patches/patches/export-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/export-2.6.27-vanilla.patch
new file mode 100644 (file)
index 0000000..0a979c3
--- /dev/null
@@ -0,0 +1,37 @@
+Index: linux-2.6/fs/jbd2/journal.c
+===================================================================
+--- linux-2.6.orig/fs/jbd2/journal.c   2006-07-15 16:13:50.000000000 +0800
++++ linux-2.6/fs/jbd2/journal.c        2006-07-15 16:22:04.000000000 +0800
+@@ -74,6 +74,7 @@ EXPORT_SYMBOL(journal_abort);
+       spin_unlock(&journal->j_state_lock);
+       return ret;
+ }
++EXPORT_SYMBOL(jbd2_log_start_commit);
+ /*
+  * Force and wait upon a commit if the calling process is not within
+Index: linux-2.6/kernel/sys.c
+===================================================================
+--- linux-2.6.orig/kernel/sys.c        2006-07-15 16:13:50.000000000 +0800
++++ linux-2.6/kernel/sys.c     2006-07-15 16:22:04.000000000 +0800
+@@ -74,6 +74,8 @@
+ EXPORT_SYMBOL(in_egroup_p);
+ DECLARE_RWSEM(uts_sem);
++
++EXPORT_SYMBOL(uts_sem);
+ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
+ {
+Index: linux-2.6/security/security.c
+===================================================================
+--- linux-2.6.orig/security/security.c
++++ linux-2.6/security/security.c
+@@ -74,6 +74,7 @@
+               return 0;
+       return security_ops->inode_unlink(dir, dentry, mnt);
+ }
++EXPORT_SYMBOL(security_inode_unlink);
+ int security_inode_symlink(struct inode *dir, struct dentry *dentry,
+                          struct vfsmount *mnt, const char *old_name)
diff --git a/lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/export-show_task-2.6.27-vanilla.patch
new file mode 100644 (file)
index 0000000..d9b287e
--- /dev/null
@@ -0,0 +1,12 @@
+Index: linux-2.6/kernel/sched.c
+===================================================================
+--- linux-2.6.orig/kernel/sched.c      2006-07-15 11:51:46.000000000 +0800
++++ linux-2.6/kernel/sched.c   2006-07-15 16:24:35.000000000 +0800
+@@ -4652,6 +4652,7 @@ static inline struct task_struct *younge
+       show_stack(p, NULL);
+ }
++EXPORT_SYMBOL(sched_show_task);
+ void show_state_filter(unsigned long state_filter)
+ {
diff --git a/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch b/lustre/kernel_patches/patches/md-mmp-unplug-dev-sles11.patch
new file mode 100644 (file)
index 0000000..f9f84b1
--- /dev/null
@@ -0,0 +1,22 @@
+Index: linux-2.6.16.60-0.33/drivers/md/raid5.c
+===================================================================
+--- linux-2.6.16.60-0.33.orig/drivers/md/raid5.c
++++ linux-2.6.16.60-0.33/drivers/md/raid5.c
+@@ -900,6 +900,8 @@ static int add_stripe_bio(struct stripe_
+               bi->bi_next = *bip;
+       *bip = bi;
+       bi->bi_phys_segments++;
++      if (bio_sync(bi) && !forwrite)
++              clear_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); /* force to read from disk. */
+       spin_unlock_irq(&conf->device_lock);
+       spin_unlock(&sh->lock);
+@@ -1617,6 +1619,8 @@ static int make_request (request_queue_t
+               bi->bi_end_io(bi, bytes, 0);
+       }
++      if (bio_sync(bi))
++              raid5_unplug_device(q);
+       return 0;
+ }
diff --git a/lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch b/lustre/kernel_patches/patches/quota-support-64-bit-quota-format.patch
new file mode 100644 (file)
index 0000000..14fe9a8
--- /dev/null
@@ -0,0 +1,282 @@
+From: Jan Kara <jack@suse.cz>
+
+Implement conversion functions for new version (version 1) of quota format
+which supports 64-bit block and inode limits and 64-bit inode usage.  The
+original implementation has been written by Andrew Perepechko.
+
+Signed-off-by: Andrew Perepechko <andrew.perepechko@sun.com>
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+---
+
+ fs/quota_v2.c   |  140 ++++++++++++++++++++++++++++++++++++----------
+ fs/quotaio_v2.h |   26 ++++++--
+ 2 files changed, 132 insertions(+), 34 deletions(-)
+
+diff -puN fs/quota_v2.c~quota-support-64-bit-quota-format fs/quota_v2.c
+--- a/fs/quota_v2.c~quota-support-64-bit-quota-format
++++ a/fs/quota_v2.c
+@@ -23,14 +23,24 @@ MODULE_LICENSE("GPL");
+ #define __QUOTA_V2_PARANOIA
+-static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+-static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+-static int v2_is_id(void *dp, struct dquot *dquot);
+-
+-static struct qtree_fmt_operations v2_qtree_ops = {
+-      .mem2disk_dqblk = v2_mem2diskdqb,
+-      .disk2mem_dqblk = v2_disk2memdqb,
+-      .is_id = v2_is_id,
++static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot);
++static void v2r0_disk2memdqb(struct dquot *dquot, void *dp);
++static int v2r0_is_id(void *dp, struct dquot *dquot);
++
++static struct qtree_fmt_operations v2r0_qtree_ops = {
++      .mem2disk_dqblk = v2r0_mem2diskdqb,
++      .disk2mem_dqblk = v2r0_disk2memdqb,
++      .is_id = v2r0_is_id,
++};
++
++static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
++static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
++static int v2r1_is_id(void *dp, struct dquot *dquot);
++
++static struct qtree_fmt_operations v2r1_qtree_ops = {
++      .mem2disk_dqblk = v2r1_mem2diskdqb,
++      .disk2mem_dqblk = v2r1_disk2memdqb,
++      .is_id = v2r1_is_id,
+ };
+ #define QUOTABLOCK_BITS 10
+@@ -46,8 +56,7 @@ static inline qsize_t v2_qbtos(qsize_t b
+       return blocks << QUOTABLOCK_BITS;
+ }
+-/* Check whether given file is really vfsv0 quotafile */
+-static int v2_check_quota_file(struct super_block *sb, int type)
++static int v2_check_quota_file_header(struct super_block *sb, int type)
+ {
+       struct v2_disk_dqheader dqhead;
+       ssize_t size;
+@@ -58,12 +67,20 @@ static int v2_check_quota_file(struct su
+       if (size != sizeof(struct v2_disk_dqheader)) {
+               printk("quota_v2: failed read expected=%zd got=%zd\n",
+                       sizeof(struct v2_disk_dqheader), size);
+-              return 0;
++              return -EIO;
+       }
+-      if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] ||
+-          le32_to_cpu(dqhead.dqh_version) != quota_versions[type])
+-              return 0;
+-      return 1;
++      if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type])
++              return -ENOENT;
++      if (le32_to_cpu(dqhead.dqh_version) > quota_versions[type])
++              return -EOPNOTSUPP;
++      return le32_to_cpu(dqhead.dqh_version);
++}
++
++
++/* Check whether given file is a vfsv0/vfsv1 quotafile (any known version) */
++static int v2_check_quota_file(struct super_block *sb, int type)
++{
++      return v2_check_quota_file_header(sb, type) >= 0;
+ }
+ /* Read information header from quota file */
+@@ -73,7 +90,13 @@ static int v2_read_file_info(struct supe
+       struct mem_dqinfo *info = sb_dqinfo(sb, type);
+       struct qtree_mem_dqinfo *qinfo;
+       ssize_t size;
++      int version = v2_check_quota_file_header(sb, type);
++      if (version < 0) {
++              printk(KERN_WARNING "Cannot identify quota file version on "
++                     "device %s: %d\n", sb->s_id, version);
++              return -1;
++      }
+       size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+              sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+       if (size != sizeof(struct v2_disk_dqinfo)) {
+@@ -88,9 +111,14 @@ static int v2_read_file_info(struct supe
+               return -1;
+       }
+       qinfo = info->dqi_priv;
+-      /* limits are stored as unsigned 32-bit data */
+-      info->dqi_maxblimit = 0xffffffff;
+-      info->dqi_maxilimit = 0xffffffff;
++      if (version == 0) {
++              /* limits are stored as unsigned 32-bit data */
++              info->dqi_maxblimit = 0xffffffff;
++              info->dqi_maxilimit = 0xffffffff;
++      } else {
++              info->dqi_maxblimit = 0x7fffffffffffffffULL;
++              info->dqi_maxilimit = 0x7fffffffffffffffULL;
++      }
+       info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+       info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+       info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
+@@ -102,8 +130,13 @@ static int v2_read_file_info(struct supe
+       qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+       qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+       qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+-      qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+-      qinfo->dqi_ops = &v2_qtree_ops;
++      if (version == 0) {
++              qinfo->dqi_entry_size = sizeof(struct v2r0_disk_dqblk);
++              qinfo->dqi_ops = &v2r0_qtree_ops;
++      } else {
++              qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk);
++              qinfo->dqi_ops = &v2r1_qtree_ops;
++      }
+       return 0;
+ }
+@@ -134,9 +167,9 @@ static int v2_write_file_info(struct sup
+       return 0;
+ }
+-static void v2_disk2memdqb(struct dquot *dquot, void *dp)
++static void v2r0_disk2memdqb(struct dquot *dquot, void *dp)
+ {
+-      struct v2_disk_dqblk *d = dp, empty;
++      struct v2r0_disk_dqblk *d = dp, empty;
+       struct mem_dqblk *m = &dquot->dq_dqb;
+       m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
+@@ -148,15 +181,15 @@ static void v2_disk2memdqb(struct dquot 
+       m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
+       m->dqb_btime = le64_to_cpu(d->dqb_btime);
+       /* We need to escape back all-zero structure */
+-      memset(&empty, 0, sizeof(struct v2_disk_dqblk));
++      memset(&empty, 0, sizeof(struct v2r0_disk_dqblk));
+       empty.dqb_itime = cpu_to_le64(1);
+-      if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
++      if (!memcmp(&empty, dp, sizeof(struct v2r0_disk_dqblk)))
+               m->dqb_itime = 0;
+ }
+-static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
++static void v2r0_mem2diskdqb(void *dp, struct dquot *dquot)
+ {
+-      struct v2_disk_dqblk *d = dp;
++      struct v2r0_disk_dqblk *d = dp;
+       struct mem_dqblk *m = &dquot->dq_dqb;
+       struct qtree_mem_dqinfo *info =
+                       sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+@@ -174,9 +207,60 @@ static void v2_mem2diskdqb(void *dp, str
+               d->dqb_itime = cpu_to_le64(1);
+ }
+-static int v2_is_id(void *dp, struct dquot *dquot)
++static int v2r0_is_id(void *dp, struct dquot *dquot)
++{
++      struct v2r0_disk_dqblk *d = dp;
++      struct qtree_mem_dqinfo *info =
++                      sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
++
++      if (qtree_entry_unused(info, dp))
++              return 0;
++      return le32_to_cpu(d->dqb_id) == dquot->dq_id;
++}
++
++static void v2r1_disk2memdqb(struct dquot *dquot, void *dp)
++{
++      struct v2r1_disk_dqblk *d = dp, empty;
++      struct mem_dqblk *m = &dquot->dq_dqb;
++
++      m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit);
++      m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit);
++      m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes);
++      m->dqb_itime = le64_to_cpu(d->dqb_itime);
++      m->dqb_bhardlimit = v2_qbtos(le64_to_cpu(d->dqb_bhardlimit));
++      m->dqb_bsoftlimit = v2_qbtos(le64_to_cpu(d->dqb_bsoftlimit));
++      m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
++      m->dqb_btime = le64_to_cpu(d->dqb_btime);
++      /* We need to escape back all-zero structure */
++      memset(&empty, 0, sizeof(struct v2r1_disk_dqblk));
++      empty.dqb_itime = cpu_to_le64(1);
++      if (!memcmp(&empty, dp, sizeof(struct v2r1_disk_dqblk)))
++              m->dqb_itime = 0;
++}
++
++static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot)
++{
++      struct v2r1_disk_dqblk *d = dp;
++      struct mem_dqblk *m = &dquot->dq_dqb;
++      struct qtree_mem_dqinfo *info =
++                      sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
++
++      d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit);
++      d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit);
++      d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes);
++      d->dqb_itime = cpu_to_le64(m->dqb_itime);
++      d->dqb_bhardlimit = cpu_to_le64(v2_stoqb(m->dqb_bhardlimit));
++      d->dqb_bsoftlimit = cpu_to_le64(v2_stoqb(m->dqb_bsoftlimit));
++      d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
++      d->dqb_btime = cpu_to_le64(m->dqb_btime);
++      d->dqb_id = cpu_to_le32(dquot->dq_id);
++      if (qtree_entry_unused(info, dp))
++              d->dqb_itime = cpu_to_le64(1);
++}
++
++static int v2r1_is_id(void *dp, struct dquot *dquot)
+ {
+-      struct v2_disk_dqblk *d = dp;
++      struct v2r1_disk_dqblk *d = dp;
+       struct qtree_mem_dqinfo *info =
+                       sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+diff -puN fs/quotaio_v2.h~quota-support-64-bit-quota-format fs/quotaio_v2.h
+--- a/fs/quotaio_v2.h~quota-support-64-bit-quota-format
++++ a/fs/quotaio_v2.h
+@@ -17,8 +17,8 @@
+ }
+ #define V2_INITQVERSIONS {\
+-      0,              /* USRQUOTA */\
+-      0               /* GRPQUOTA */\
++      1,              /* USRQUOTA */\
++      1               /* GRPQUOTA */\
+ }
+ /* First generic header */
+@@ -28,11 +28,11 @@ struct v2_disk_dqheader {
+ };
+ /*
+- * The following structure defines the format of the disk quota file
+- * (as it appears on disk) - the file is a radix tree whose leaves point
+- * to blocks of these structures.
++ * The following structure defines the format of the disk quota file in version
++ * 0 - the file is a radix tree whose leaves point to blocks of these
++ * structures.
+  */
+-struct v2_disk_dqblk {
++struct v2r0_disk_dqblk {
+       __le32 dqb_id;          /* id this quota applies to */
+       __le32 dqb_ihardlimit;  /* absolute limit on allocated inodes */
+       __le32 dqb_isoftlimit;  /* preferred inode limit */
+@@ -44,6 +44,20 @@ struct v2_disk_dqblk {
+       __le64 dqb_itime;       /* time limit for excessive inode use */
+ };
++/* The same structure in quota file version 1 */
++struct v2r1_disk_dqblk {
++      __le32 dqb_id;          /* id this quota applies to */
++      __le32 dqb_padding;     /* padding field */
++      __le64 dqb_ihardlimit;  /* absolute limit on allocated inodes */
++      __le64 dqb_isoftlimit;  /* preferred inode limit */
++      __le64 dqb_curinodes;   /* current # allocated inodes */
++      __le64 dqb_bhardlimit;  /* absolute limit on disk space */
++      __le64 dqb_bsoftlimit;  /* preferred limit on disk space */
++      __le64 dqb_curspace;    /* current space occupied (in bytes) */
++      __le64 dqb_btime;       /* time limit for excessive disk use */
++      __le64 dqb_itime;       /* time limit for excessive inode use */
++};
++
+ /* Header with type and version specific information */
+ struct v2_disk_dqinfo {
+       __le32 dqi_bgrace;      /* Time before block soft limit becomes hard limit */
+_
diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch b/lustre/kernel_patches/patches/sd_iostats-2.6.27-vanilla.patch
new file mode 100644 (file)
index 0000000..e1924a0
--- /dev/null
@@ -0,0 +1,579 @@
+Index: linux-2.6.22.19/drivers/scsi/Kconfig
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/Kconfig
++++ linux-2.6.22.19/drivers/scsi/Kconfig
+@@ -76,6 +76,14 @@ config BLK_DEV_SD
+         In this case, do not compile the driver for your SCSI host adapter
+         (below) as a module either.
++config SD_IOSTATS
++   bool "Enable SCSI disk I/O stats"
++   depends on BLK_DEV_SD
++   default y
++   ---help---
++     This enables SCSI disk I/O stats collection.  You must also enable
++     /proc file system support if you want this feature.
++
+ config CHR_DEV_ST
+       tristate "SCSI tape support"
+       depends on SCSI
+Index: linux-2.6.22.19/drivers/scsi/scsi_proc.c
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/scsi_proc.c
++++ linux-2.6.22.19/drivers/scsi/scsi_proc.c
+@@ -40,7 +40,8 @@
+ /* 4K page size, but our output routines, use some slack for overruns */
+ #define PROC_BLOCK_SIZE (3*1024)
+-static struct proc_dir_entry *proc_scsi;
++struct proc_dir_entry *proc_scsi;
++EXPORT_SYMBOL(proc_scsi);
+ /* Protect sht->present and sht->proc_dir */
+ static DEFINE_MUTEX(global_host_template_mutex);
+Index: linux-2.6.22.19/drivers/scsi/sd.c
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/sd.c
++++ linux-2.6.22.19/drivers/scsi/sd.c
+@@ -94,6 +94,24 @@ static DEFINE_SPINLOCK(sd_index_lock);
+  * object after last put) */
+ static DEFINE_MUTEX(sd_ref_mutex);
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++# include <linux/proc_fs.h>
++# include <linux/seq_file.h>
++struct proc_dir_entry *sd_iostats_procdir = NULL;
++char sd_iostats_procdir_name[] = "sd_iostats";
++static struct file_operations sd_iostats_proc_fops;
++
++extern void sd_iostats_init(void);
++extern void sd_iostats_fini(void);
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt);
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt);
++#else
++static inline void sd_iostats_init(void) {}
++static inline void sd_iostats_fini(void) {}
++static inline void sd_iostats_start_req(struct scsi_cmnd *SCpnt) {}
++static inline void sd_iostats_finish_req(struct scsi_cmnd *SCpnt) {}
++#endif
++
+ static const char *sd_cache_types[] = {
+       "write through", "none", "write back",
+       "write back, no read (daft)"
+@@ -498,6 +516,8 @@ static int sd_init_command(struct scsi_c
+        */
+       SCpnt->done = sd_rw_intr;
++      sd_iostats_start_req(SCpnt);
++
+       /*
+        * This indicates that the command is ready from our end to be
+        * queued.
+@@ -980,6 +1000,7 @@ static void sd_done(struct scsi_cmnd 
+               break;
+       }
+  out:
++      sd_iostats_finish_req(SCpnt);
+       if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt))
+               sd_dif_complete(SCpnt, good_bytes);
+@@ -1666,6 +1687,36 @@ static int sd_probe(struct device *dev)
+       if (sdp->removable)
+               gd->flags |= GENHD_FL_REMOVABLE;
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      sdkp->stats = kzalloc(sizeof(iostat_stats_t), GFP_KERNEL);
++      if (!sdkp->stats) {
++              printk(KERN_WARNING "cannot allocate iostat structure for"
++                                  "%s\n", gd->disk_name);
++      } else {
++              do_gettimeofday(&sdkp->stats->iostat_timeval);
++              sdkp->stats->iostat_queue_stamp = jiffies;
++              spin_lock_init(&sdkp->stats->iostat_lock);
++              if (sd_iostats_procdir) {
++                      struct proc_dir_entry *pde;
++                      pde = create_proc_entry(gd->disk_name, S_IRUGO | S_IWUSR,
++                                              sd_iostats_procdir);
++                      if (!pde) {
++                              printk(KERN_WARNING "Can't create /proc/scsi/"
++                                                  "%s/%s\n",
++                                                  sd_iostats_procdir_name,
++                                                  gd->disk_name);
++                              kfree(sdkp->stats);
++                              sdkp->stats = NULL;
++                      } else {
++                              pde->proc_fops = &sd_iostats_proc_fops;
++                              pde->data = gd;
++                      }
++              } else {
++                      kfree(sdkp->stats);
++                      sdkp->stats = NULL;
++              }
++      }
++#endif
+       dev_set_drvdata(dev, sdkp);
+       add_disk(gd);
+@@ -1709,6 +1760,366 @@ static int sd_remove(struct device *dev)
+       return 0;
+ }
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++static int
++sd_iostats_seq_show(struct seq_file *seq, void *v)
++{
++      struct timeval     now;
++      struct gendisk *disk = seq->private;
++      iostat_stats_t    *stats;
++      unsigned long long read_len;
++      unsigned long long read_len_tot;
++      unsigned long      read_num;
++      unsigned long      read_num_tot;
++      unsigned long long write_len;
++      unsigned long long write_len_tot;
++      unsigned long      write_num;
++      unsigned long      write_num_tot;
++      int                i;
++      int                maxi;
++
++      stats = scsi_disk(disk)->stats;
++      if (stats == NULL) {
++              printk(KERN_ERR "sd_iostats_seq_show: NULL stats entry\n");
++              BUG();
++      }
++
++      do_gettimeofday(&now);
++      now.tv_sec -= stats->iostat_timeval.tv_sec;
++      now.tv_usec -= stats->iostat_timeval.tv_usec;
++      if (now.tv_usec < 0) {
++              now.tv_usec += 1000000;
++              now.tv_sec--;
++      }
++
++      /* this sampling races with updates */
++      seq_printf(seq, "index:        %lu   snapshot_time:         %lu.%06lu\n",
++                      (unsigned long) scsi_disk(disk)->index,
++                      now.tv_sec, now.tv_usec);
++
++      for (i = IOSTAT_NCOUNTERS - 1; i > 0; i--)
++              if (stats->iostat_read_histogram[i].iostat_count != 0 ||
++                              stats->iostat_write_histogram[i].iostat_count != 0)
++                      break;
++      maxi = i;
++
++      seq_printf(seq, "%8s %8s %12s %8s %12s\n", "size", 
++                      "reads", "total", "writes", "total");
++
++      read_len_tot = write_len_tot = 0;
++      read_num_tot = write_num_tot = 0;
++      for (i = 0; i <= maxi; i++) {
++              read_len = stats->iostat_read_histogram[i].iostat_size;
++              read_len_tot += read_len;
++              read_num = stats->iostat_read_histogram[i].iostat_count;
++              read_num_tot += read_num;
++
++              write_len = stats->iostat_write_histogram[i].iostat_size;
++              write_len_tot += write_len;
++              write_num = stats->iostat_write_histogram[i].iostat_count;
++              write_num_tot += write_num;
++
++              seq_printf (seq, "%8d %8lu %12llu %8lu %12llu\n", 
++                              512<<i, read_num, read_len, write_num, write_len);
++      }
++
++      seq_printf(seq, "%8s %8lu %12llu %8lu %12llu\n\n", "total",
++                      read_num_tot, read_len_tot, 
++                      write_num_tot, write_len_tot);
++
++      seq_printf(seq, "%8s %8s %8s\n", "qdepth", "ticks", "%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long long ticks, percent;
++              ticks = stats->iostat_queue_ticks[i];
++              if (ticks == 0)
++                      continue;
++              percent = stats->iostat_queue_ticks[i] * 100;
++              do_div(percent, stats->iostat_queue_ticks_sum);
++              seq_printf(seq, "%8d %8llu %8llu\n", i, ticks, percent);
++      }
++
++      if (stats->iostat_reqs != 0) {
++              unsigned long long aveseek = 0, percent = 0;
++
++              if (stats->iostat_seeks) {
++                      aveseek = stats->iostat_seek_sectors;
++                      do_div(aveseek, stats->iostat_seeks);
++                      percent = stats->iostat_seeks * 100;
++                      do_div(percent, stats->iostat_reqs);
++              }
++
++              seq_printf(seq, "\n%llu sectors in %llu reqs: %llu seek(s) over "
++                              "%llu sectors in ave, %llu%% of all reqs\n",
++                              stats->iostat_sectors, stats->iostat_reqs,
++                              stats->iostat_seeks, aveseek, percent);
++      }
++
++      seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "process time", "reads",
++                      "%", "writes", "%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long read_percent = 0, write_percent = 0;
++              if (stats->iostat_wtime[i] == 0 &&
++                              stats->iostat_rtime[i] == 0)
++                      continue;
++              if (stats->iostat_read_reqs)
++                      read_percent = stats->iostat_rtime[i] * 100 / 
++                              stats->iostat_read_reqs;
++              if (stats->iostat_write_reqs)
++                      write_percent = stats->iostat_wtime[i] * 100 / 
++                              stats->iostat_write_reqs;
++              seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++                              jiffies_to_msecs(((1UL << i) >> 1) << 1),
++                              stats->iostat_rtime[i], read_percent,
++                              stats->iostat_wtime[i], write_percent);
++      }
++
++      seq_printf(seq, "\n%16s %8s %8s %8s %8s\n", "time in queue", "reads",
++                      "%", "writes", "%");
++      for (i = 0; i < IOSTAT_NCOUNTERS; i++) {
++              unsigned long read_percent = 0, write_percent = 0;
++              if (stats->iostat_wtime_in_queue[i] == 0 &&
++                              stats->iostat_rtime_in_queue[i] == 0)
++                      continue;
++              if (stats->iostat_read_reqs)
++                      read_percent = stats->iostat_rtime_in_queue[i] * 100 / 
++                              stats->iostat_read_reqs;
++              if (stats->iostat_write_reqs)
++                      write_percent = stats->iostat_wtime_in_queue[i] * 100 / 
++                              stats->iostat_write_reqs;
++              seq_printf(seq, "%16u %8lu %8lu %8lu %8lu\n",
++                              jiffies_to_msecs(((1UL << i) >> 1) << 1),
++                              stats->iostat_rtime_in_queue[i],
++                              read_percent,
++                              stats->iostat_wtime_in_queue[i],
++                              write_percent);
++      }
++
++      return 0;
++}
++
++static void *
++sd_iostats_seq_start(struct seq_file *p, loff_t *pos)
++{
++      return (*pos == 0) ? (void *)1 : NULL;
++}
++
++static void *
++sd_iostats_seq_next(struct seq_file *p, void *v, loff_t *pos)
++{
++      ++*pos;
++      return NULL;
++}
++
++static void
++sd_iostats_seq_stop(struct seq_file *p, void *v)
++{
++}
++
++/* Iterator callbacks passed to seq_open() by sd_iostats_seq_open(). */
++static struct seq_operations sd_iostats_seqops = {
++      .start  = sd_iostats_seq_start,
++      .next   = sd_iostats_seq_next,
++      .stop   = sd_iostats_seq_stop,
++      .show   = sd_iostats_seq_show,
++};
++
++/*
++ * open() handler for an iostats proc file.  Sets up a seq_file using
++ * sd_iostats_seqops and stores the proc entry's ->data pointer in the
++ * seq_file's ->private field.
++ *
++ * Returns 0 on success, or the non-zero result of seq_open() unchanged.
++ */
++static int
++sd_iostats_seq_open (struct inode *inode, struct file *file)
++{
++      struct seq_file *sf;
++      int err = seq_open(file, &sd_iostats_seqops);
++
++      if (err)
++              return err;
++
++      /* seq_open() left its seq_file in file->private_data */
++      sf = file->private_data;
++      sf->private = PDE(inode)->data;
++
++      return 0;
++}
++
++/*
++ * write() handler for an iostats proc file.  The written bytes are never
++ * examined: any write resets the disk's statistics, and the full length is
++ * reported as consumed.
++ *
++ * Under iostat_lock, every field located before iostat_lock in
++ * iostat_stats_t is zeroed.  The queue depth is carried across the reset,
++ * iostat_timeval is set to the current time and iostat_queue_stamp to the
++ * current jiffies.
++ */
++static ssize_t
++sd_iostats_seq_write(struct file *file, const char *buffer,
++                   size_t len, loff_t *off)
++{
++      struct seq_file *sf = file->private_data;
++      struct gendisk  *gd = sf->private;
++      iostat_stats_t  *st = scsi_disk(gd)->stats;
++      unsigned long    saved_depth;
++      unsigned long    irqflags;
++
++      spin_lock_irqsave (&st->iostat_lock, irqflags);
++
++      /* keep the depth: sd_iostats_finish_req() BUG()s on a zero depth */
++      saved_depth = st->iostat_queue_depth;
++      memset (st, 0, offsetof(iostat_stats_t, iostat_lock));
++      st->iostat_queue_depth = saved_depth;
++
++      st->iostat_queue_stamp = jiffies;
++      do_gettimeofday(&st->iostat_timeval);
++
++      spin_unlock_irqrestore (&st->iostat_lock, irqflags);
++
++      return len;
++}
++
++/*
++ * File operations for an iostats proc file: reads go through seq_file,
++ * writes go to sd_iostats_seq_write().
++ */
++static struct file_operations sd_iostats_proc_fops = {
++      .owner   = THIS_MODULE,
++      .open    = sd_iostats_seq_open,
++      .release = seq_release,
++      .read    = seq_read,
++      .llseek  = seq_lseek,
++      .write   = sd_iostats_seq_write,
++};
++
++extern struct proc_dir_entry *proc_scsi;
++
++/*
++ * Create the iostats directory (named sd_iostats_procdir_name) under
++ * proc_scsi and remember it in sd_iostats_procdir.  Failures are only
++ * logged with a warning; nothing is reported back to the caller.
++ */
++void
++sd_iostats_init(void)
++{
++      if (!proc_scsi) {
++              printk(KERN_WARNING "No access to sd iostats: "
++                      "proc_scsi is NULL\n");
++              return;
++      }
++
++      sd_iostats_procdir = create_proc_entry(sd_iostats_procdir_name,
++                                             S_IFDIR | S_IRUGO | S_IXUGO,
++                                             proc_scsi);
++      if (sd_iostats_procdir)
++              return;
++
++      printk(KERN_WARNING "No access to sd iostats: "
++              "can't create /proc/scsi/%s\n", sd_iostats_procdir_name);
++}
++
++/*
++ * Undo sd_iostats_init(): remove the iostats directory from proc_scsi when
++ * both it and proc_scsi exist, then clear sd_iostats_procdir in any case.
++ */
++void sd_iostats_fini(void)
++{
++      if (sd_iostats_procdir && proc_scsi)
++              remove_proc_entry(sd_iostats_procdir_name, proc_scsi);
++
++      sd_iostats_procdir = NULL;
++}
++
++/*
++ * Account for a finished request in the disk's iostats.
++ *
++ * Does nothing when the disk has no stats attached.  Otherwise, under
++ * iostat_lock:
++ *  - bumps the read or write processing-time counter of the log2 bucket
++ *    for the ticks elapsed since rq->start_time (last bucket on overflow);
++ *  - charges the ticks elapsed since iostat_queue_stamp to the current
++ *    queue depth (last bucket on overflow) and decrements the depth;
++ *  - updates the sector, request and seek counters.
++ */
++void sd_iostats_finish_req(struct scsi_cmnd *SCpnt)
++{
++      struct request          *rq = SCpnt->request;
++      iostat_stats_t          *stats;
++      unsigned long           *tcounter;
++      int                     tbucket;
++      int                     tmp;
++      unsigned long           irqflags;
++      unsigned long           i;
++      unsigned long           now;
++      unsigned long           elapsed;
++
++      stats = scsi_disk(rq->rq_disk)->stats;
++      if (stats == NULL)
++              return;
++
++      /* tbucket = floor(log2(ticks since rq->start_time)), clamped */
++      tmp = jiffies - rq->start_time;
++      for (tbucket = 0; tmp > 1; tbucket++)
++              tmp >>= 1;
++      if (tbucket >= IOSTAT_NCOUNTERS)
++              tbucket = IOSTAT_NCOUNTERS - 1;
++
++      tcounter = rq_data_dir(rq) == WRITE ?
++              &stats->iostat_wtime[tbucket] : &stats->iostat_rtime[tbucket];
++
++      spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++      /*
++       * Read jiffies exactly once, after taking the lock.  jiffies may
++       * advance between reads, so separate reads could charge different
++       * tick counts to the depth bucket and to the sum, and could drop
++       * ticks when the stamp is refreshed below.  Sampling under the lock
++       * keeps the sample ordered after the stamp written by the previous
++       * lock holder.
++       */
++      now = jiffies;
++      elapsed = now - stats->iostat_queue_stamp;
++
++      /* update delay stats */
++      (*tcounter)++;
++
++      /* update queue depth stats */
++      i = stats->iostat_queue_depth;
++      if (i >= IOSTAT_NCOUNTERS)
++              i = IOSTAT_NCOUNTERS - 1;
++      stats->iostat_queue_ticks[i] += elapsed;
++      stats->iostat_queue_ticks_sum += elapsed;
++      BUG_ON(stats->iostat_queue_depth == 0);
++      stats->iostat_queue_depth--;
++
++      /* update seek stats. XXX: not sure about nr_sectors */
++      stats->iostat_sectors += rq->nr_sectors;
++      stats->iostat_reqs++;
++      if (rq->sector != stats->iostat_next_sector) {
++              /* seek distance is the absolute gap to the expected sector */
++              stats->iostat_seek_sectors +=
++                      rq->sector > stats->iostat_next_sector ?
++                      rq->sector - stats->iostat_next_sector :
++                      stats->iostat_next_sector - rq->sector;
++              stats->iostat_seeks++;
++      }
++      stats->iostat_next_sector = rq->sector + rq->nr_sectors;
++
++      stats->iostat_queue_stamp = now;
++
++      spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++
++/*
++ * Account for a request that is being started in the disk's iostats.
++ *
++ * Does nothing when the disk has no stats attached.  Otherwise:
++ *  - picks the size histogram slot from log2 of the transfer length in
++ *    512-byte sectors; a length beyond the last slot is a BUG();
++ *  - picks the time-in-queue bucket from log2 of the ticks elapsed since
++ *    rq->start_time (last bucket on overflow);
++ *  - overwrites rq->start_time with the current jiffies, so that
++ *    sd_iostats_finish_req() measures from this point;
++ *  - under iostat_lock, charges the ticks elapsed since
++ *    iostat_queue_stamp to the current queue depth, increments the depth
++ *    and bumps the per-direction and size counters.
++ */
++void sd_iostats_start_req(struct scsi_cmnd *SCpnt)
++{
++      struct request          *rq = SCpnt->request;
++      iostat_stats_t          *stats;
++      iostat_counter_t        *counter;
++      int                     bucket;
++      int                     tbucket;
++      int                     tmp;
++      unsigned long           irqflags;
++      unsigned long           i;
++      unsigned long           now;
++      unsigned long           elapsed;
++      int                     nsect;
++
++      stats = scsi_disk(rq->rq_disk)->stats;
++      if (stats == NULL)
++              return;
++
++      /* bucket = floor(log2(transfer length in 512-byte sectors)) */
++      nsect = scsi_bufflen(SCpnt) >> 9;
++      for (bucket = 0, tmp = nsect; tmp > 1; bucket++)
++              tmp >>= 1;
++
++      if (bucket >= IOSTAT_NCOUNTERS) {
++              printk (KERN_ERR "sd_iostats_bump: nsect %d too big\n", nsect);
++              BUG();
++      }
++
++      counter = rq_data_dir(rq) == WRITE ?
++              &stats->iostat_write_histogram[bucket] :
++              &stats->iostat_read_histogram[bucket];
++
++      /*
++       * One jiffies sample serves both as the end of the time in queue
++       * and as the new start_time; with two reads, a tick in between
++       * would be counted in neither interval.
++       */
++      now = jiffies;
++      tmp = now - rq->start_time;
++      for (tbucket = 0; tmp > 1; tbucket++)
++              tmp >>= 1;
++      if (tbucket >= IOSTAT_NCOUNTERS)
++              tbucket = IOSTAT_NCOUNTERS - 1;
++
++      /* an ugly hack to know exact processing time. the right
++       * solution is to add one more field to struct request
++       * hopefully it will break nothing ... */
++      rq->start_time = now;
++
++      spin_lock_irqsave(&stats->iostat_lock, irqflags);
++
++      /*
++       * Sample jiffies again, once, now that the lock is held: the same
++       * value must be charged to the depth bucket and to the sum and then
++       * stored as the new stamp.  The earlier sample is not reused because
++       * the stamp may have been refreshed by another lock holder since.
++       */
++      now = jiffies;
++      elapsed = now - stats->iostat_queue_stamp;
++
++      /* update queue depth stats */
++      i = stats->iostat_queue_depth;
++      if (i >= IOSTAT_NCOUNTERS)
++              i = IOSTAT_NCOUNTERS - 1;
++      stats->iostat_queue_ticks[i] += elapsed;
++      stats->iostat_queue_ticks_sum += elapsed;
++      stats->iostat_queue_depth++;
++
++      /* update delay stats */
++      if (rq_data_dir(rq) == WRITE) {
++              stats->iostat_wtime_in_queue[tbucket]++;
++              stats->iostat_write_reqs++;
++      } else {
++              stats->iostat_rtime_in_queue[tbucket]++;
++              stats->iostat_read_reqs++;
++      }
++
++      /* update size stats */
++      counter->iostat_size += nsect;
++      counter->iostat_count++;
++
++      stats->iostat_queue_stamp = now;
++
++      spin_unlock_irqrestore(&stats->iostat_lock, irqflags);
++}
++#endif
++
+ /**
+  *    scsi_disk_release - Called to free the scsi_disk structure
+  *    @cdev: pointer to embedded class device
+@@ -1727,10 +2138,16 @@ static void scsi_disk_release(struct cla
+       idr_remove(&sd_index_idr, sdkp->index);
+       spin_unlock(&sd_index_lock);
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      if (sdkp->stats) {
++              remove_proc_entry(disk->disk_name, sd_iostats_procdir);
++              kfree(sdkp->stats);
++              sdkp->stats = NULL;
++      }
++#endif
+       disk->private_data = NULL;
+       put_disk(disk);
+       put_device(&sdkp->device->sdev_gendev);
+-
+       kfree(sdkp);
+ }
+@@ -1845,6 +2262,8 @@ static int __init init_sd(void)
+       if (!majors)
+               return -ENODEV;
++      sd_iostats_init();
++
+       err = class_register(&sd_disk_class);
+       if (err)
+               goto err_out;
+@@ -1860,6 +2279,7 @@ err_out_class:
+ err_out:
+       for (i = 0; i < SD_MAJORS; i++)
+               unregister_blkdev(sd_major(i), "sd");
++      sd_iostats_fini();
+       return err;
+ }
+Index: linux-2.6.22.19/drivers/scsi/sd.h
+===================================================================
+--- linux-2.6.22.19.orig/drivers/scsi/sd.h
++++ linux-2.6.22.19/drivers/scsi/sd.h
+@@ -31,6 +31,46 @@
+  */
+ #define SD_BUF_SIZE           512
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++typedef struct {
++      unsigned long long iostat_size;         /* sectors, summed over requests */
++      unsigned long long iostat_count;        /* number of requests counted */
++} iostat_counter_t;
++
++#define IOSTAT_NCOUNTERS 16
++typedef struct {
++      iostat_counter_t        iostat_read_histogram[IOSTAT_NCOUNTERS];  /* by log2(sectors) */
++      iostat_counter_t        iostat_write_histogram[IOSTAT_NCOUNTERS]; /* by log2(sectors) */
++      struct timeval          iostat_timeval; /* set to current time on reset */
++
++      /* queue depth: ticks spent at each depth (how well the pipe is filled up) */
++      unsigned long long      iostat_queue_ticks[IOSTAT_NCOUNTERS];
++      unsigned long long      iostat_queue_ticks_sum;
++      unsigned long           iostat_queue_depth; /* started, not yet finished */
++      unsigned long           iostat_queue_stamp; /* jiffies at last update */
++
++      /* seeks: how linear the traffic is */
++      unsigned long long      iostat_next_sector; /* end of last finished request */
++      unsigned long long      iostat_seek_sectors;
++      unsigned long long      iostat_seeks;
++      unsigned long long      iostat_sectors;
++      unsigned long long      iostat_reqs;
++      unsigned long           iostat_read_reqs;
++      unsigned long           iostat_write_reqs;
++
++      /* process time: how long it takes to process requests (log2 tick buckets) */
++      unsigned long           iostat_rtime[IOSTAT_NCOUNTERS];
++      unsigned long           iostat_wtime[IOSTAT_NCOUNTERS];
++
++      /* queue time: how long requests spent in elevator's queue (log2 tick buckets) */
++      unsigned long           iostat_rtime_in_queue[IOSTAT_NCOUNTERS];
++      unsigned long           iostat_wtime_in_queue[IOSTAT_NCOUNTERS];
++
++      /* must be the last field: a reset memsets everything located before it */
++      spinlock_t              iostat_lock;
++} ____cacheline_aligned_in_smp iostat_stats_t;
++#endif
++
+ struct scsi_disk {
+       struct scsi_driver *driver;     /* always &sd_template */
+       struct scsi_device *device;
+@@ -44,6 +84,9 @@ struct scsi_disk {
+       unsigned        WCE : 1;        /* state of disk WCE bit */
+       unsigned        RCD : 1;        /* state of disk RCD bit, unused */
+       unsigned        DPOFUA : 1;     /* state of disk DPOFUA bit */
++#if (defined(CONFIG_SD_IOSTATS) && defined(CONFIG_PROC_FS))
++      iostat_stats_t  *stats;         /* scsi disk statistics */
++#endif
+ };
+ #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,cdev)