Porting RAID5 improvements to RHEL5 kernels.
author jxiong <jxiong>
Wed, 13 Feb 2008 11:51:09 +0000 (11:51 +0000)
committer jxiong <jxiong>
Wed, 13 Feb 2008 11:51:09 +0000 (11:51 +0000)
b=13648
r=alex,andreas

lustre/kernel_patches/patches/md-rebuild-policy.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid5-large-io-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid5-stats-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch [new file with mode: 0644]
lustre/kernel_patches/series/2.6-rhel5.series

diff --git a/lustre/kernel_patches/patches/md-rebuild-policy.patch b/lustre/kernel_patches/patches/md-rebuild-policy.patch
new file mode 100644 (file)
index 0000000..e6c9f9c
--- /dev/null
@@ -0,0 +1,137 @@
+diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c
+--- linux-2.6.18-53.orig/drivers/md/md.c       2008-02-13 17:34:25.000000000 +0800
++++ linux-2.6.18-53/drivers/md/md.c    2008-02-13 17:39:28.000000000 +0800
+@@ -90,6 +90,8 @@ static void md_print_devices(void);
+ static int sysctl_speed_limit_min = 1000;
+ static int sysctl_speed_limit_max = 200000;
++static int sysctl_rebuild_window_size = 256;
++static int sysctl_disk_idle_size = 4096;
+ static inline int speed_min(mddev_t *mddev)
+ {
+       return mddev->sync_speed_min ?
+@@ -121,6 +123,22 @@ static ctl_table raid_table[] = {
+               .mode           = S_IRUGO|S_IWUSR,
+               .proc_handler   = &proc_dointvec,
+       },
++      {
++              .ctl_name       = DEV_RAID_REBUILD_WINDOW,
++              .procname       = "rebuild_window_size",
++              .data           = &sysctl_rebuild_window_size,
++              .maxlen         = sizeof(int),
++              .mode           = S_IRUGO|S_IWUSR,
++              .proc_handler   = &proc_dointvec,
++      },
++      {
++              .ctl_name       = DEV_RAID_DISK_IDLE_SIZE,
++              .procname       = "disk_idle_size",
++              .data           = &sysctl_disk_idle_size,
++              .maxlen         = sizeof(int),
++              .mode           = S_IRUGO|S_IWUSR,
++              .proc_handler   = &proc_dointvec,
++      },
+       { .ctl_name = 0 }
+ };
+@@ -4980,14 +4998,15 @@ static int is_mddev_idle(mddev_t *mddev)
+       mdk_rdev_t * rdev;
+       struct list_head *tmp;
+       int idle;
+-      unsigned long curr_events;
++      unsigned long rw, sync;
+       idle = 1;
+       ITERATE_RDEV(mddev,rdev,tmp) {
+               struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
+-              curr_events = disk_stat_read(disk, sectors[0]) + 
+-                              disk_stat_read(disk, sectors[1]) - 
+-                              atomic_read(&disk->sync_io);
++
++              rw = disk_stat_read(disk, sectors[READ])+disk_stat_read(disk, sectors[WRITE]);
++              sync = atomic_read(&disk->sync_io);
++
+               /* The difference between curr_events and last_events
+                * will be affected by any new non-sync IO (making
+                * curr_events bigger) and any difference in the amount of
+@@ -5001,9 +5020,9 @@ static int is_mddev_idle(mddev_t *mddev)
+                *
+                * Note: the following is an unsigned comparison.
+                */
+-              if ((curr_events - rdev->last_events + 4096) > 8192) {
+-                      rdev->last_events = curr_events;
++              if (rw - rdev->last_events > sync + sysctl_disk_idle_size) {
+                       idle = 0;
++                      rdev->last_events = rw - sync;
+               }
+       }
+       return idle;
+@@ -5069,8 +5088,7 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wa
+ void md_do_sync(mddev_t *mddev)
+ {
+       mddev_t *mddev2;
+-      unsigned int currspeed = 0,
+-               window;
++      unsigned int currspeed = 0;
+       sector_t max_sectors,j, io_sectors;
+       unsigned long mark[SYNC_MARKS];
+       sector_t mark_cnt[SYNC_MARKS];
+@@ -5190,9 +5208,8 @@ void md_do_sync(mddev_t *mddev)
+       /*
+        * Tune reconstruction:
+        */
+-      window = 32*(PAGE_SIZE/512);
+       printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
+-              window/2,(unsigned long long) max_sectors/2);
++              sysctl_rebuild_window_size/2,(unsigned long long) max_sectors/2);
+       atomic_set(&mddev->recovery_active, 0);
+       init_waitqueue_head(&mddev->recovery_wait);
+@@ -5230,7 +5247,7 @@ void md_do_sync(mddev_t *mddev)
+                        */
+                       md_new_event(mddev);
+-              if (last_check + window > io_sectors || j == max_sectors)
++              if (last_check + sysctl_rebuild_window_size > io_sectors || j == max_sectors)
+                       continue;
+               last_check = io_sectors;
+@@ -5251,7 +5268,6 @@ void md_do_sync(mddev_t *mddev)
+                       last_mark = next;
+               }
+-
+               if (kthread_should_stop()) {
+                       /*
+                        * got a signal, exit.
+@@ -5275,10 +5291,16 @@ void md_do_sync(mddev_t *mddev)
+               currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
+                       /((jiffies-mddev->resync_mark)/HZ +1) +1;
+-
+               if (currspeed > speed_min(mddev)) {
+                       if ((currspeed > speed_max(mddev)) ||
+                                       !is_mddev_idle(mddev)) {
++                              static unsigned long next_report;
++                              if (time_after(jiffies, next_report)) {
++                                      printk(KERN_INFO "md: rebuild %s throttled due to IO\n",
++                                              mdname(mddev));
++                                      /* once per 10 minutes */
++                                      next_report = jiffies + 600 * HZ;
++                              }
+                               msleep(500);
+                               goto repeat;
+                       }
+diff -pur linux-2.6.18-53.orig/include/linux/sysctl.h linux-2.6.18-53/include/linux/sysctl.h
+--- linux-2.6.18-53.orig/include/linux/sysctl.h        2008-02-13 17:35:25.000000000 +0800
++++ linux-2.6.18-53/include/linux/sysctl.h     2008-02-13 17:36:22.000000000 +0800
+@@ -903,7 +903,9 @@ enum {
+ /* /proc/sys/dev/raid */
+ enum {
+       DEV_RAID_SPEED_LIMIT_MIN=1,
+-      DEV_RAID_SPEED_LIMIT_MAX=2
++      DEV_RAID_SPEED_LIMIT_MAX=2,
++      DEV_RAID_REBUILD_WINDOW=3,
++      DEV_RAID_DISK_IDLE_SIZE=4
+ };
+ /* /proc/sys/dev/parport/default */
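
The patch above turns the hard-coded rebuild window and idle heuristic into two sysctls, /proc/sys/dev/raid/rebuild_window_size and /proc/sys/dev/raid/disk_idle_size: a disk now counts as busy only when non-resync traffic since the last check exceeds disk_idle_size sectors. For illustration, a minimal user-space model of the new test; the struct and the sample counter values are invented, only the comparison mirrors the patched is_mddev_idle():

    /* Toy model of the patched idle test. "rw" counts all sectors
     * transferred, "sync" the resync sectors; numbers are made up. */
    #include <stdio.h>

    static unsigned long disk_idle_size = 4096; /* /proc/sys/dev/raid/disk_idle_size */

    struct disk_model {
            unsigned long rw;          /* sectors[READ] + sectors[WRITE] */
            unsigned long sync;        /* atomic_read(&disk->sync_io) */
            unsigned long last_events; /* rdev->last_events */
    };

    /* Returns 1 when everything transferred since the last check is
     * explained by resync traffic plus the disk_idle_size slack. */
    static int disk_is_idle(struct disk_model *d)
    {
            if (d->rw - d->last_events > d->sync + disk_idle_size) {
                    d->last_events = d->rw - d->sync;
                    return 0;
            }
            return 1;
    }

    int main(void)
    {
            struct disk_model d = { .rw = 20000, .sync = 12000, .last_events = 5000 };
            /* 20000 - 5000 = 15000, not above 12000 + 4096 -> idle */
            printf("idle = %d\n", disk_is_idle(&d));
            return 0;
    }

rebuild_window_size simply replaces the old fixed 32-page window in md_do_sync(), so the resync checkpoint interval can be tuned without recompiling.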
diff --git a/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch b/lustre/kernel_patches/patches/raid5-configurable-cachesize-rhel5.patch
new file mode 100644 (file)
index 0000000..be8f6c2
--- /dev/null
@@ -0,0 +1,31 @@
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-06 17:23:39.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:24:14.000000000 +0800
+@@ -57,7 +57,7 @@
+  * Stripe cache
+  */
+-#define NR_STRIPES            256
++static int raid5_nr_stripes = 256 * 8;
+ #define STRIPE_SIZE           PAGE_SIZE
+ #define STRIPE_SHIFT          (PAGE_SHIFT - 9)
+ #define STRIPE_SECTORS                (STRIPE_SIZE>>9)
+@@ -3230,7 +3230,7 @@ static int run(mddev_t *mddev)
+       else
+               conf->max_degraded = 1;
+       conf->algorithm = mddev->layout;
+-      conf->max_nr_stripes = NR_STRIPES;
++      conf->max_nr_stripes = raid5_nr_stripes;
+       conf->expand_progress = mddev->reshape_position;
+       /* device size must be a multiple of chunk size */
+@@ -3821,6 +3821,7 @@ static void raid5_exit(void)
+ module_init(raid5_init);
+ module_exit(raid5_exit);
++module_param(raid5_nr_stripes, int, 0644);
+ MODULE_LICENSE("GPL");
+ MODULE_ALIAS("md-personality-4"); /* RAID5 */
+ MODULE_ALIAS("md-raid5");
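
With the stripe cache size now exposed as the raid5_nr_stripes module parameter (default raised from 256 to 256 * 8), its memory cost is worth estimating: each stripe head carries one PAGE_SIZE buffer per member disk. A back-of-envelope sketch, where the 10-disk array and 4 KiB page size are assumptions:

    /* Rough memory footprint of the enlarged stripe cache. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long nr_stripes = 256 * 8; /* new raid5_nr_stripes default */
            unsigned long raid_disks = 10;      /* assumed array width */
            unsigned long page_size  = 4096;    /* assumed PAGE_SIZE */

            /* one page per member disk per stripe head */
            unsigned long bytes = nr_stripes * raid_disks * page_size;
            printf("stripe cache ~ %lu MiB\n", bytes >> 20); /* 80 MiB here */
            return 0;
    }

Since run() copies raid5_nr_stripes into conf->max_nr_stripes when an array is started, writing the 0644 parameter at runtime appears to affect only arrays assembled afterwards.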
diff --git a/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch b/lustre/kernel_patches/patches/raid5-large-io-rhel5.patch
new file mode 100644 (file)
index 0000000..a415611
--- /dev/null
@@ -0,0 +1,15 @@
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-06 17:26:27.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:26:55.000000000 +0800
+@@ -3340,6 +3340,11 @@ static int run(mddev_t *mddev)
+       mddev->array_size =  mddev->size * (conf->previous_raid_disks -
+                                           conf->max_degraded);
++      /* in order to support large I/Os */
++      blk_queue_max_sectors(mddev->queue, conf->chunk_size * conf->previous_raid_disks >> 9);
++      mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
++      mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
++
+       return 0;
+ abort:
+       if (conf) {
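
The three limits above all derive from the full stripe width, one chunk from every member device. Assuming 64 KiB chunks, an 8-disk array and 4 KiB pages (all illustrative values), the arithmetic works out as follows:

    /* Queue limits implied by the large-io patch, sample geometry. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long chunk_size = 64 * 1024; /* bytes, assumed */
            unsigned long raid_disks = 8;         /* assumed */
            unsigned long page_shift = 12;        /* 4 KiB pages */

            /* blk_queue_max_sectors(): stripe width in 512-byte sectors */
            unsigned long max_sectors  = chunk_size * raid_disks >> 9;
            /* max_phys_segments / max_hw_segments: same span in pages */
            unsigned long max_segments = chunk_size * raid_disks >> page_shift;

            printf("max_sectors = %lu (%lu KiB), segments = %lu\n",
                   max_sectors, max_sectors / 2, max_segments);
            return 0;
    }

So a single request can now span the whole 512 KiB stripe, letting the layer above push full-stripe writes down in one bio.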
diff --git a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch
new file mode 100644 (file)
index 0000000..735af2c
--- /dev/null
@@ -0,0 +1,185 @@
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-28 18:55:24.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:08:15.000000000 +0800
+@@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip
+       }
+ }
++/*
++ * The whole idea is to collect all bios and then issue them
++ * disk by disk to assist merging a bit -bzzz
++ */
++static void raid5_flush_bios(raid5_conf_t *conf, struct bio *bios[], int raid_disks)
++{
++      struct bio *bio, *nbio;
++      int i;
++      for (i = 0; i < raid_disks; i++) {
++              bio = bios[i];
++              while (bio) {
++                      nbio = bio->bi_next;
++                      bio->bi_next = NULL;
++                      generic_make_request(bio);
++                      bio = nbio;
++              }
++              bios[i] = NULL;
++      }
++}
+ /*
+  * Each stripe/dev can have one or more bion attached.
+@@ -1392,7 +1411,7 @@ static int stripe_to_pdidx(sector_t stri
+  *
+  */
+  
+-static void handle_stripe5(struct stripe_head *sh)
++static void handle_stripe5(struct stripe_head *sh, struct bio *bios[])
+ {
+       raid5_conf_t *conf = sh->raid_conf;
+       int disks = sh->disks;
+@@ -1939,7 +1958,11 @@ static void handle_stripe5(struct stripe
+                           test_bit(R5_ReWrite, &sh->dev[i].flags))
+                               atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+                       atomic_inc(&conf->out_reqs_in_queue);
+-                      generic_make_request(bi);
++                      if (bios) {
++                              bi->bi_next = bios[i];
++                              bios[i] = bi;
++                      } else
++                              generic_make_request(bi);
+               } else {
+                       if (rw == 1)
+                               set_bit(STRIPE_DEGRADED, &sh->state);
+@@ -1951,7 +1974,7 @@ static void handle_stripe5(struct stripe
+       }
+ }
+-static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
++static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[])
+ {
+       raid6_conf_t *conf = sh->raid_conf;
+       int disks = conf->raid_disks;
+@@ -2499,7 +2522,11 @@ static void handle_stripe6(struct stripe
+                       if (rw == WRITE &&
+                           test_bit(R5_ReWrite, &sh->dev[i].flags))
+                               atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+-                      generic_make_request(bi);
++                      if (bios) {
++                              bi->bi_next = bios[i];
++                              bios[i] = bi;
++                      } else
++                              generic_make_request(bi);
+                       atomic_inc(&conf->out_reqs_in_queue);
+               } else {
+                       if (rw == 1)
+@@ -2512,12 +2539,12 @@ static void handle_stripe6(struct stripe
+       }
+ }
+-static void handle_stripe(struct stripe_head *sh, struct page *tmp_page)
++static void handle_stripe(struct stripe_head *sh, struct page *tmp_page, struct bio *bios[])
+ {
+       if (sh->raid_conf->level == 6)
+-              handle_stripe6(sh, tmp_page);
++              handle_stripe6(sh, tmp_page, bios);
+       else
+-              handle_stripe5(sh);
++              handle_stripe5(sh, bios);
+ }
+@@ -2670,6 +2697,7 @@ static int make_request(request_queue_t 
+       int stripes_per_chunk, sectors_per_block;
+       int sectors_per_stripe;
+       int i, j;
++      struct bio *bios[MD_SB_DISKS];
+       DEFINE_WAIT(w);
+       int disks, data_disks;
+@@ -2698,6 +2726,7 @@ static int make_request(request_queue_t 
+       sectors = bi->bi_size >> 9;
+       stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
++      memset(&bios, 0, sizeof(bios));
+ redo_bio:
+      /* stripe-by-stripe handling needs a stable raid layout, so if this
+       * request covers the expanding region, wait until it is over.
+@@ -2756,8 +2785,10 @@ retry:
+                                        * the raid layout has been changed, we have to redo the
+                                        * whole bio because we don't know which sectors in it have been
+                                        * done, and which have not. -jay */
+-                                      if (raid5_redo_bio(conf, bi, disks, logical_sector))
++                                      if (raid5_redo_bio(conf, bi, disks, logical_sector)) {
++                                              raid5_flush_bios(conf, bios, disks);
+                                               goto redo_bio;
++                                      }
+                                       if (test_bit(STRIPE_EXPANDING, &sh->state)) {
+                                               /* Stripe is busy expanding or
+@@ -2766,6 +2797,7 @@ retry:
+                                                */
+                                               release_stripe(sh);
+                                               sh = NULL;
++                                              raid5_flush_bios(conf, bios, disks);
+                                               raid5_unplug_device(mddev->queue);
+                                               schedule();
+                                               goto retry;
+@@ -2784,17 +2816,19 @@ retry:
+                        */
+                       if (r_sector >= mddev->suspend_lo &&
+                           r_sector < mddev->suspend_hi) {
+-                              handle_stripe(sh, NULL);
++                              handle_stripe(sh, NULL, NULL);
+                               release_stripe(sh);
+                               sh = NULL;
++                              raid5_flush_bios(conf, bios, disks);
+                               schedule();
+                               goto retry;
+                       }
+                       if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+-                              handle_stripe(sh, NULL);
++                              handle_stripe(sh, NULL, NULL);
+                               release_stripe(sh);
+                               sh = NULL;
++                              raid5_flush_bios(conf, bios, disks);
+                               raid5_unplug_device(mddev->queue);
+                               schedule();
+                               goto retry;
+@@ -2810,7 +2844,7 @@ retry:
+                       r_sector += sectors_per_chunk;
+               }
+               if (sh) {
+-                      handle_stripe(sh, NULL);
++                      handle_stripe(sh, NULL, NULL);
+                       release_stripe(sh);
+                       sh = NULL;
+               }
+@@ -2820,6 +2854,9 @@ retry:
+       if (sectors > 0)
+               goto repeat;
++      /* flush all of the bios */
++      raid5_flush_bios(conf, bios, disks);
++
+       spin_lock_irq(&conf->device_lock);
+       remaining = --bi->bi_phys_segments;
+       spin_unlock_irq(&conf->device_lock);
+@@ -3035,7 +3072,7 @@ static inline sector_t sync_request(mdde
+       clear_bit(STRIPE_INSYNC, &sh->state);
+       spin_unlock(&sh->lock);
+-      handle_stripe(sh, NULL);
++      handle_stripe(sh, NULL, NULL);
+       release_stripe(sh);
+       return STRIPE_SECTORS;
+@@ -3091,7 +3128,7 @@ static void raid5d (mddev_t *mddev)
+               
+               handled++;
+               atomic_inc(&conf->handled_in_raid5d);
+-              handle_stripe(sh, conf->spare_page);
++              handle_stripe(sh, conf->spare_page, NULL);
+               release_stripe(sh);
+               spin_lock_irq(&conf->device_lock);
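
The mechanics are small: with a non-NULL bios[] array, handle_stripe() pushes each outgoing bio onto a per-disk list instead of calling generic_make_request() directly, and raid5_flush_bios() later drains the lists device by device so requests for the same disk reach the elevator back to back. A self-contained model of that pattern (struct fake_bio is a stand-in for the kernel's struct bio, sector numbers are invented):

    /* Per-disk batching as in raid5_flush_bios(): push now, issue later. */
    #include <stdio.h>

    #define NR_DISKS 4

    struct fake_bio {
            unsigned long sector;
            struct fake_bio *next;
    };

    static struct fake_bio *pending[NR_DISKS];

    static void queue_bio(int disk, struct fake_bio *b)
    {
            b->next = pending[disk]; /* like bi->bi_next = bios[i] */
            pending[disk] = b;
    }

    static void flush_bios(void)
    {
            int i;
            for (i = 0; i < NR_DISKS; i++) {
                    struct fake_bio *b = pending[i];
                    while (b) {
                            struct fake_bio *n = b->next;
                            b->next = NULL;
                            /* stands in for generic_make_request(bio) */
                            printf("issue disk %d sector %lu\n", i, b->sector);
                            b = n;
                    }
                    pending[i] = NULL;
            }
    }

    int main(void)
    {
            struct fake_bio a = { 0 }, b = { 8 }, c = { 16 };
            queue_bio(0, &a);
            queue_bio(0, &b); /* two bios for disk 0 issued together */
            queue_bio(1, &c);
            flush_bios();
            return 0;
    }

Note where the patch flushes: before every schedule()/retry path and once at the end of make_request(), so batched bios are never held across a sleep.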
diff --git a/lustre/kernel_patches/patches/raid5-stats-rhel5.patch b/lustre/kernel_patches/patches/raid5-stats-rhel5.patch
new file mode 100644 (file)
index 0000000..d1e43d6
--- /dev/null
@@ -0,0 +1,256 @@
+diff -pru linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-06 17:15:22.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-06 17:17:30.000000000 +0800
+@@ -115,10 +115,12 @@ static void __release_stripe(raid5_conf_
+                       if (test_bit(STRIPE_DELAYED, &sh->state)) {
+                               list_add_tail(&sh->lru, &conf->delayed_list);
+                               blk_plug_device(conf->mddev->queue);
++                              atomic_inc(&conf->delayed);
+                       } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+                                  sh->bm_seq - conf->seq_write > 0) {
+                               list_add_tail(&sh->lru, &conf->bitmap_list);
+                               blk_plug_device(conf->mddev->queue);
++                              atomic_inc(&conf->bit_delayed);
+                       } else {
+                               clear_bit(STRIPE_BIT_DELAY, &sh->state);
+                               list_add_tail(&sh->lru, &conf->handle_list);
+@@ -289,6 +291,7 @@ static struct stripe_head *get_active_st
+                       if (noblock && sh == NULL)
+                               break;
+                       if (!sh) {
++                              atomic_inc(&conf->out_of_stripes);
+                               conf->inactive_blocked = 1;
+                               wait_event_lock_irq(conf->wait_for_stripe,
+                                                   !list_empty(&conf->inactive_list) &&
+@@ -311,6 +314,10 @@ static struct stripe_head *get_active_st
+                                   !test_bit(STRIPE_EXPANDING, &sh->state))
+                                       BUG();
+                               list_del_init(&sh->lru);
++                              if (test_bit(STRIPE_DELAYED, &sh->state))
++                                      atomic_dec(&conf->delayed);
++                              if (test_bit(STRIPE_BIT_DELAY, &sh->state))
++                                      atomic_dec(&conf->bit_delayed);
+                       }
+               }
+       } while (sh == NULL);
+@@ -529,6 +536,8 @@ static int raid5_end_read_request(struct
+       if (bi->bi_size)
+               return 1;
++      atomic_dec(&conf->out_reqs_in_queue);
++
+       for (i=0 ; i<disks; i++)
+               if (bi == &sh->dev[i].req)
+                       break;
+@@ -642,6 +651,8 @@ static int raid5_end_write_request (stru
+       if (bi->bi_size)
+               return 1;
++      atomic_dec(&conf->out_reqs_in_queue);
++
+       for (i=0 ; i<disks; i++)
+               if (bi == &sh->dev[i].req)
+                       break;
+@@ -1402,6 +1413,8 @@ static void handle_stripe5(struct stripe
+       clear_bit(STRIPE_HANDLE, &sh->state);
+       clear_bit(STRIPE_DELAYED, &sh->state);
++      atomic_inc(&conf->handle_called);
++
+       syncing = test_bit(STRIPE_SYNCING, &sh->state);
+       expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
+       expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+@@ -1684,6 +1697,7 @@ static void handle_stripe5(struct stripe
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, &dev->flags);
+                                               locked++;
++                                              atomic_inc(&conf->reads_for_rmw);
+                                       } else {
+                                               set_bit(STRIPE_DELAYED, &sh->state);
+                                               set_bit(STRIPE_HANDLE, &sh->state);
+@@ -1703,6 +1717,7 @@ static void handle_stripe5(struct stripe
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, &dev->flags);
+                                               locked++;
++                                              atomic_inc(&conf->reads_for_rcw);
+                                       } else {
+                                               set_bit(STRIPE_DELAYED, &sh->state);
+                                               set_bit(STRIPE_HANDLE, &sh->state);
+@@ -1870,6 +1885,7 @@ static void handle_stripe5(struct stripe
+               bi->bi_next = NULL;
+               bi->bi_size = 0;
+               bi->bi_end_io(bi, bytes, 0);
++              atomic_dec(&conf->in_reqs_in_queue);
+       }
+       for (i=disks; i-- ;) {
+               int rw;
+@@ -1885,10 +1901,13 @@ static void handle_stripe5(struct stripe
+               bi = &sh->dev[i].req;
+  
+               bi->bi_rw = rw;
+-              if (rw)
++              if (rw) {
++                      atomic_inc(&conf->writes_out);
+                       bi->bi_end_io = raid5_end_write_request;
+-              else
++              } else {
++                      atomic_inc(&conf->reads_out);
+                       bi->bi_end_io = raid5_end_read_request;
++              }
+  
+               rcu_read_lock();
+               rdev = rcu_dereference(conf->disks[i].rdev);
+@@ -1919,6 +1938,7 @@ static void handle_stripe5(struct stripe
+                       if (rw == WRITE &&
+                           test_bit(R5_ReWrite, &sh->dev[i].flags))
+                               atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
++                      atomic_inc(&conf->out_reqs_in_queue);
+                       generic_make_request(bi);
+               } else {
+                       if (rw == 1)
+@@ -1955,6 +1975,8 @@ static void handle_stripe6(struct stripe
+       clear_bit(STRIPE_HANDLE, &sh->state);
+       clear_bit(STRIPE_DELAYED, &sh->state);
++      atomic_inc(&conf->handle_called);
++
+       syncing = test_bit(STRIPE_SYNCING, &sh->state);
+       /* Now to look around and see what can be done */
+@@ -2255,6 +2277,7 @@ static void handle_stripe6(struct stripe
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, &dev->flags);
+                                               locked++;
++                                              atomic_inc(&conf->reads_for_rcw);
+                                       } else {
+                                               PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
+                                                      (unsigned long long)sh->sector, i);
+@@ -2423,6 +2446,7 @@ static void handle_stripe6(struct stripe
+               bi->bi_next = NULL;
+               bi->bi_size = 0;
+               bi->bi_end_io(bi, bytes, 0);
++              atomic_dec(&conf->in_reqs_in_queue);
+       }
+       for (i=disks; i-- ;) {
+               int rw;
+@@ -2438,10 +2462,13 @@ static void handle_stripe6(struct stripe
+               bi = &sh->dev[i].req;
+               bi->bi_rw = rw;
+-              if (rw)
++              if (rw) {
++                      atomic_inc(&conf->writes_out);
+                       bi->bi_end_io = raid5_end_write_request;
+-              else
++              } else {
++                      atomic_inc(&conf->reads_out);
+                       bi->bi_end_io = raid5_end_read_request;
++              }
+               rcu_read_lock();
+               rdev = rcu_dereference(conf->disks[i].rdev);
+@@ -2473,6 +2500,7 @@ static void handle_stripe6(struct stripe
+                           test_bit(R5_ReWrite, &sh->dev[i].flags))
+                               atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
+                       generic_make_request(bi);
++                      atomic_inc(&conf->out_reqs_in_queue);
+               } else {
+                       if (rw == 1)
+                               set_bit(STRIPE_DEGRADED, &sh->state);
+@@ -2506,6 +2534,7 @@ static void raid5_activate_delayed(raid5
+                       if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+                               atomic_inc(&conf->preread_active_stripes);
+                       list_add_tail(&sh->lru, &conf->handle_list);
++                      atomic_dec(&conf->delayed);
+               }
+       }
+ }
+@@ -2608,6 +2637,8 @@ static int make_request(request_queue_t 
+       const int rw = bio_data_dir(bi);
+       int remaining;
++      atomic_inc(&conf->in_reqs_in_queue);
++
+       if (unlikely(bio_barrier(bi))) {
+               bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+               return 0;
+@@ -2617,6 +2648,11 @@ static int make_request(request_queue_t 
+       disk_stat_inc(mddev->gendisk, ios[rw]);
+       disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bi));
++      if (rw == WRITE)
++              atomic_inc(&conf->writes_in);
++      else
++              atomic_inc(&conf->reads_in);
++
+       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       last_sector = bi->bi_sector + (bi->bi_size>>9);
+@@ -2724,6 +2760,7 @@ static int make_request(request_queue_t 
+               if ( rw == WRITE )
+                       md_write_end(mddev);
++              atomic_dec(&conf->in_reqs_in_queue);
+               bi->bi_size = 0;
+               bi->bi_end_io(bi, bytes, 0);
+       }
+@@ -2985,6 +3022,7 @@ static void raid5d (mddev_t *mddev)
+               spin_unlock_irq(&conf->device_lock);
+               
+               handled++;
++              atomic_inc(&conf->handled_in_raid5d);
+               handle_stripe(sh, conf->spare_page);
+               release_stripe(sh);
+@@ -3381,6 +3419,21 @@ static void status (struct seq_file *seq
+                              conf->disks[i].rdev &&
+                              test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
+       seq_printf (seq, "]");
++      seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
++                      atomic_read(&conf->reads_in), atomic_read(&conf->writes_in),
++                      atomic_read(&conf->reads_out), atomic_read(&conf->writes_out));
++      seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called",
++                      atomic_read(&conf->handled_in_raid5d),
++                      atomic_read(&conf->out_of_stripes),
++                      atomic_read(&conf->handle_called));
++      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++                      atomic_read(&conf->reads_for_rmw),
++                      atomic_read(&conf->reads_for_rcw));
++      seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
++                      atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
++                      atomic_read(&conf->active_stripes),
++                      atomic_read(&conf->in_reqs_in_queue),
++                      atomic_read(&conf->out_reqs_in_queue));
+ #if RAID5_DEBUG
+       seq_printf (seq, "\n");
+       printall(seq, conf);
+diff -pru linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
+--- linux-2.6.18-53.orig/include/linux/raid/raid5.h    2007-12-06 17:15:22.000000000 +0800
++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-06 17:15:32.000000000 +0800
+@@ -259,6 +259,25 @@ struct raid5_private_data {
+       int                     pool_size; /* number of disks in stripeheads in pool */
+       spinlock_t              device_lock;
+       struct disk_info        *disks;
++
++      /*
++       * Stats
++       */
++      atomic_t                reads_in;
++      atomic_t                writes_in;
++      atomic_t                reads_out;
++      atomic_t                writes_out;
++      atomic_t                handled_in_raid5d;
++      atomic_t                out_of_stripes;
++      atomic_t                reads_for_rmw;
++      atomic_t                reads_for_rcw;
++      atomic_t                writes_zcopy;
++      atomic_t                writes_copied;
++      atomic_t                handle_called;
++      atomic_t                delayed;
++      atomic_t                bit_delayed;
++      atomic_t                in_reqs_in_queue;
++      atomic_t                out_reqs_in_queue;
+ };
+ typedef struct raid5_private_data raid5_conf_t;
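
All of these counters surface through the status() hook, i.e. as extra lines in /proc/mdstat. To show the shape of that output, the short program below replays the patch's seq_printf() format strings; every counter value plugged in is invented:

    /* Replay of the stats patch's /proc/mdstat lines, made-up values. */
    #include <stdio.h>

    int main(void)
    {
            printf("\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
                   1200u, 3400u, 800u, 4100u);
            printf("\n\t\t%u in raid5d, %u out of stripes, %u handle called",
                   950u, 3u, 5200u);
            printf("\n\t\treads: %u for rmw, %u for rcw", 60u, 140u);
            printf("\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
                   2u, 0u, 7u, 4u, 9u);
            return 0;
    }

A high "out of stripes" count suggests the stripe cache is too small (see raid5-configurable-cachesize above), while the rmw/rcw split shows how often writes had to read old data or peer blocks to compute parity.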
diff --git a/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch b/lustre/kernel_patches/patches/raid5-stripe-by-stripe-handling-rhel5.patch
new file mode 100644 (file)
index 0000000..4b72d95
--- /dev/null
@@ -0,0 +1,284 @@
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-28 14:55:08.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 18:52:08.000000000 +0800
+@@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que
+       return ret;
+ }
++static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi)
++{
++      sector_t first_sector, last_sector;
++
++      if (likely(conf->expand_progress == MaxSector))
++              return 0;
++
++      first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
++      last_sector = bi->bi_sector + (bi->bi_size>>9);
++
++      return (first_sector < conf->expand_progress &&
++              last_sector >= conf->expand_lo);
++}
++
++static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector)
++{
++      int redo = 0;
++
++      if (likely(conf->expand_progress == MaxSector))
++              return 0;
++
++      spin_lock_irq(&conf->device_lock);
++      redo = (raid5_expanding_overlap(conf, bi) ||
++              (unlikely(sector < conf->expand_progress) &&
++              disks == conf->previous_raid_disks));
++      spin_unlock_irq(&conf->device_lock);
++      return redo;
++}
++
+ static int make_request(request_queue_t *q, struct bio * bi)
+ {
+       mddev_t *mddev = q->queuedata;
+@@ -2636,6 +2665,14 @@ static int make_request(request_queue_t 
+       struct stripe_head *sh;
+       const int rw = bio_data_dir(bi);
+       int remaining;
++      sector_t stripe, sectors, block, r_sector, b_sector;
++      int sectors_per_chunk = conf->chunk_size >> 9;
++      int stripes_per_chunk, sectors_per_block;
++      int sectors_per_stripe;
++      int i, j;
++
++      DEFINE_WAIT(w);
++      int disks, data_disks;
+       atomic_inc(&conf->in_reqs_in_queue);
+@@ -2653,105 +2690,136 @@ static int make_request(request_queue_t 
+       else
+               atomic_inc(&conf->reads_in);
+-
+       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       bi->bi_next = NULL;
+       bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
+-      for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+-              DEFINE_WAIT(w);
+-              int disks, data_disks;
+-
+-      retry:
+-              prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+-              if (likely(conf->expand_progress == MaxSector))
+-                      disks = conf->raid_disks;
+-              else {
+-                      /* spinlock is needed as expand_progress may be
+-                       * 64bit on a 32bit platform, and so it might be
+-                       * possible to see a half-updated value
+-                       * Ofcourse expand_progress could change after
+-                       * the lock is dropped, so once we get a reference
+-                       * to the stripe that we think it is, we will have
+-                       * to check again.
+-                       */
+-                      spin_lock_irq(&conf->device_lock);
+-                      disks = conf->raid_disks;
+-                      if (logical_sector >= conf->expand_progress)
+-                              disks = conf->previous_raid_disks;
+-                      else {
+-                              if (logical_sector >= conf->expand_lo) {
+-                                      spin_unlock_irq(&conf->device_lock);
+-                                      schedule();
+-                                      goto retry;
+-                              }
+-                      }
+-                      spin_unlock_irq(&conf->device_lock);
+-              }
+-              data_disks = disks - conf->max_degraded;
++      sectors = bi->bi_size >> 9;
++      stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
+-              new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
+-                                                &dd_idx, &pd_idx, conf);
+-              PRINTK("raid5: make_request, sector %llu logical %llu\n",
+-                      (unsigned long long)new_sector, 
+-                      (unsigned long long)logical_sector);
++redo_bio:
++      /* stripe-by-stripe handling needs a stable raid layout, so if this
++       * request covers the expanding region, wait until it is over.
++       * Furthermore, we may get here with the request partially handled, so
++       * also wait for bi_phys_segments to be 1. -jay */
++      spin_lock_irq(&conf->device_lock);
++      wait_event_lock_irq(conf->wait_for_overlap,
++                      (bi->bi_phys_segments == 1) &&
++                      !raid5_expanding_overlap(conf, bi),
++                      conf->device_lock,
++                      (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap)));
++
++      disks = conf->raid_disks;
++      if (unlikely(logical_sector >= conf->expand_progress))
++              disks = conf->previous_raid_disks;
++      data_disks = disks - conf->max_degraded;
++      spin_unlock_irq(&conf->device_lock);
+-              sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
+-              if (sh) {
+-                      if (unlikely(conf->expand_progress != MaxSector)) {
+-                              /* expansion might have moved on while waiting for a
+-                               * stripe, so we must do the range check again.
+-                               * Expansion could still move past after this
+-                               * test, but as we are holding a reference to
+-                               * 'sh', we know that if that happens,
+-                               *  STRIPE_EXPANDING will get set and the expansion
+-                               * won't proceed until we finish with the stripe.
+-                               */
+-                              int must_retry = 0;
+-                              spin_lock_irq(&conf->device_lock);
+-                              if (logical_sector <  conf->expand_progress &&
+-                                  disks == conf->previous_raid_disks)
+-                                      /* mismatch, need to try again */
+-                                      must_retry = 1;
+-                              spin_unlock_irq(&conf->device_lock);
+-                              if (must_retry) {
+-                                      release_stripe(sh);
+-                                      goto retry;
++      /* compute the block # */
++      sectors_per_stripe = STRIPE_SECTORS * data_disks;
++      sectors_per_block = stripes_per_chunk * sectors_per_stripe;
++
++      block = logical_sector & ~((sector_t)sectors_per_block - 1);
++      sector_div(block, sectors_per_block);
++
++repeat:
++      stripe = block * (sectors_per_block / data_disks);
++      b_sector = stripe * data_disks;
++      /* iterate through all stripes in this block,
++       * where block is a set of internal stripes
++       * which covers a chunk */
++
++      for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
++              r_sector = b_sector + (i * STRIPE_SECTORS);
++              sh = NULL;
++              /* iterate through all pages in the stripe */
++              for (j = 0; j < data_disks && sectors > 0; j++) {
++                      DEFINE_WAIT(w);
++
++                      if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
++                          r_sector >= last_sector) {
++                              r_sector += sectors_per_chunk;
++                              continue;
++                      }
++
++retry:
++                      prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
++                      new_sector = raid5_compute_sector(r_sector, disks,
++                                                      data_disks, &dd_idx,
++                                                      &pd_idx, conf);
++                      if (sh == NULL) {
++                              sh = get_active_stripe(conf, new_sector, disks, pd_idx,
++                                                      (bi->bi_rw&RWA_MASK));
++                              if (sh) {
++                                      /* we're handling the bio stripe by stripe, so when we find
++                                       * the raid layout has been changed, we have to redo the
++                                       * whole bio because we don't know which sectors in it have been
++                                       * done, and which have not. -jay */
++                                      if (raid5_redo_bio(conf, bi, disks, logical_sector))
++                                              goto redo_bio;
++
++                                      if (test_bit(STRIPE_EXPANDING, &sh->state)) {
++                                              /* Stripe is busy expanding or
++                                               * add failed due to overlap.  Flush everything
++                                               * and wait a while
++                                               */
++                                              release_stripe(sh);
++                                              sh = NULL;
++                                              raid5_unplug_device(mddev->queue);
++                                              schedule();
++                                              goto retry;
++                                      }
++                              } else {
++                                      /* cannot get stripe for read-ahead, just give-up */
++                                      finish_wait(&conf->wait_for_overlap, &w);
++                                      clear_bit(BIO_UPTODATE, &bi->bi_flags);
++                                      sectors = 0;
++                                      break;
+                               }
+                       }
++
+                       /* FIXME what if we get a false positive because these
+                        * are being updated.
+                        */
+-                      if (logical_sector >= mddev->suspend_lo &&
+-                          logical_sector < mddev->suspend_hi) {
++                      if (r_sector >= mddev->suspend_lo &&
++                          r_sector < mddev->suspend_hi) {
++                              handle_stripe(sh, NULL);
+                               release_stripe(sh);
++                              sh = NULL;
+                               schedule();
+                               goto retry;
+                       }
+-                      if (test_bit(STRIPE_EXPANDING, &sh->state) ||
+-                          !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
+-                              /* Stripe is busy expanding or
+-                               * add failed due to overlap.  Flush everything
+-                               * and wait a while
+-                               */
+-                              raid5_unplug_device(mddev->queue);
++                      if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
++                              handle_stripe(sh, NULL);
+                               release_stripe(sh);
++                              sh = NULL;
++                              raid5_unplug_device(mddev->queue);
+                               schedule();
+                               goto retry;
+                       }
+                       finish_wait(&conf->wait_for_overlap, &w);
++
++                      BUG_ON (new_sector != stripe);
++                      sectors -= STRIPE_SECTORS;
++                      if (bi->bi_sector > r_sector)
++                              sectors += bi->bi_sector - r_sector;
++                      if (r_sector + STRIPE_SECTORS > last_sector)
++                              sectors += r_sector + STRIPE_SECTORS - last_sector;
++                      r_sector += sectors_per_chunk;
++              }
++              if (sh) {
+                       handle_stripe(sh, NULL);
+                       release_stripe(sh);
+-              } else {
+-                      /* cannot get stripe for read-ahead, just give-up */
+-                      clear_bit(BIO_UPTODATE, &bi->bi_flags);
+-                      finish_wait(&conf->wait_for_overlap, &w);
+-                      break;
++                      sh = NULL;
+               }
+-                      
++              stripe += STRIPE_SECTORS;
+       }
++      block++;
++      if (sectors > 0)
++              goto repeat;
++
+       spin_lock_irq(&conf->device_lock);
+       remaining = --bi->bi_phys_segments;
+       spin_unlock_irq(&conf->device_lock);
+@@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq
+                       atomic_read(&conf->active_stripes),
+                       atomic_read(&conf->in_reqs_in_queue),
+                       atomic_read(&conf->out_reqs_in_queue));
++      seq_printf (seq, "\t\t%u expanding overlap\n",
++                      atomic_read(&conf->expanding_overlap));
+ #if RAID5_DEBUG
+       seq_printf (seq, "\n");
+       printall(seq, conf);
+diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
+--- linux-2.6.18-53.orig/include/linux/raid/raid5.h    2007-12-28 14:55:08.000000000 +0800
++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 18:09:37.000000000 +0800
+@@ -278,6 +278,7 @@ struct raid5_private_data {
+       atomic_t                bit_delayed;
+       atomic_t                in_reqs_in_queue;
+       atomic_t                out_reqs_in_queue;
++      atomic_t                expanding_overlap;
+ };
+ typedef struct raid5_private_data raid5_conf_t;
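
The rewritten make_request() walks a request in what the patch calls blocks: the set of stripes that covers one full chunk across all data disks, handled stripe by stripe so each stripe head is grabbed once per bio instead of once per page. A sketch of the geometry it computes, where the chunk size, page size and disk count are assumed values:

    /* Block/stripe geometry from the stripe-by-stripe make_request(). */
    #include <stdio.h>

    int main(void)
    {
            unsigned long stripe_size    = 4096;             /* STRIPE_SIZE == PAGE_SIZE */
            unsigned long stripe_sectors = stripe_size >> 9; /* STRIPE_SECTORS */
            unsigned long chunk_size     = 64 * 1024;        /* assumed 64 KiB chunks */
            unsigned long data_disks     = 7;                /* e.g. 8-disk RAID5 */

            unsigned long stripes_per_chunk  = chunk_size / stripe_size;
            unsigned long sectors_per_stripe = stripe_sectors * data_disks;
            unsigned long sectors_per_block  = stripes_per_chunk * sectors_per_stripe;

            printf("stripes/chunk=%lu sectors/stripe=%lu sectors/block=%lu (%lu KiB)\n",
                   stripes_per_chunk, sectors_per_stripe,
                   sectors_per_block, sectors_per_block / 2);
            return 0;
    }

With these numbers one block is 896 sectors, i.e. the 448 KiB of data in one chunk from each of the 7 data disks.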
diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch
new file mode 100644 (file)
index 0000000..fa92977
--- /dev/null
@@ -0,0 +1,446 @@
+diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
+--- linux-2.6.18-53.orig/drivers/md/raid5.c    2007-12-28 19:09:20.000000000 +0800
++++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:09:32.000000000 +0800
+@@ -633,6 +633,7 @@ static int raid5_end_read_request(struct
+               clear_buffer_uptodate(bh);
+       }
+ #endif
++      BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+       clear_bit(R5_LOCKED, &sh->dev[i].flags);
+       set_bit(STRIPE_HANDLE, &sh->state);
+       release_stripe(sh);
+@@ -671,6 +672,10 @@ static int raid5_end_write_request (stru
+       rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+       
++      if (test_bit(R5_Direct, &sh->dev[i].flags)) {
++              BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
++              sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
++      }
+       clear_bit(R5_LOCKED, &sh->dev[i].flags);
+       set_bit(STRIPE_HANDLE, &sh->state);
+       __release_stripe(conf, sh);
+@@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s
+       return r_sector;
+ }
++static struct page *zero_copy_data(struct bio *bio, sector_t sector)
++{
++      sector_t bi_sector = bio->bi_sector;
++      struct page *page = NULL;
++      struct bio_vec *bvl;
++      int i;
++      bio_for_each_segment(bvl, bio, i) {
++              if (sector == bi_sector)
++                      page = bio_iovec_idx(bio, i)->bv_page;
++              bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
++              if (bi_sector >= sector + STRIPE_SECTORS) {
++                      /* check if the stripe is covered by one page */
++                      if (page == bio_iovec_idx(bio, i)->bv_page &&
++                          PageConstant(page))
++                              return page;
++                      return NULL;
++              }
++      }
++      return NULL;
++}
+ /*
+  * Copy data between a page in the stripe cache, and one or more bion
+@@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip
+ {
+       raid5_conf_t *conf = sh->raid_conf;
+       int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
+-      void *ptr[MAX_XOR_BLOCKS];
++      void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
+       struct bio *chosen;
++      struct page *page;
+       PRINTK("compute_parity5, stripe %llu, method %d\n",
+               (unsigned long long)sh->sector, method);
+@@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip
+               count = 1;
+       }
+       
+-      for (i = disks; i--;)
+-              if (sh->dev[i].written) {
+-                      sector_t sector = sh->dev[i].sector;
+-                      struct bio *wbi = sh->dev[i].written;
+-                      while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+-                              copy_data(1, wbi, sh->dev[i].page, sector);
+-                              wbi = r5_next_bio(wbi, sector);
++      for (i = disks; i--;) {
++              struct r5dev *dev = &sh->dev[i];
++              struct bio *wbi = dev->written;
++              sector_t sector;
++
++              if (!wbi)
++                      continue;
++
++              sector = dev->sector;
++              set_bit(R5_LOCKED, &sh->dev[i].flags);
++              BUG_ON(test_bit(R5_Direct, &dev->flags));
++
++              /* check if it's covered by a single page
++                 and whole stripe is written at once.
++               * in this case we can avoid memcpy() */
++              if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
++                  test_bit(R5_Insync, &dev->flags)) {
++                      page = zero_copy_data(wbi, sector);
++                      if (page) {
++                              atomic_inc(&conf->writes_zcopy);
++                              dev->req.bi_io_vec[0].bv_page = page;
++                              set_bit(R5_Direct, &dev->flags);
++                              clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++                              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++                              continue;
+                       }
++              }
+-                      set_bit(R5_LOCKED, &sh->dev[i].flags);
+-                      set_bit(R5_UPTODATE, &sh->dev[i].flags);
++              /* do copy write */
++              atomic_inc(&conf->writes_copied);
++              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++              set_bit(R5_UPTODATE, &sh->dev[i].flags);
++              while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++                      copy_data(1, wbi, sh->dev[i].page, sector);
++                      wbi = r5_next_bio(wbi, sector);
+               }
++      }
++      h_ptr[0] = ptr[0];
+       switch(method) {
+       case RECONSTRUCT_WRITE:
+       case CHECK_PARITY:
+-              for (i=disks; i--;)
+-                      if (i != pd_idx) {
+-                              ptr[count++] = page_address(sh->dev[i].page);
+-                              check_xor();
++              for (i=disks; i--;) {
++                      if (i == pd_idx)
++                              continue;
++                      if (test_bit(R5_Direct, &sh->dev[i].flags))
++                              page = sh->dev[i].req.bi_io_vec[0].bv_page;
++                      else
++                              page = sh->dev[i].page;
++
++                      /* have to compute the parity immediately for
++                       * a highmem page. this can happen with zerocopy. -jay
++                       */
++                      if (PageHighMem(page)) {
++                              h_ptr[1] = kmap_atomic(page, KM_USER0);
++                              xor_block(2, STRIPE_SIZE, h_ptr);
++                              kunmap_atomic(page, KM_USER0);
++                      } else {
++                              ptr[count++] = page_address(page);
+                       }
++                      check_xor();
++              }
+               break;
+       case READ_MODIFY_WRITE:
+-              for (i = disks; i--;)
+-                      if (sh->dev[i].written) {
+-                              ptr[count++] = page_address(sh->dev[i].page);
+-                              check_xor();
++              for (i = disks; i--;) {
++                      if (!sh->dev[i].written)
++                              continue;
++                      if (test_bit(R5_Direct, &sh->dev[i].flags))
++                              page = sh->dev[i].req.bi_io_vec[0].bv_page;
++                      else
++                              page = sh->dev[i].page;
++
++                      /* have to compute the parity immediately for
++                       * a highmem page. this can happen with zerocopy. -jay
++                       */
++                      if (PageHighMem(page)) {
++                              h_ptr[1] = kmap_atomic(page, KM_USER0);
++                              xor_block(2, STRIPE_SIZE, h_ptr);
++                              kunmap_atomic(page, KM_USER0);
++                      } else {
++                              ptr[count++] = page_address(page);
+                       }
++                      check_xor();
++              }
+       }
+       if (count != 1)
+               xor_block(count, STRIPE_SIZE, ptr);
+@@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip
+       raid6_conf_t *conf = sh->raid_conf;
+       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+       struct bio *chosen;
++      struct page *page;
+       /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+       void *ptrs[disks];
+@@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip
+               BUG();          /* Not implemented yet */
+       }
+-      for (i = disks; i--;)
+-              if (sh->dev[i].written) {
+-                      sector_t sector = sh->dev[i].sector;
+-                      struct bio *wbi = sh->dev[i].written;
+-                      while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+-                              copy_data(1, wbi, sh->dev[i].page, sector);
+-                              wbi = r5_next_bio(wbi, sector);
++      for (i = disks; i--;) {
++              struct r5dev *dev = &sh->dev[i];
++              struct bio *wbi = dev->written;
++              sector_t sector;
++
++              if (!wbi)
++                      continue;
++
++              sector = sh->dev[i].sector;
++              set_bit(R5_LOCKED, &sh->dev[i].flags);
++              BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
++
++              /* check if it's covered by a single page
++               * and the whole stripe is written at once.
++               * in this case we can avoid memcpy() */
++              if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
++                  test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
++                      page = zero_copy_data(wbi, sector);
++                      /* we don't do zerocopy on a HighMem page. RAID6 tends
++                       * to prepare all of the pages' content to be accessed
++                       * before computing PQ parity. If we need to support HighMem
++                       * pages as well, we have to modify the gen_syndrome()
++                       * algorithm. -jay */
++                      if (page && !PageHighMem(page)) {
++                              atomic_inc(&conf->writes_zcopy);
++                              sh->dev[i].req.bi_io_vec[0].bv_page = page;
++                              set_bit(R5_Direct, &sh->dev[i].flags);
++                              clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++                              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++                              continue;
+                       }
++              }
+-                      set_bit(R5_LOCKED, &sh->dev[i].flags);
+-                      set_bit(R5_UPTODATE, &sh->dev[i].flags);
++              atomic_inc(&conf->writes_copied);
++              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++              set_bit(R5_UPTODATE, &sh->dev[i].flags);
++              while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++                      copy_data(1, wbi, sh->dev[i].page, sector);
++                      wbi = r5_next_bio(wbi, sector);
+               }
++      }
+ //    switch(method) {
+ //    case RECONSTRUCT_WRITE:
+@@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip
+               count = 0;
+               i = d0_idx;
+               do {
+-                      ptrs[count++] = page_address(sh->dev[i].page);
+-                      if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
++                      if (test_bit(R5_Direct, &sh->dev[i].flags))
++                              ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
++                      else
++                              ptrs[count++] = page_address(sh->dev[i].page);
++                      if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
++                          !test_bit(R5_Direct, &sh->dev[i].flags))
+                               printk("block %d/%d not uptodate on parity calc\n", i,count);
+                       i = raid6_next_disk(i, disks);
+               } while ( i != d0_idx );
+@@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe
+               if (sh->dev[i].written) {
+                   dev = &sh->dev[i];
+                   if (!test_bit(R5_LOCKED, &dev->flags) &&
+-                       test_bit(R5_UPTODATE, &dev->flags) ) {
++                       (test_bit(R5_UPTODATE, &dev->flags) ||
++                        test_bit(R5_Direct, &dev->flags)) ) {
+                       /* We can return any write requests */
+                           struct bio *wbi, *wbi2;
+                           int bitmap_end = 0;
+@@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe
+                           spin_lock_irq(&conf->device_lock);
+                           wbi = dev->written;
+                           dev->written = NULL;
++                          clear_bit(R5_Direct, &dev->flags);
+                           while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                                   wbi2 = r5_next_bio(wbi, dev->sector);
+                                   if (--wbi->bi_phys_segments == 0) {
+@@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe
+                       if (sh->dev[i].written) {
+                               dev = &sh->dev[i];
+                               if (!test_bit(R5_LOCKED, &dev->flags) &&
+-                                  test_bit(R5_UPTODATE, &dev->flags) ) {
++                                  (test_bit(R5_UPTODATE, &dev->flags) ||
++                                   test_bit(R5_Direct, &dev->flags)) ) {
+                                       /* We can return any write requests */
+                                       int bitmap_end = 0;
+                                       struct bio *wbi, *wbi2;
+@@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe
+                                       spin_lock_irq(&conf->device_lock);
+                                       wbi = dev->written;
+                                       dev->written = NULL;
++                                      clear_bit(R5_Direct, &dev->flags);
+                                       while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                                               wbi2 = r5_next_bio(wbi, dev->sector);
+                                               if (--wbi->bi_phys_segments == 0) {
+@@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev)
+       mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
+       mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
++      /* The raid5 device is now able to do zero-copy writes. */
++      mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
++
+       return 0;
+ abort:
+       if (conf) {
+@@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq
+                       atomic_read(&conf->handled_in_raid5d),
+                       atomic_read(&conf->out_of_stripes),
+                       atomic_read(&conf->handle_called));
+-      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
+                       atomic_read(&conf->reads_for_rmw),
+-                      atomic_read(&conf->reads_for_rcw));
++                      atomic_read(&conf->reads_for_rcw),
++                      atomic_read(&conf->writes_zcopy),
++                      atomic_read(&conf->writes_copied));
+       seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
+                       atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
+                       atomic_read(&conf->active_stripes),
+diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h
+--- linux-2.6.18-53.orig/include/linux/backing-dev.h   2007-12-28 14:49:26.000000000 +0800
++++ linux-2.6.18-53/include/linux/backing-dev.h        2007-12-28 19:09:32.000000000 +0800
+@@ -48,6 +48,7 @@ struct backing_dev_info {
+ #define BDI_CAP_READ_MAP      0x00000010      /* Can be mapped for reading */
+ #define BDI_CAP_WRITE_MAP     0x00000020      /* Can be mapped for writing */
+ #define BDI_CAP_EXEC_MAP      0x00000040      /* Can be mapped for execution */
++#define BDI_CAP_PAGE_CONSTANT_WRITE   0x00000080      /* Zcopy write - for raid5 */
+ #define BDI_CAP_VMFLAGS \
+       (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
+@@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
+ #define bdi_cap_account_dirty(bdi) \
+       (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
++#define bdi_cap_page_constant_write(bdi) \
++      ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
++
+ #define mapping_cap_writeback_dirty(mapping) \
+       bdi_cap_writeback_dirty((mapping)->backing_dev_info)
+ #define mapping_cap_account_dirty(mapping) \
+       bdi_cap_account_dirty((mapping)->backing_dev_info)
++#define mapping_cap_page_constant_write(mapping) \
++      bdi_cap_page_constant_write((mapping)->backing_dev_info)
++
++
+ #endif                /* _LINUX_BACKING_DEV_H */
+diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h
+--- linux-2.6.18-53.orig/include/linux/page-flags.h    2007-12-28 14:49:26.000000000 +0800
++++ linux-2.6.18-53/include/linux/page-flags.h 2007-12-28 19:09:32.000000000 +0800
+@@ -86,6 +86,7 @@
+ #define PG_reclaim            17      /* To be reclaimed asap */
+ #define PG_nosave_free                18      /* Free, should not be written */
+ #define PG_buddy              19      /* Page is free, on buddy lists */
++#define PG_constant           20      /* Page content must stay constant while set */
+ /* PG_owner_priv_1 users should have descriptive aliases */
+ #define PG_checked              PG_owner_priv_1 /* Used by some filesystems */
+@@ -252,6 +253,14 @@
+ struct page;  /* forward declaration */
++#define PageConstant(page)    test_bit(PG_constant, &(page)->flags)
++#define SetPageConstant(page)         set_bit(PG_constant, &(page)->flags)
++#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
++#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
++
++extern int set_page_constant(struct page *page);
++extern void clear_page_constant(struct page *);
++
+ int test_clear_page_dirty(struct page *page);
+ int test_clear_page_writeback(struct page *page);
+ int test_set_page_writeback(struct page *page);
+diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
+--- linux-2.6.18-53.orig/include/linux/raid/raid5.h    2007-12-28 18:55:24.000000000 +0800
++++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 19:09:32.000000000 +0800
+@@ -156,8 +156,9 @@ struct stripe_head {
+ #define       R5_Overlap      7       /* There is a pending overlapping request on this block */
+ #define       R5_ReadError    8       /* seen a read error here recently */
+ #define       R5_ReWrite      9       /* have tried to over-write the readerror */
+-
+ #define       R5_Expanded     10      /* This block now has post-expand data */
++#define       R5_Direct       11      /* Use the pages in the bio to do the write directly. */
++
+ /*
+  * Write method
+  */
+diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c
+--- linux-2.6.18-53.orig/mm/filemap.c  2007-12-28 14:49:26.000000000 +0800
++++ linux-2.6.18-53/mm/filemap.c       2007-12-28 19:09:32.000000000 +0800
+@@ -30,6 +30,7 @@
+ #include <linux/security.h>
+ #include <linux/syscalls.h>
+ #include <linux/cpuset.h>
++#include <linux/rmap.h>
+ #include "filemap.h"
+ #include "internal.h"
+@@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
+               if (!test_clear_page_writeback(page))
+                       BUG();
+       }
++      clear_page_constant(page);
+       smp_mb__after_clear_bit();
+       wake_up_page(page, PG_writeback);
+ }
+ EXPORT_SYMBOL(end_page_writeback);
++/* Mark a page as constant: `constant' means that any write to this page
++ * will be blocked until clear_page_constant() is called.
++ * The page lock must be held.
++ */
++int set_page_constant(struct page *page)
++{
++      BUG_ON(!PageLocked(page));
++
++      /* If it's an anonymous page that hasn't been added to the swap
++       * cache, return directly because we have no way to swap this page.
++       */
++      if (page_mapping(page) == NULL)
++              return SWAP_FAIL;
++
++      BUG_ON(!PageUptodate(page));
++
++      /* We have to clear the page's uptodate flag before trying to
++       * remove it from the user's page tables; otherwise the page may
++       * be reinstalled by a page access that happens between
++       * try_to_unmap() and ClearPageUptodate(). -jay
++       */
++      ClearPageUptodate(page);
++      if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
++              SetPageUptodate(page);
++              return SWAP_FAIL;
++      }
++      SetPageConstant(page);
++      return SWAP_SUCCESS;
++}
++
++void clear_page_constant(struct page *page)
++{
++      if (PageConstant(page)) {
++              BUG_ON(!PageLocked(page));
++              BUG_ON(PageUptodate(page));
++              ClearPageConstant(page);
++              SetPageUptodate(page);
++              unlock_page(page);
++      }
++}
++EXPORT_SYMBOL(set_page_constant);
++EXPORT_SYMBOL(clear_page_constant);
++
+ /**
+  * __lock_page - get a lock on the page, assuming we need to sleep to get it
+  * @page: the page to lock
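
The two functions above define a small protocol: the submitter locks the
page, marks it constant (set_page_constant() unmaps it from user space so
nothing can dirty it mid-flight), and write completion restores it via
end_page_writeback() -> clear_page_constant(). A minimal caller sketch
follows; submit_zcopy_write() and submit_buffered_write() are hypothetical
placeholders for the Lustre client I/O path that actually drives this
protocol, which is not part of this patch:

    #include <linux/mm.h>
    #include <linux/pagemap.h>
    #include <linux/backing-dev.h>
    #include <linux/rmap.h>         /* SWAP_SUCCESS, SWAP_FAIL */

    extern int submit_zcopy_write(struct page *page);    /* hypothetical */
    extern int submit_buffered_write(struct page *page); /* hypothetical */

    static int writepage_zcopy(struct page *page, struct address_space *mapping)
    {
            BUG_ON(!PageLocked(page));

            /* Fall back to the copying path unless the backing device
             * advertises BDI_CAP_PAGE_CONSTANT_WRITE and the page can be
             * unmapped from every user page table. */
            if (!mapping_cap_page_constant_write(mapping) ||
                set_page_constant(page) != SWAP_SUCCESS)
                    return submit_buffered_write(page);

            /* The page now stays constant (and !uptodate) until the write
             * completes and end_page_writeback() -> clear_page_constant()
             * restores the uptodate bit and unlocks the page. */
            return submit_zcopy_write(page);
    }
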
index 421296f..1f2cb66 100644 (file)
@@ -10,3 +10,10 @@ export-show_task-2.6.18-vanilla.patch
 sd_iostats-2.6-rhel5.patch
 export_symbol_numa-2.6-fc5.patch
 jbd-stats-2.6-rhel5.patch
+raid5-stats-rhel5.patch
+raid5-configurable-cachesize-rhel5.patch
+raid5-large-io-rhel5.patch
+raid5-stripe-by-stripe-handling-rhel5.patch
+raid5-merge-ios-rhel5.patch
+raid5-zerocopy-rhel5.patch
+md-rebuild-policy.patch
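
Taken together, the raid5 side of the zero-copy patch uses the bio's own
page for the disk write (marking it R5_Direct) only when a single bio
fully overwrites the stripe page and that page is not in HighMem;
otherwise it counts a copied write and falls back to copy_data(). A
condensed sketch of that eligibility test, assuming a one-vector bio
(zcopy_page() is a hypothetical name; the patch open-codes these checks
inside compute_parity5()/compute_parity6()):

    #include <linux/mm.h>
    #include <linux/bio.h>
    #include <linux/raid/raid5.h>

    /* Return the bio's page if it may be written to disk directly
     * (R5_Direct), or NULL to force the usual copy into sh->dev[i].page. */
    static struct page *zcopy_page(struct bio *bio, sector_t dev_sector)
    {
            struct page *page;

            /* A single bio vector that overwrites the whole stripe page. */
            if (bio->bi_vcnt != 1 || bio->bi_size != STRIPE_SIZE ||
                bio->bi_sector != dev_sector)
                    return NULL;

            page = bio_iovec(bio)->bv_page;

            /* gen_syndrome() accesses parity source pages through
             * page_address(), so a HighMem page (no permanent kernel
             * mapping) cannot be used directly. */
            if (PageHighMem(page))
                    return NULL;

            return page;
    }

On success, the R5_Direct flag is what later lets handle_stripe5() and
handle_stripe6() complete the write bios without the stripe cache copy.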