b=10896
author jxiong <jxiong>
Thu, 10 Jan 2008 06:19:02 +0000 (06:19 +0000)
committer jxiong <jxiong>
Thu, 10 Jan 2008 06:19:02 +0000 (06:19 +0000)
r=alex,adilger

porting the raid5 improvements to raid6.

lustre/kernel_patches/patches/raid6-configurable-cachesize.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid6-large-io.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid6-merge-ios.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid6-stats.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch [new file with mode: 0644]
lustre/kernel_patches/patches/raid6-zerocopy.patch [new file with mode: 0644]
lustre/kernel_patches/series/2.6-rhel4.series

diff --git a/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch b/lustre/kernel_patches/patches/raid6-configurable-cachesize.patch
new file mode 100644 (file)
index 0000000..fa28bc3
--- /dev/null
@@ -0,0 +1,45 @@
+--- linux-2.6.9.orig/drivers/md/raid6main.c    2006-09-07 23:10:43.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2006-09-07 23:11:25.000000000 +0800
+@@ -33,7 +33,7 @@
+  * Stripe cache
+  */
+-#define NR_STRIPES            256
++static int raid6_nr_stripes = 256 * 8;
+ #define STRIPE_SIZE           PAGE_SIZE
+ #define STRIPE_SHIFT          (PAGE_SHIFT - 9)
+ #define STRIPE_SECTORS                (STRIPE_SIZE>>9)
+@@ -111,7 +111,7 @@ static inline void __release_stripe(raid
+                       list_add_tail(&sh->lru, &conf->inactive_list);
+                       atomic_dec(&conf->active_stripes);
+                       if (!conf->inactive_blocked ||
+-                          atomic_read(&conf->active_stripes) < (NR_STRIPES*3/4))
++                          atomic_read(&conf->active_stripes) < (raid6_nr_stripes*3/4))
+                               wake_up(&conf->wait_for_stripe);
+               }
+       }
+@@ -274,7 +274,7 @@ static struct stripe_head *get_active_st
+                               conf->inactive_blocked = 1;
+                               wait_event_lock_irq(conf->wait_for_stripe,
+                                                   !list_empty(&conf->inactive_list) &&
+-                                                  (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
++                                                  (atomic_read(&conf->active_stripes) < (raid6_nr_stripes *3/4)
+                                                    || !conf->inactive_blocked),
+                                                   conf->device_lock,
+                                                   unplug_slaves(conf->mddev);
+@@ -1805,7 +1805,7 @@ static int run (mddev_t *mddev)
+       conf->chunk_size = mddev->chunk_size;
+       conf->level = mddev->level;
+       conf->algorithm = mddev->layout;
+-      conf->max_nr_stripes = NR_STRIPES;
++      conf->max_nr_stripes = raid6_nr_stripes;
+       /* device size must be a multiple of chunk size */
+       mddev->size &= ~(mddev->chunk_size/1024 -1);
+@@ -2139,5 +2139,6 @@ static void raid6_exit (void)
+ module_init(raid6_init);
+ module_exit(raid6_exit);
++module_param(raid6_nr_stripes, int, 0644);
+ MODULE_LICENSE("GPL");
+ MODULE_ALIAS("md-personality-8"); /* RAID6 */
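
The patch above replaces the compile-time NR_STRIPES constant with a load-time module parameter, so the stripe cache can be sized when the module is loaded (e.g. something like modprobe raid6 raid6_nr_stripes=4096, assuming the module is named raid6). A minimal, self-contained sketch of that module_param pattern, using illustrative names that are not part of the Lustre patch, looks like:

/* Standalone sketch of the module_param pattern used above; the module,
 * variable name and message below are illustrative, not the Lustre patch. */
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static int example_nr_stripes = 256 * 8;      /* same default as the patch */
module_param(example_nr_stripes, int, 0644);  /* exposed under /sys/module/.../parameters */
MODULE_PARM_DESC(example_nr_stripes, "number of stripe cache entries");

static int __init example_init(void)
{
	printk(KERN_INFO "stripe cache sized to %d entries\n", example_nr_stripes);
	return 0;
}

static void __exit example_exit(void)
{
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");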
diff --git a/lustre/kernel_patches/patches/raid6-large-io.patch b/lustre/kernel_patches/patches/raid6-large-io.patch
new file mode 100644 (file)
index 0000000..85a7f43
--- /dev/null
@@ -0,0 +1,14 @@
+--- linux-2.6.9.orig/drivers/md/raid6main.c    2006-09-07 23:12:09.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2006-09-07 23:12:44.000000000 +0800
+@@ -1775,6 +1775,11 @@ static int run (mddev_t *mddev)
+       mddev->queue->unplug_fn = raid6_unplug_device;
+       mddev->queue->issue_flush_fn = raid6_issue_flush;
++      /* in order to support large I/Os */
++      blk_queue_max_sectors(mddev->queue, mddev->chunk_size * mddev->raid_disks >> 9);
++      mddev->queue->max_phys_segments = mddev->chunk_size * mddev->raid_disks >> PAGE_SHIFT;
++      mddev->queue->max_hw_segments = mddev->chunk_size * mddev->raid_disks >> PAGE_SHIFT;
++
+       PRINTK("raid6: run(%s) called.\n", mdname(mddev));
+       ITERATE_RDEV(mddev,rdev,tmp) {
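
raid6-large-io.patch raises the request queue limits so a single bio can span a full stripe width (chunk_size * raid_disks bytes) instead of the small defaults. The arithmetic, worked for an assumed 64 KiB chunk on a 6-disk array (these numbers are only an example, not values from the patch), is:

/* Worked example of the sizing math in raid6-large-io.patch; the chunk
 * size and disk count are assumptions. */
#include <stdio.h>

int main(void)
{
	unsigned int chunk_size  = 64 * 1024;                       /* bytes per chunk (assumed) */
	unsigned int raid_disks  = 6;                               /* array width (assumed) */
	unsigned int max_sectors = chunk_size * raid_disks >> 9;    /* 512-byte sectors */
	unsigned int max_segs    = chunk_size * raid_disks >> 12;   /* pages, PAGE_SHIFT = 12 */

	/* prints: max_sectors = 768 (384 KiB), max segments = 96 */
	printf("max_sectors = %u (%u KiB), max segments = %u\n",
	       max_sectors, max_sectors / 2, max_segs);
	return 0;
}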
diff --git a/lustre/kernel_patches/patches/raid6-merge-ios.patch b/lustre/kernel_patches/patches/raid6-merge-ios.patch
new file mode 100644 (file)
index 0000000..e245ba7
--- /dev/null
@@ -0,0 +1,126 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c    2008-01-10 13:51:32.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:52:20.000000000 +0800
+@@ -956,6 +956,26 @@ static void add_stripe_bio (struct strip
+       }
+ }
++/*
++ * The whole idea is to collect all bio's and then issue them
++ * disk by disk to assist merging a bit -bzzz
++ */
++static void raid6_flush_bios(raid6_conf_t *conf, struct bio *bios[], int raid_disks)
++{
++      struct bio *bio, *nbio;
++      int i;
++ 
++      for (i = 0; i < raid_disks; i++) {
++              bio = bios[i];
++              while (bio) {
++                      nbio = bio->bi_next;
++                      bio->bi_next = NULL;
++                      generic_make_request(bio);
++                      bio = nbio;
++              }
++              bios[i] = NULL;
++      }
++}
+ /*
+  * handle_stripe - do things to a stripe.
+@@ -975,7 +995,7 @@ static void add_stripe_bio (struct strip
+  *
+  */
+-static void handle_stripe(struct stripe_head *sh)
++static void handle_stripe(struct stripe_head *sh, struct bio *bios[])
+ {
+       raid6_conf_t *conf = sh->raid_conf;
+       int disks = conf->raid_disks;
+@@ -1452,7 +1472,11 @@ static void handle_stripe(struct stripe_
+                       bi->bi_size = STRIPE_SIZE;
+                       bi->bi_next = NULL;
+                       atomic_inc(&conf->out_reqs_in_queue);
+-                      generic_make_request(bi);
++                      if(bios) {
++                              bi->bi_next = bios[i];
++                              bios[i] = bi;
++                      } else 
++                              generic_make_request(bi);
+               } else {
+                       PRINTK("skip op %ld on disc %d for sector %llu\n",
+                               bi->bi_rw, i, (unsigned long long)sh->sector);
+@@ -1575,6 +1599,7 @@ static int make_request (request_queue_t
+       int sectors_per_chunk;
+       int stripes_per_chunk, sectors_per_block;
+       int sectors_per_stripe;
++      struct bio *bios[MD_SB_DISKS];
+       int i, j;
+       atomic_inc(&conf->in_reqs_in_queue);
+@@ -1611,6 +1636,7 @@ static int make_request (request_queue_t
+       sector_div(block, sectors_per_block);
+       sectors = bi->bi_size >> 9;
+  
++      memset(&bios, 0, sizeof(bios));
+  repeat:
+       stripe = block * (sectors_per_block / data_disks);
+       b_sector = stripe * data_disks;
+@@ -1630,9 +1656,17 @@ static int make_request (request_queue_t
+                       new_sector = raid6_compute_sector(r_sector, raid_disks,
+                                                       data_disks, &dd_idx, 
+                                                       &pd_idx, conf);
+-                      if (sh == NULL)
+-                              sh = get_active_stripe(conf, new_sector, pd_idx,
+-                                                      (bi->bi_rw&RWA_MASK));
++                      if (sh == NULL) {
++                              /* first, try to get stripe w/o blocking
++                               * if we can't, then it's time to submit
++                               * all collected bio's in order to free
++                               * some space in the cache -bzzz */
++                              sh = get_active_stripe(conf, new_sector, pd_idx, 1);
++                              if (!sh && !(bi->bi_rw&RWA_MASK)) {
++                                      raid6_flush_bios(conf, bios, raid_disks);
++                                      sh = get_active_stripe(conf, new_sector, pd_idx, 0);
++                              }
++                      }
+                       if (sh) {
+                               add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
+                       } else {
+@@ -1653,7 +1687,7 @@ static int make_request (request_queue_t
+               if (sh) {
+                       raid6_plug_device(conf);
+-                      handle_stripe(sh);
++                      handle_stripe(sh, bios);
+                       release_stripe(sh);
+                       sh = NULL;
+               }
+@@ -1664,6 +1698,9 @@ static int make_request (request_queue_t
+       if(sectors > 0)
+               goto repeat;
++      /* now flush all bio's */
++      raid6_flush_bios(conf, bios, raid_disks);
++
+       spin_lock_irq(&conf->device_lock);
+       if (--bi->bi_phys_segments == 0) {
+               int bytes = bi->bi_size;
+@@ -1719,7 +1756,7 @@ static int sync_request (mddev_t *mddev,
+       clear_bit(STRIPE_INSYNC, &sh->state);
+       spin_unlock(&sh->lock);
+-      handle_stripe(sh);
++      handle_stripe(sh, NULL);
+       release_stripe(sh);
+       return STRIPE_SECTORS;
+@@ -1769,7 +1806,7 @@ static void raid6d (mddev_t *mddev)
+               handled++;
+               atomic_inc(&conf->handled_in_raid5d);
+-              handle_stripe(sh);
++              handle_stripe(sh, NULL);
+               release_stripe(sh);
+               spin_lock_irq(&conf->device_lock);
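
The core of raid6-merge-ios.patch is raid6_flush_bios() plus the extra bios[] argument to handle_stripe(): writes generated while handling a stripe are queued per target disk and submitted only after the whole request has been walked, so consecutive submissions to the same disk can be merged by the layers below. A toy userspace rendering of that collect-then-flush idea (struct req stands in for struct bio; none of this is kernel code) is:

/* Userspace sketch of the per-disk collect-then-flush scheme; struct req
 * and the disk count are stand-ins, not kernel types. */
#include <stddef.h>
#include <stdio.h>

#define NR_DISKS 6

struct req {
	int disk;
	long sector;
	struct req *next;
};

/* queue a request on its disk's list instead of submitting it immediately */
static void collect(struct req *lists[], struct req *r)
{
	r->next = lists[r->disk];
	lists[r->disk] = r;
}

/* later, walk disk by disk so submissions to one device are adjacent */
static void flush(struct req *lists[])
{
	int i;

	for (i = 0; i < NR_DISKS; i++) {
		struct req *r = lists[i];

		while (r) {
			struct req *next = r->next;

			r->next = NULL;
			printf("submit disk %d sector %ld\n", r->disk, r->sector);
			r = next;
		}
		lists[i] = NULL;
	}
}

int main(void)
{
	struct req a = { 0, 8 }, b = { 1, 8 }, c = { 0, 16 };
	struct req *lists[NR_DISKS] = { NULL };

	collect(lists, &a);
	collect(lists, &b);
	collect(lists, &c);
	flush(lists);	/* the two disk-0 requests come out back to back */
	return 0;
}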
diff --git a/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch b/lustre/kernel_patches/patches/raid6-serialize-ovelapping-reqs.patch
new file mode 100644 (file)
index 0000000..5bc0a3e
--- /dev/null
@@ -0,0 +1,150 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c    2008-01-10 13:55:37.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:55:56.000000000 +0800
+@@ -749,6 +749,10 @@ static void compute_parity(struct stripe
+                       if ( i != pd_idx && i != qd_idx && sh->dev[i].towrite ) {
+                               chosen = sh->dev[i].towrite;
+                               sh->dev[i].towrite = NULL;
++
++                              if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++                                      wake_up(&conf->wait_for_overlap);
++
+                               if (sh->dev[i].written) BUG();
+                               sh->dev[i].written = chosen;
+                       }
+@@ -907,7 +911,7 @@ static void compute_block_2(struct strip
+  * toread/towrite point to the first in a chain.
+  * The bi_next chain must be in order.
+  */
+-static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
++static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+ {
+       struct bio **bip;
+       raid6_conf_t *conf = sh->raid_conf;
+@@ -924,10 +928,13 @@ static void add_stripe_bio (struct strip
+       else
+               bip = &sh->dev[dd_idx].toread;
+       while (*bip && (*bip)->bi_sector < bi->bi_sector) {
+-              BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
++              if((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
++                      goto overlap;
+               bip = & (*bip)->bi_next;
+       }
+-/* FIXME do I need to worry about overlapping bion */
++      if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
++              goto overlap;
++
+       if (*bip && bi->bi_next && (*bip) != bi->bi_next)
+               BUG();
+       if (*bip)
+@@ -954,6 +961,14 @@ static void add_stripe_bio (struct strip
+               if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
+                       set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+       }
++
++      return 1;
++
++overlap:
++      set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
++      spin_unlock_irq(&conf->device_lock);
++      spin_unlock(&sh->lock);
++      return 0;
+ }
+ /*
+@@ -1038,6 +1053,9 @@ static void handle_stripe(struct stripe_
+                       spin_lock_irq(&conf->device_lock);
+                       rbi = dev->toread;
+                       dev->toread = NULL;
++
++                      if (test_and_clear_bit(R5_Overlap, &dev->flags))
++                              wake_up(&conf->wait_for_overlap);
+                       spin_unlock_irq(&conf->device_lock);
+                       while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                               copy_data(0, rbi, dev->page, dev->sector);
+@@ -1087,6 +1105,9 @@ static void handle_stripe(struct stripe_
+                       sh->dev[i].towrite = NULL;
+                       if (bi) to_write--;
++                      if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++                              wake_up(&conf->wait_for_overlap);
++
+                       while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+                               struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+                               clear_bit(BIO_UPTODATE, &bi->bi_flags);
+@@ -1115,6 +1136,8 @@ static void handle_stripe(struct stripe_
+                       if (!test_bit(R5_Insync, &sh->dev[i].flags)) {
+                               bi = sh->dev[i].toread;
+                               sh->dev[i].toread = NULL;
++                              if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
++                                      wake_up(&conf->wait_for_overlap);
+                               if (bi) to_read--;
+                               while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
+                                       struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
+@@ -1648,6 +1671,8 @@ static int make_request (request_queue_t
+               sh = NULL;
+               /* iterrate through all pages in the stripe */
+               for (j = 0; j < data_disks && sectors > 0; j++) {
++                      DEFINE_WAIT(w);
++
+                       if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
+                           r_sector >= last_sector) {
+                               r_sector += sectors_per_chunk;
+@@ -1656,6 +1681,9 @@ static int make_request (request_queue_t
+                       new_sector = raid6_compute_sector(r_sector, raid_disks,
+                                                       data_disks, &dd_idx, 
+                                                       &pd_idx, conf);
++
++retry:
++                      prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
+                       if (sh == NULL) {
+                               /* first, try to get stripe w/o blocking
+                                * if we can't, then it's time to submit
+@@ -1668,10 +1696,18 @@ static int make_request (request_queue_t
+                               }
+                       }
+                       if (sh) {
+-                              add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
++                              if(!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
++                                      /* Failed to be added due to overlapped. */
++                                      raid6_unplug_device(mddev->queue);
++                                      release_stripe(sh);
++                                      schedule();
++                                      goto retry;
++                              }
++                              finish_wait(&conf->wait_for_overlap, &w);
+                       } else {
+                               /* cannot get stripe for read-ahead, just give-up */
+                               clear_bit(BIO_UPTODATE, &bi->bi_flags);
++                              finish_wait(&conf->wait_for_overlap, &w);
+                               sectors = 0;
+                               break;
+                       }
+@@ -1847,6 +1883,7 @@ static int run (mddev_t *mddev)
+       conf->device_lock = SPIN_LOCK_UNLOCKED;
+       init_waitqueue_head(&conf->wait_for_stripe);
++      init_waitqueue_head(&conf->wait_for_overlap);
+       INIT_LIST_HEAD(&conf->handle_list);
+       INIT_LIST_HEAD(&conf->delayed_list);
+       INIT_LIST_HEAD(&conf->inactive_list);
+diff -pur linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
+--- linux-2.6.9.orig/include/linux/raid/raid5.h        2008-01-10 13:46:05.000000000 +0800
++++ linux-2.6.9/include/linux/raid/raid5.h     2008-01-10 13:55:56.000000000 +0800
+@@ -154,6 +154,8 @@ struct stripe_head {
+ #define       R5_Wantwrite    5
+ #define       R5_Syncio       6       /* this io need to be accounted as resync io */
+ #define       R5_Direct       7       /* use page from passed bio to avoid memcpy */
++#define       R5_Overlap      8       /* There is a pending overlapping request 
++                                       * on this block */
+ /*
+  * Write method
+@@ -221,6 +223,7 @@ struct raid5_private_data {
+       atomic_t                active_stripes;
+       struct list_head        inactive_list;
+       wait_queue_head_t       wait_for_stripe;
++      wait_queue_head_t       wait_for_overlap;
+       int                     inactive_blocked;       /* release of inactive stripes blocked,
+                                                        * waiting for 25% to be free
+                                                        */        
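
raid6-serialize-ovelapping-reqs.patch turns the old BUG_ON() for overlapping bios into a wait-and-retry: add_stripe_bio() now returns 0 and sets R5_Overlap, and make_request() sleeps on wait_for_overlap until the earlier request drains, then retries. The same pattern, sketched with pthreads and a single boolean standing in for the overlap test (this is an analogy, not kernel code), is:

/* Pthreads sketch of the wait-and-retry-on-overlap pattern; range_busy is
 * a crude stand-in for "the new bio overlaps one already queued". */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_for_overlap = PTHREAD_COND_INITIALIZER;
static bool range_busy;

/* analogue of add_stripe_bio(): refuse instead of BUG() on overlap */
static bool try_add(void)
{
	if (range_busy)
		return false;
	range_busy = true;
	return true;
}

static void *submit(void *arg)
{
	long id = (long)arg;

	pthread_mutex_lock(&lock);
	while (!try_add())			/* overlap: sleep, then retry */
		pthread_cond_wait(&wait_for_overlap, &lock);
	pthread_mutex_unlock(&lock);

	printf("request %ld queued\n", id);	/* ... I/O would happen here ... */

	pthread_mutex_lock(&lock);
	range_busy = false;			/* analogue of clearing R5_Overlap */
	pthread_cond_broadcast(&wait_for_overlap);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, submit, (void *)1L);
	pthread_create(&t2, NULL, submit, (void *)2L);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}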
diff --git a/lustre/kernel_patches/patches/raid6-stats.patch b/lustre/kernel_patches/patches/raid6-stats.patch
new file mode 100644 (file)
index 0000000..c173a08
--- /dev/null
@@ -0,0 +1,169 @@
+diff -pur linux-2.6.9-55.0.9.orig/drivers/md/raid6main.c linux-2.6.9-55.0.9/drivers/md/raid6main.c
+--- linux-2.6.9-55.0.9.orig/drivers/md/raid6main.c     2007-09-28 17:53:55.000000000 +0800
++++ linux-2.6.9-55.0.9/drivers/md/raid6main.c  2007-12-13 20:19:11.000000000 +0800
+@@ -96,9 +96,10 @@ static inline void __release_stripe(raid
+               if (atomic_read(&conf->active_stripes)==0)
+                       BUG();
+               if (test_bit(STRIPE_HANDLE, &sh->state)) {
+-                      if (test_bit(STRIPE_DELAYED, &sh->state))
++                      if (test_bit(STRIPE_DELAYED, &sh->state)) {
+                               list_add_tail(&sh->lru, &conf->delayed_list);
+-                      else
++                              atomic_inc(&conf->delayed);
++                      } else
+                               list_add_tail(&sh->lru, &conf->handle_list);
+                       md_wakeup_thread(conf->mddev->thread);
+               } else {
+@@ -269,6 +270,7 @@ static struct stripe_head *get_active_st
+                       if (noblock && sh == NULL)
+                               break;
+                       if (!sh) {
++                              atomic_inc(&conf->out_of_stripes);
+                               conf->inactive_blocked = 1;
+                               wait_event_lock_irq(conf->wait_for_stripe,
+                                                   !list_empty(&conf->inactive_list) &&
+@@ -290,6 +292,9 @@ static struct stripe_head *get_active_st
+                               if (list_empty(&sh->lru))
+                                       BUG();
+                               list_del_init(&sh->lru);
++                              if (test_bit(STRIPE_DELAYED, &sh->state))
++                                      atomic_dec(&conf->delayed);
++
+                       }
+               }
+       } while (sh == NULL);
+@@ -368,6 +373,8 @@ static int raid6_end_read_request (struc
+       if (bi->bi_size)
+               return 1;
++      atomic_dec(&conf->out_reqs_in_queue);
++
+       for (i=0 ; i<disks; i++)
+               if (bi == &sh->dev[i].req)
+                       break;
+@@ -445,6 +452,8 @@ static int raid6_end_write_request (stru
+               if (bi == &sh->dev[i].req)
+                       break;
++      atomic_dec(&conf->out_reqs_in_queue);
++
+       PRINTK("end_write_request %llu/%d, count %d, uptodate: %d.\n",
+               (unsigned long long)sh->sector, i, atomic_read(&sh->count),
+               uptodate);
+@@ -989,6 +998,7 @@ static void handle_stripe(struct stripe_
+       spin_lock(&sh->lock);
+       clear_bit(STRIPE_HANDLE, &sh->state);
+       clear_bit(STRIPE_DELAYED, &sh->state);
++      atomic_inc(&conf->handle_called);
+       syncing = test_bit(STRIPE_SYNCING, &sh->state);
+       /* Now to look around and see what can be done */
+@@ -1257,6 +1267,7 @@ static void handle_stripe(struct stripe_
+                                               set_bit(R5_LOCKED, &dev->flags);
+                                               set_bit(R5_Wantread, &dev->flags);
+                                               locked++;
++                                              atomic_inc(&conf->reads_for_rcw);
+                                       } else {
+                                               PRINTK("Request delayed stripe %llu block %d for Reconstruct\n",
+                                                      (unsigned long long)sh->sector, i);
+@@ -1390,6 +1401,7 @@ static void handle_stripe(struct stripe_
+               bi->bi_next = NULL;
+               bi->bi_size = 0;
+               bi->bi_end_io(bi, bytes, 0);
++              atomic_dec(&conf->in_reqs_in_queue);
+       }
+       for (i=disks; i-- ;) {
+               int rw;
+@@ -1405,10 +1417,13 @@ static void handle_stripe(struct stripe_
+               bi = &sh->dev[i].req;
+               bi->bi_rw = rw;
+-              if (rw)
++              if (rw) {
++                      atomic_inc(&conf->writes_out);
+                       bi->bi_end_io = raid6_end_write_request;
+-              else
++              } else {
++                      atomic_inc(&conf->reads_out);
+                       bi->bi_end_io = raid6_end_read_request;
++              }
+               spin_lock_irq(&conf->device_lock);
+               rdev = conf->disks[i].rdev;
+@@ -1436,12 +1451,14 @@ static void handle_stripe(struct stripe_
+                       bi->bi_io_vec[0].bv_offset = 0;
+                       bi->bi_size = STRIPE_SIZE;
+                       bi->bi_next = NULL;
++                      atomic_inc(&conf->out_reqs_in_queue);
+                       generic_make_request(bi);
+               } else {
+                       PRINTK("skip op %ld on disc %d for sector %llu\n",
+                               bi->bi_rw, i, (unsigned long long)sh->sector);
+                       clear_bit(R5_LOCKED, &sh->dev[i].flags);
+                       set_bit(STRIPE_HANDLE, &sh->state);
++                      atomic_dec(&conf->delayed);
+               }
+       }
+ }
+@@ -1555,6 +1572,8 @@ static int make_request (request_queue_t
+       sector_t logical_sector, last_sector;
+       struct stripe_head *sh;
++      atomic_inc(&conf->in_reqs_in_queue);
++
+       if (unlikely(bio_barrier(bi))) {
+               bio_endio(bi, bi->bi_size, -EOPNOTSUPP);
+               return 0;
+@@ -1563,9 +1582,11 @@ static int make_request (request_queue_t
+       if (bio_data_dir(bi)==WRITE) {
+               disk_stat_inc(mddev->gendisk, writes);
+               disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
++              atomic_inc(&conf->writes_in);
+       } else {
+               disk_stat_inc(mddev->gendisk, reads);
+               disk_stat_add(mddev->gendisk, read_sectors, bio_sectors(bi));
++              atomic_inc(&conf->reads_in);
+       }
+       logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
+@@ -1605,6 +1626,7 @@ static int make_request (request_queue_t
+               if ( bio_data_dir(bi) == WRITE )
+                       md_write_end(mddev);
++              atomic_dec(&conf->in_reqs_in_queue);
+               bi->bi_size = 0;
+               bi->bi_end_io(bi, bytes, 0);
+       }
+@@ -1701,6 +1723,8 @@ static void raid6d (mddev_t *mddev)
+               spin_unlock_irq(&conf->device_lock);
+               handled++;
++
++              atomic_inc(&conf->handled_in_raid5d);
+               handle_stripe(sh);
+               release_stripe(sh);
+@@ -1940,6 +1964,23 @@ static void status (struct seq_file *seq
+                           conf->disks[i].rdev &&
+                           conf->disks[i].rdev->in_sync ? "U" : "_");
+       seq_printf (seq, "]");
++ 
++      seq_printf (seq, "\n\t\tin: %u reads, %u writes; out: %u reads, %u writes",
++              atomic_read(&conf->reads_in), atomic_read(&conf->writes_in),
++              atomic_read(&conf->reads_out), atomic_read(&conf->writes_out));
++      seq_printf (seq, "\n\t\t%u in raid5d, %u out of stripes, %u handle called",
++              atomic_read(&conf->handled_in_raid5d),
++              atomic_read(&conf->out_of_stripes),
++              atomic_read(&conf->handle_called));
++      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++              atomic_read(&conf->reads_for_rmw),
++              atomic_read(&conf->reads_for_rcw));
++      seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
++              atomic_read(&conf->delayed),
++              atomic_read(&conf->active_stripes),
++              atomic_read(&conf->in_reqs_in_queue),
++              atomic_read(&conf->out_reqs_in_queue));
++
+ #if RAID6_DUMPSTATE
+       seq_printf (seq, "\n");
+       printall(seq, conf);
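
raid6-stats.patch only adds accounting: atomic counters bumped in the request and stripe-handling paths, then printed from status() so they show up in /proc/mdstat. A tiny userspace equivalent of that counter-and-report pattern (the counter names below are arbitrary) would be:

/* Sketch of the counter scheme: bump atomics on events, format them later. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint reads_in, writes_in, out_of_stripes;

static void on_read(void)  { atomic_fetch_add(&reads_in, 1); }
static void on_write(void) { atomic_fetch_add(&writes_in, 1); }

static void print_status(void)
{
	printf("in: %u reads, %u writes; %u out of stripes\n",
	       atomic_load(&reads_in), atomic_load(&writes_in),
	       atomic_load(&out_of_stripes));
}

int main(void)
{
	on_read();
	on_write();
	on_read();
	print_status();		/* in: 2 reads, 1 writes; 0 out of stripes */
	return 0;
}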
diff --git a/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch b/lustre/kernel_patches/patches/raid6-stripe-by-stripe-handling.patch
new file mode 100644 (file)
index 0000000..d29a6c3
--- /dev/null
@@ -0,0 +1,100 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c    2008-01-10 13:47:18.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 13:49:06.000000000 +0800
+@@ -1571,6 +1571,11 @@ static int make_request (request_queue_t
+       sector_t new_sector;
+       sector_t logical_sector, last_sector;
+       struct stripe_head *sh;
++      sector_t stripe, sectors, block, r_sector, b_sector;
++      int sectors_per_chunk;
++      int stripes_per_chunk, sectors_per_block;
++      int sectors_per_stripe;
++      int i, j;
+       atomic_inc(&conf->in_reqs_in_queue);
+@@ -1596,30 +1601,69 @@ static int make_request (request_queue_t
+       bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
+       if ( bio_data_dir(bi) == WRITE )
+               md_write_start(mddev);
+-      for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
+-              new_sector = raid6_compute_sector(logical_sector,
+-                                                raid_disks, data_disks, &dd_idx, &pd_idx, conf);
+-
+-              PRINTK("raid6: make_request, sector %Lu logical %Lu\n",
+-                     (unsigned long long)new_sector,
+-                     (unsigned long long)logical_sector);
++      sectors_per_chunk = conf->chunk_size >> 9;
++      stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
++      sectors_per_stripe = STRIPE_SECTORS * data_disks;
++      sectors_per_block = stripes_per_chunk * sectors_per_stripe;
++ 
++      block = logical_sector & ~((sector_t)sectors_per_block - 1);
++      sector_div(block, sectors_per_block);
++      sectors = bi->bi_size >> 9;
++ 
++ repeat:
++      stripe = block * (sectors_per_block / data_disks);
++      b_sector = stripe * data_disks;
++      /* iterate through all stripes in this block,
++       * where block is a set of internal stripes
++       * which covers chunk */
++      for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
++              r_sector = b_sector + (i * STRIPE_SECTORS);
++              sh = NULL;
++              /* iterrate through all pages in the stripe */
++              for (j = 0; j < data_disks && sectors > 0; j++) {
++                      if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
++                          r_sector >= last_sector) {
++                              r_sector += sectors_per_chunk;
++                              continue;
++                      }
++                      new_sector = raid6_compute_sector(r_sector, raid_disks,
++                                                      data_disks, &dd_idx, 
++                                                      &pd_idx, conf);
++                      if (sh == NULL)
++                              sh = get_active_stripe(conf, new_sector, pd_idx,
++                                                      (bi->bi_rw&RWA_MASK));
++                      if (sh) {
++                              add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
++                      } else {
++                              /* cannot get stripe for read-ahead, just give-up */
++                              clear_bit(BIO_UPTODATE, &bi->bi_flags);
++                              sectors = 0;
++                              break;
++                      }
++ 
++                      BUG_ON (new_sector != stripe);
++                      sectors -= STRIPE_SECTORS;
++                      if (bi->bi_sector > r_sector)
++                              sectors += bi->bi_sector - r_sector;
++                      if (r_sector + STRIPE_SECTORS > last_sector)
++                              sectors += r_sector + STRIPE_SECTORS - last_sector;
++                      r_sector += sectors_per_chunk;
++              }
+-              sh = get_active_stripe(conf, new_sector, pd_idx, (bi->bi_rw&RWA_MASK));
+               if (sh) {
+-
+-                      add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
+-
+                       raid6_plug_device(conf);
+                       handle_stripe(sh);
+                       release_stripe(sh);
+-              } else {
+-                      /* cannot get stripe for read-ahead, just give-up */
+-                      clear_bit(BIO_UPTODATE, &bi->bi_flags);
+-                      break;
++                      sh = NULL;
+               }
++              stripe += STRIPE_SECTORS;
+       }
++      block++;
++      if(sectors > 0)
++              goto repeat;
++
+       spin_lock_irq(&conf->device_lock);
+       if (--bi->bi_phys_segments == 0) {
+               int bytes = bi->bi_size;
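
raid6-stripe-by-stripe-handling.patch rewrites the make_request() walk: rather than stepping through the bio in plain logical-sector order, it groups the work into chunk-sized blocks and, inside each block, touches every data disk of one internal stripe before moving on, so each stripe_head is filled and handled once instead of being revisited. The visiting order it produces can be seen from this small standalone loop (the geometry values are assumptions, not taken from the patch):

/* Demonstrates the stripe-by-stripe visiting order; chunk size, page size
 * and disk count below are assumed example values. */
#include <stdio.h>

int main(void)
{
	const int data_disks        = 4;
	const int stripe_sectors    = 8;	/* PAGE_SIZE >> 9 for 4 KiB pages */
	const int sectors_per_chunk = 128;	/* 64 KiB chunk */
	const int stripes_per_chunk = sectors_per_chunk / stripe_sectors;
	const int sectors_per_block = stripes_per_chunk * stripe_sectors * data_disks;
	int i, j;

	printf("one block covers %d logical sectors\n", sectors_per_block);

	/* i = internal stripe within the chunk, j = data disk;
	 * only the first two stripes are printed to keep the output short */
	for (i = 0; i < 2; i++)
		for (j = 0; j < data_disks; j++)
			printf("stripe %d, disk %d -> logical sector %d\n",
			       i, j, i * stripe_sectors + j * sectors_per_chunk);
	return 0;
}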
diff --git a/lustre/kernel_patches/patches/raid6-zerocopy.patch b/lustre/kernel_patches/patches/raid6-zerocopy.patch
new file mode 100644 (file)
index 0000000..95b713d
--- /dev/null
@@ -0,0 +1,166 @@
+diff -pur linux-2.6.9.orig/drivers/md/raid6main.c linux-2.6.9/drivers/md/raid6main.c
+--- linux-2.6.9.orig/drivers/md/raid6main.c    2008-01-10 14:02:08.000000000 +0800
++++ linux-2.6.9/drivers/md/raid6main.c 2008-01-10 14:01:56.000000000 +0800
+@@ -430,6 +430,7 @@ static int raid6_end_read_request (struc
+               clear_buffer_uptodate(bh);
+       }
+ #endif
++      BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+       clear_bit(R5_LOCKED, &sh->dev[i].flags);
+       set_bit(STRIPE_HANDLE, &sh->state);
+       release_stripe(sh);
+@@ -468,6 +469,10 @@ static int raid6_end_write_request (stru
+       rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
++      if (test_bit(R5_Direct, &sh->dev[i].flags)) {
++              BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
++              sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
++      }
+       clear_bit(R5_LOCKED, &sh->dev[i].flags);
+       set_bit(STRIPE_HANDLE, &sh->state);
+       __release_stripe(conf, sh);
+@@ -664,7 +669,27 @@ static sector_t compute_blocknr(struct s
+       return r_sector;
+ }
++static struct page *zero_copy_data(struct bio *bio, sector_t sector)
++{
++      sector_t bi_sector = bio->bi_sector;
++      struct page *page = NULL;
++      struct bio_vec *bvl;
++      int i;
++      bio_for_each_segment(bvl, bio, i) {
++              if (sector == bi_sector)
++                      page = bio_iovec_idx(bio, i)->bv_page;
++              bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
++              if (bi_sector >= sector + STRIPE_SECTORS) {
++                      /* check if the stripe is covered by one page */
++                      if (page == bio_iovec_idx(bio, i)->bv_page &&
++                          PageConstant(page))
++                              return page;
++                      return NULL;
++              }
++      }
++      return NULL;
++}
+ /*
+  * Copy data between a page in the stripe cache, and one or more bion
+@@ -731,6 +756,7 @@ static void compute_parity(struct stripe
+       raid6_conf_t *conf = sh->raid_conf;
+       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
+       struct bio *chosen;
++      struct page *page;
+       /**** FIX THIS: This could be very bad if disks is close to 256 ****/
+       void *ptrs[disks];
+@@ -761,18 +787,46 @@ static void compute_parity(struct stripe
+               BUG();          /* Not implemented yet */
+       }
+-      for (i = disks; i--;)
+-              if (sh->dev[i].written) {
+-                      sector_t sector = sh->dev[i].sector;
+-                      struct bio *wbi = sh->dev[i].written;
+-                      while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+-                              copy_data(1, wbi, sh->dev[i].page, sector);
+-                              wbi = r5_next_bio(wbi, sector);
++      for (i = disks; i--;) {
++              struct bio *wbi = sh->dev[i].written;
++              sector_t sector;
++
++              if (!wbi)
++                      continue;
++
++              sector = sh->dev[i].sector;
++              set_bit(R5_LOCKED, &sh->dev[i].flags);
++              BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
++
++              /* check if it's covered by a single page
++               * and whole stripe is written at once.
++               * in this case we can avoid memcpy() */
++              if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
++                  test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
++                      page = zero_copy_data(wbi, sector);
++                      /* we don't do zerocopy on a HighMem page. Raid6 tend 
++                       * to prepare all of the pages' content to be accessed
++                       * before computing PQ parity. If we need to support HighMem
++                       * page also, we have to modify the gen_syndrome()
++                       * algorithm. -jay */
++                      if (page && !PageHighMem(page)) {
++                              atomic_inc(&conf->writes_zcopy);
++                              sh->dev[i].req.bi_io_vec[0].bv_page = page;
++                              set_bit(R5_Direct, &sh->dev[i].flags);
++                              clear_bit(R5_UPTODATE, &sh->dev[i].flags);
++                              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++                              continue;
+                       }
++              }
+-                      set_bit(R5_LOCKED, &sh->dev[i].flags);
+-                      set_bit(R5_UPTODATE, &sh->dev[i].flags);
++              atomic_inc(&conf->writes_copied);
++              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
++              set_bit(R5_UPTODATE, &sh->dev[i].flags);
++              while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
++                      copy_data(1, wbi, sh->dev[i].page, sector);
++                      wbi = r5_next_bio(wbi, sector);
+               }
++      }
+ //    switch(method) {
+ //    case RECONSTRUCT_WRITE:
+@@ -783,7 +837,10 @@ static void compute_parity(struct stripe
+               count = 0;
+               i = d0_idx;
+               do {
+-                      ptrs[count++] = page_address(sh->dev[i].page);
++                      if (test_bit(R5_Direct, &sh->dev[i].flags))
++                              ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
++                      else
++                              ptrs[count++] = page_address(sh->dev[i].page);
+                       i = raid6_next_disk(i, disks);
+               } while ( i != d0_idx );
+@@ -1185,7 +1242,8 @@ static void handle_stripe(struct stripe_
+                       if (sh->dev[i].written) {
+                               dev = &sh->dev[i];
+                               if (!test_bit(R5_LOCKED, &dev->flags) &&
+-                                  test_bit(R5_UPTODATE, &dev->flags) ) {
++                                  (test_bit(R5_UPTODATE, &dev->flags) ||
++                                       test_bit(R5_Direct, &dev->flags)) ) {
+                                       /* We can return any write requests */
+                                       struct bio *wbi, *wbi2;
+                                       PRINTK("Return write for stripe %llu disc %d\n",
+@@ -1193,6 +1251,7 @@ static void handle_stripe(struct stripe_
+                                       spin_lock_irq(&conf->device_lock);
+                                       wbi = dev->written;
+                                       dev->written = NULL;
++                                      clear_bit(R5_Direct, &dev->flags);
+                                       while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+                                               wbi2 = r5_next_bio(wbi, dev->sector);
+                                               if (--wbi->bi_phys_segments == 0) {
+@@ -2008,6 +2067,7 @@ static int run (mddev_t *mddev)
+               if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
+                       mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
+       }
++      mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
+       /* Ok, everything is just fine now */
+       mddev->array_size =  mddev->size * (mddev->raid_disks - 2);
+@@ -2095,9 +2155,11 @@ static void status (struct seq_file *seq
+               atomic_read(&conf->handled_in_raid5d),
+               atomic_read(&conf->out_of_stripes),
+               atomic_read(&conf->handle_called));
+-      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
++      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
+               atomic_read(&conf->reads_for_rmw),
+-              atomic_read(&conf->reads_for_rcw));
++              atomic_read(&conf->reads_for_rcw),
++              atomic_read(&conf->writes_zcopy),
++              atomic_read(&conf->writes_copied));
+       seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
+               atomic_read(&conf->delayed),
+               atomic_read(&conf->active_stripes),
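
raid6-zerocopy.patch avoids the copy_data() memcpy on full-stripe writes: when a single, non-highmem, constant page in the incoming bio covers the whole cache page, the outgoing bio_vec is pointed straight at the caller's page, R5_Direct is set, and the original cache page pointer is restored on write completion. A simplified userspace view of that decision (dev_slot and the page_constant flag are stand-ins for the kernel structures and the PageConstant() test) is:

/* Simplified sketch of the zero-copy vs copy decision; the types here are
 * illustrative stand-ins, not kernel structures. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

struct dev_slot {
	char cache_page[PAGE_SIZE];	/* stripe cache page */
	const char *io_page;		/* page the disk request will use */
	bool direct;			/* analogue of the R5_Direct flag */
};

static void prepare_write(struct dev_slot *d, const char *src, size_t len,
			  bool page_constant)
{
	if (len == PAGE_SIZE && page_constant) {
		d->io_page = src;	/* zero-copy: reuse the caller's page */
		d->direct = true;
	} else {
		memcpy(d->cache_page, src, len);	/* classic copy path */
		d->io_page = d->cache_page;
		d->direct = false;
	}
}

int main(void)
{
	static char user_page[PAGE_SIZE] = "payload";
	struct dev_slot d = { .direct = false };

	prepare_write(&d, user_page, PAGE_SIZE, true);
	printf("direct=%d (memcpy skipped)\n", d.direct);
	return 0;
}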
diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series
index 1cdc809..072df44 100644 (file)
@@ -24,6 +24,13 @@ raid5-stripe-by-stripe-handling.patch
 raid5-merge-ios.patch
 raid5-serialize-ovelapping-reqs.patch
 raid5-zerocopy.patch
+raid6-stats.patch
+raid6-configurable-cachesize.patch
+raid6-large-io.patch
+raid6-stripe-by-stripe-handling.patch
+raid6-merge-ios.patch
+raid6-serialize-ovelapping-reqs.patch
+raid6-zerocopy.patch
 jbd-stats-2.6.9.patch 
 bitops_ext2_find_next_le_bit-2.6.patch 
 quota-deadlock-on-pagelock-core.patch