+++ /dev/null
-RAID5 was not designed to handle overlapping requests, because in Linux
-all I/O is normally serialized by the page/buffer lock. Since Lustre does not
-use the page cache on the server, we must serialize I/Os inside RAID5 itself.
-
-Index: linux-2.6.9/include/linux/raid/raid5.h
-===================================================================
---- linux-2.6.9.orig/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
-+++ linux-2.6.9/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
-@@ -134,6 +134,7 @@ struct stripe_head {
- unsigned long state; /* state flags */
- atomic_t count; /* nr of active thread/requests */
- spinlock_t lock;
-+ wait_queue_head_t wait; /* waitchan for overlapped bio's */
- struct r5dev {
- struct bio req;
- struct bio_vec vec;
-Index: linux-2.6.9/drivers/md/raid5.c
-===================================================================
---- linux-2.6.9.orig/drivers/md/raid5.c 2006-05-22 00:11:21.000000000 +0400
-+++ linux-2.6.9/drivers/md/raid5.c 2006-05-22 00:19:27.000000000 +0400
-@@ -308,6 +308,7 @@ static int grow_stripes(raid5_conf_t *co
- memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
- sh->raid_conf = conf;
- sh->lock = SPIN_LOCK_UNLOCKED;
-+ init_waitqueue_head(&sh->wait);
-
- if (grow_buffers(sh, conf->raid_disks)) {
- shrink_buffers(sh, conf->raid_disks);
-@@ -878,6 +879,9 @@ static void compute_parity(struct stripe
- set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
- } else
- clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-+
-+ /* wake anyone who may be waiting for this stripe to complete */
-+ wake_up(&sh->wait);
- }
-
- /*
-@@ -885,7 +889,7 @@ static void compute_parity(struct stripe
- * toread/towrite point to the first in a chain.
- * The bi_next chain must be in order.
- */
--static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
-+static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
- {
- struct bio **bip;
- raid5_conf_t *conf = sh->raid_conf;
-@@ -894,13 +898,21 @@ static void add_stripe_bio (struct strip
- (unsigned long long)bi->bi_sector,
- (unsigned long long)sh->sector);
-
--
- spin_lock(&sh->lock);
- spin_lock_irq(&conf->device_lock);
- if (forwrite)
- bip = &sh->dev[dd_idx].towrite;
- else
- bip = &sh->dev[dd_idx].toread;
-+
-+#if 1
-+ if (*bip) {
-+ /* overlapping bio: tell the caller to wait until the first one completes */
-+ spin_unlock_irq(&conf->device_lock);
-+ spin_unlock(&sh->lock);
-+ return 1;
-+ }
-+#else
- while (*bip && (*bip)->bi_sector < bi->bi_sector) {
- BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
- bip = & (*bip)->bi_next;
-@@ -910,6 +922,7 @@ static void add_stripe_bio (struct strip
- BUG();
- if (*bip)
- bi->bi_next = *bip;
-+#endif
- *bip = bi;
- bi->bi_phys_segments ++;
- spin_unlock_irq(&conf->device_lock);
-@@ -932,6 +945,7 @@ static void add_stripe_bio (struct strip
- if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
- set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
- }
-+ return 0;
- }
-
- /*
-@@ -1014,6 +1028,7 @@ static void handle_stripe(struct stripe_
- rbi = dev->toread;
- dev->toread = NULL;
- spin_unlock_irq(&conf->device_lock);
-+ wake_up(&sh->wait);
- while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
- copy_data(0, rbi, dev->page, dev->sector);
- rbi2 = r5_next_bio(rbi, dev->sector);
-@@ -1059,6 +1074,7 @@ static void handle_stripe(struct stripe_
- bi = sh->dev[i].towrite;
- sh->dev[i].towrite = NULL;
- if (bi) to_write--;
-+ wake_up(&sh->wait);
-
- while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
- struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
-@@ -1511,6 +1527,16 @@ static inline void raid5_plug_device(rai
- spin_unlock_irq(&conf->device_lock);
- }
-
-+static inline void raid5_wait_stripe(struct stripe_head *sh, int dd_idx, int forwrite)
-+{ /* Sleep on sh->wait until the selected pending-bio slot of device dd_idx is NULL. */
-+ struct bio **bip;
-+ if (forwrite) /* non-zero: watch the towrite slot; zero: watch the toread slot */
-+ bip = &sh->dev[dd_idx].towrite;
-+ else
-+ bip = &sh->dev[dd_idx].toread;
-+ wait_event(sh->wait, *bip == NULL); /* NOTE(review): no lock is taken here, and the visible caller has already called release_stripe(sh) - confirm sh cannot be recycled while we sleep */
-+}
-+
- static int make_request (request_queue_t *q, struct bio * bi)
- {
- mddev_t *mddev = q->queuedata;
-@@ -1580,6 +1606,7 @@ repeat:
- * if we can't, then it's time to submit
- * all collected bio's in order to free
- * some space in the cache -bzzz */
-+try_stripe:
- sh = get_active_stripe(conf, new_sector, pd_idx, 1);
- if (!sh && !(bi->bi_rw&RWA_MASK)) {
- raid5_flush_bios(conf, bios, raid_disks);
-@@ -1587,7 +1614,11 @@ repeat:
- }
- }
- if (sh) {
-- add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
-+ if (add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
-+ release_stripe(sh);
-+ raid5_wait_stripe(sh, dd_idx, bi->bi_rw&RW_MASK);
-+ goto try_stripe;
-+ }
- } else {
- /* cannot get stripe for read-ahead, just give-up */
- clear_bit(BIO_UPTODATE, &bi->bi_flags);