RAID5 was not designed to support overlapping requests, because in Linux all I/Os are normally serialized by the page/buffer lock. Since Lustre does not use the page cache on the server, overlapping I/Os have to be serialized inside RAID5 itself. Index: linux-2.6.9/include/linux/raid/raid5.h =================================================================== --- linux-2.6.9.orig/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400 +++ linux-2.6.9/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400 @@ -134,6 +134,7 @@ struct stripe_head { unsigned long state; /* state flags */ atomic_t count; /* nr of active thread/requests */ spinlock_t lock; + wait_queue_head_t wait; /* waitchan for overlapped bio's */ struct r5dev { struct bio req; struct bio_vec vec; Index: linux-2.6.9/drivers/md/raid5.c =================================================================== --- linux-2.6.9.orig/drivers/md/raid5.c 2006-05-22 00:11:21.000000000 +0400 +++ linux-2.6.9/drivers/md/raid5.c 2006-05-22 00:19:27.000000000 +0400 @@ -308,6 +308,7 @@ static int grow_stripes(raid5_conf_t *co memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev)); sh->raid_conf = conf; sh->lock = SPIN_LOCK_UNLOCKED; + init_waitqueue_head(&sh->wait); if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -878,6 +879,9 @@ static void compute_parity(struct stripe set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); } else clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + + /* probably someone waits for our completion? */ + wake_up(&sh->wait); } /* @@ -885,7 +889,7 @@ static void compute_parity(struct stripe * toread/towrite point to the first in a chain. * The bi_next chain must be in order. 
*/ -static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) +static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) { struct bio **bip; raid5_conf_t *conf = sh->raid_conf; @@ -894,13 +898,21 @@ static void add_stripe_bio (struct strip (unsigned long long)bi->bi_sector, (unsigned long long)sh->sector); - spin_lock(&sh->lock); spin_lock_irq(&conf->device_lock); if (forwrite) bip = &sh->dev[dd_idx].towrite; else bip = &sh->dev[dd_idx].toread; + +#if 1 + if (*bip) { + /* overlapping bio, let's wait till first one is completed */ + spin_unlock_irq(&conf->device_lock); + spin_unlock(&sh->lock); + return 1; + } +#else while (*bip && (*bip)->bi_sector < bi->bi_sector) { BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector); bip = & (*bip)->bi_next; @@ -910,6 +922,7 @@ static void add_stripe_bio (struct strip BUG(); if (*bip) bi->bi_next = *bip; +#endif *bip = bi; bi->bi_phys_segments ++; spin_unlock_irq(&conf->device_lock); @@ -932,6 +945,7 @@ static void add_stripe_bio (struct strip if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); } + return 0; } /* @@ -1014,6 +1028,7 @@ static void handle_stripe(struct stripe_ rbi = dev->toread; dev->toread = NULL; spin_unlock_irq(&conf->device_lock); + wake_up(&sh->wait); while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { copy_data(0, rbi, dev->page, dev->sector); rbi2 = r5_next_bio(rbi, dev->sector); @@ -1059,6 +1074,7 @@ static void handle_stripe(struct stripe_ bi = sh->dev[i].towrite; sh->dev[i].towrite = NULL; if (bi) to_write--; + wake_up(&sh->wait); while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){ struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); @@ -1511,6 +1527,16 @@ static inline void raid5_plug_device(rai spin_unlock_irq(&conf->device_lock); } +static inline void raid5_wait_stripe(struct stripe_head *sh, int dd_idx, int forwrite) +{ + struct bio **bip; 
+ if (forwrite) + bip = &sh->dev[dd_idx].towrite; + else + bip = &sh->dev[dd_idx].toread; + wait_event(sh->wait, *bip == NULL); +} + static int make_request (request_queue_t *q, struct bio * bi) { mddev_t *mddev = q->queuedata; @@ -1580,6 +1606,7 @@ repeat: * if we can't, then it's time to submit * all collected bio's in order to free * some space in the cache -bzzz */ +try_stripe: sh = get_active_stripe(conf, new_sector, pd_idx, 1); if (!sh && !(bi->bi_rw&RWA_MASK)) { raid5_flush_bios(conf, bios, raid_disks); @@ -1587,7 +1614,11 @@ repeat: } } if (sh) { - add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK)); + if (add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { + release_stripe(sh); + raid5_wait_stripe(sh, dd_idx, bi->bi_rw&RW_MASK); + goto try_stripe; + } } else { /* cannot get stripe for read-ahead, just give-up */ clear_bit(BIO_UPTODATE, &bi->bi_flags);