1 RAID5 was not designed to support overlapping requests, because in
2 Linux all I/O is normally serialized by the page/buffer lock. Since Lustre
3 does not use the page cache on the server, overlapping I/Os must be serialized inside RAID5 itself.
5 Index: linux-2.6.9/include/linux/raid/raid5.h
6 ===================================================================
7 --- linux-2.6.9.orig/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
8 +++ linux-2.6.9/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
9 @@ -134,6 +134,7 @@ struct stripe_head {
10 unsigned long state; /* state flags */
11 atomic_t count; /* nr of active thread/requests */
13 + wait_queue_head_t wait; /* waitchan for overlapped bio's */
17 Index: linux-2.6.9/drivers/md/raid5.c
18 ===================================================================
19 --- linux-2.6.9.orig/drivers/md/raid5.c 2006-05-22 00:11:21.000000000 +0400
20 +++ linux-2.6.9/drivers/md/raid5.c 2006-05-22 00:19:27.000000000 +0400
21 @@ -308,6 +308,7 @@ static int grow_stripes(raid5_conf_t *co
22 memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
24 sh->lock = SPIN_LOCK_UNLOCKED;
25 + init_waitqueue_head(&sh->wait);
27 if (grow_buffers(sh, conf->raid_disks)) {
28 shrink_buffers(sh, conf->raid_disks);
29 @@ -878,6 +879,9 @@ static void compute_parity(struct stripe
30 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
32 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
34 + /* someone may be waiting for our completion */
39 @@ -885,7 +889,7 @@ static void compute_parity(struct stripe
40 * toread/towrite point to the first in a chain.
41 * The bi_next chain must be in order.
43 -static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
44 +static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
47 raid5_conf_t *conf = sh->raid_conf;
48 @@ -894,13 +898,21 @@ static void add_stripe_bio (struct strip
49 (unsigned long long)bi->bi_sector,
50 (unsigned long long)sh->sector);
54 spin_lock_irq(&conf->device_lock);
56 bip = &sh->dev[dd_idx].towrite;
58 bip = &sh->dev[dd_idx].toread;
62 + /* overlapping bio: wait until the first one has completed */
63 + spin_unlock_irq(&conf->device_lock);
64 + spin_unlock(&sh->lock);
68 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
69 BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
70 bip = & (*bip)->bi_next;
71 @@ -910,6 +922,7 @@ static void add_stripe_bio (struct strip
77 bi->bi_phys_segments ++;
78 spin_unlock_irq(&conf->device_lock);
79 @@ -932,6 +945,7 @@ static void add_stripe_bio (struct strip
80 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
81 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
87 @@ -1014,6 +1028,7 @@ static void handle_stripe(struct stripe_
90 spin_unlock_irq(&conf->device_lock);
92 while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
93 copy_data(0, rbi, dev->page, dev->sector);
94 rbi2 = r5_next_bio(rbi, dev->sector);
95 @@ -1059,6 +1074,7 @@ static void handle_stripe(struct stripe_
96 bi = sh->dev[i].towrite;
97 sh->dev[i].towrite = NULL;
101 while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
102 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
103 @@ -1511,6 +1527,16 @@ static inline void raid5_plug_device(rai
104 spin_unlock_irq(&conf->device_lock);
107 +static inline void raid5_wait_stripe(struct stripe_head *sh, int dd_idx, int forwrite)
111 + bip = &sh->dev[dd_idx].towrite;
113 + bip = &sh->dev[dd_idx].toread;
114 + wait_event(sh->wait, *bip == NULL);
117 static int make_request (request_queue_t *q, struct bio * bi)
119 mddev_t *mddev = q->queuedata;
120 @@ -1580,6 +1606,7 @@ repeat:
121 * if we can't, then it's time to submit
122 * all collected bio's in order to free
123 * some space in the cache -bzzz */
125 sh = get_active_stripe(conf, new_sector, pd_idx, 1);
126 if (!sh && !(bi->bi_rw&RWA_MASK)) {
127 raid5_flush_bios(conf, bios, raid_disks);
128 @@ -1587,7 +1614,11 @@ repeat:
132 - add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
133 + if (add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
134 + release_stripe(sh);
135 + raid5_wait_stripe(sh, dd_idx, bi->bi_rw&RW_MASK);
139 /* cannot get stripe for read-ahead, just give-up */
140 clear_bit(BIO_UPTODATE, &bi->bi_flags);