Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid5-serialize-ovelapping-reqs.patch
1 RAID5 wasn't designed to support overlapping requests because
2 in Linux all I/Os are serialized by the page/buffer lock.  As Lustre
3 doesn't use the pagecache on the server, we need to serialize I/Os in RAID5.
4
5 Index: linux-2.6.9/include/linux/raid/raid5.h
6 ===================================================================
7 --- linux-2.6.9.orig/include/linux/raid/raid5.h 2006-05-22 00:11:21.000000000 +0400
8 +++ linux-2.6.9/include/linux/raid/raid5.h      2006-05-22 00:11:21.000000000 +0400
9 @@ -134,6 +134,7 @@ struct stripe_head {
10         unsigned long           state;                  /* state flags */
11         atomic_t                count;                  /* nr of active thread/requests */
12         spinlock_t              lock;
13 +       wait_queue_head_t       wait;                   /* waitchan for overlapped bio's */
14         struct r5dev {
15                 struct bio      req;
16                 struct bio_vec  vec;
17 Index: linux-2.6.9/drivers/md/raid5.c
18 ===================================================================
19 --- linux-2.6.9.orig/drivers/md/raid5.c 2006-05-22 00:11:21.000000000 +0400
20 +++ linux-2.6.9/drivers/md/raid5.c      2006-05-22 00:19:27.000000000 +0400
21 @@ -308,6 +308,7 @@ static int grow_stripes(raid5_conf_t *co
22                 memset(sh, 0, sizeof(*sh) + (devs-1)*sizeof(struct r5dev));
23                 sh->raid_conf = conf;
24                 sh->lock = SPIN_LOCK_UNLOCKED;
25 +               init_waitqueue_head(&sh->wait);
26  
27                 if (grow_buffers(sh, conf->raid_disks)) {
28                         shrink_buffers(sh, conf->raid_disks);
29 @@ -878,6 +879,9 @@ static void compute_parity(struct stripe
30                 set_bit(R5_LOCKED,   &sh->dev[pd_idx].flags);
31         } else
32                 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
33 +
34 +       /* probably someone waits for our completion? */
35 +       wake_up(&sh->wait);
36  }
37  
38  /*
39 @@ -885,7 +889,7 @@ static void compute_parity(struct stripe
40   * toread/towrite point to the first in a chain. 
41   * The bi_next chain must be in order.
42   */
43 -static void add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
44 +static int add_stripe_bio (struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
45  {
46         struct bio **bip;
47         raid5_conf_t *conf = sh->raid_conf;
48 @@ -894,13 +898,21 @@ static void add_stripe_bio (struct strip
49                 (unsigned long long)bi->bi_sector,
50                 (unsigned long long)sh->sector);
51  
52 -
53         spin_lock(&sh->lock);
54         spin_lock_irq(&conf->device_lock);
55         if (forwrite)
56                 bip = &sh->dev[dd_idx].towrite;
57         else
58                 bip = &sh->dev[dd_idx].toread;
59 +
60 +#if 1
61 +       if (*bip) {
62 +               /* overlapping bio, let's wait till first one is completed */
63 +               spin_unlock_irq(&conf->device_lock);
64 +               spin_unlock(&sh->lock);
65 +               return 1;
66 +       }
67 +#else
68         while (*bip && (*bip)->bi_sector < bi->bi_sector) {
69                 BUG_ON((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector);
70                 bip = & (*bip)->bi_next;
71 @@ -910,6 +922,7 @@ static void add_stripe_bio (struct strip
72                 BUG();
73         if (*bip)
74                 bi->bi_next = *bip;
75 +#endif
76         *bip = bi;
77         bi->bi_phys_segments ++;
78         spin_unlock_irq(&conf->device_lock);
79 @@ -932,6 +945,7 @@ static void add_stripe_bio (struct strip
80                 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
81                         set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
82         }
83 +       return 0;
84  }
85  
86  /*
87 @@ -1014,6 +1028,7 @@ static void handle_stripe(struct stripe_
88                         rbi = dev->toread;
89                         dev->toread = NULL;
90                         spin_unlock_irq(&conf->device_lock);
91 +                       wake_up(&sh->wait);
92                         while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
93                                 copy_data(0, rbi, dev->page, dev->sector);
94                                 rbi2 = r5_next_bio(rbi, dev->sector);
95 @@ -1059,6 +1074,7 @@ static void handle_stripe(struct stripe_
96                         bi = sh->dev[i].towrite;
97                         sh->dev[i].towrite = NULL;
98                         if (bi) to_write--;
99 +                       wake_up(&sh->wait);
100  
101                         while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
102                                 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
103 @@ -1511,6 +1527,16 @@ static inline void raid5_plug_device(rai
104         spin_unlock_irq(&conf->device_lock);
105  }
106  
107 +static inline void raid5_wait_stripe(struct stripe_head *sh, int dd_idx, int forwrite)
108 +{
109 +       struct bio **bip;
110 +       if (forwrite)
111 +               bip = &sh->dev[dd_idx].towrite;
112 +       else
113 +               bip = &sh->dev[dd_idx].toread;
114 +       wait_event(sh->wait, *bip == NULL);
115 +}
116 +
117  static int make_request (request_queue_t *q, struct bio * bi)
118  {
119         mddev_t *mddev = q->queuedata;
120 @@ -1580,6 +1606,7 @@ repeat:
121                                  * if we can't, then it's time to submit
122                                  * all collected bio's in order to free
123                                  * some space in the cache -bzzz */
124 +try_stripe:
125                                 sh = get_active_stripe(conf, new_sector, pd_idx, 1);
126                                 if (!sh && !(bi->bi_rw&RWA_MASK)) {
127                                         raid5_flush_bios(conf, bios, raid_disks);
128 @@ -1587,7 +1614,11 @@ repeat:
129                                 }
130                         }
131                         if (sh) {
132 -                               add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK));
133 +                               if (add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
134 +                                       release_stripe(sh);
135 +                                       raid5_wait_stripe(sh, dd_idx, bi->bi_rw&RW_MASK);
136 +                                       goto try_stripe;
137 +                               }
138                         } else {
139                                 /* cannot get stripe for read-ahead, just give-up */
140                                 clear_bit(BIO_UPTODATE, &bi->bi_flags);