Whamcloud - gitweb
LU-2337 scripts: init script to not use -d with zpool import
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid5-stripe-by-stripe-handling-rhel5.patch
1 diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
2 --- linux-2.6.18-53.orig/drivers/md/raid5.c     2007-12-28 14:55:08.000000000 +0800
3 +++ linux-2.6.18-53/drivers/md/raid5.c  2007-12-28 18:52:08.000000000 +0800
4 @@ -2626,6 +2626,35 @@ static int raid5_issue_flush(request_que
5         return ret;
6  }
7  
8 +static inline int raid5_expanding_overlap(raid5_conf_t *conf, struct bio *bi)
9 +{
10 +       sector_t first_sector, last_sector;
11 +
12 +       if (likely(conf->expand_progress == MaxSector))
13 +               return 0;
14 +
15 +       first_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
16 +       last_sector = bi->bi_sector + (bi->bi_size>>9);
17 +
18 +       return (first_sector < conf->expand_progress &&
19 +               last_sector >= conf->expand_lo);
20 +}
21 +
22 +static inline int raid5_redo_bio(raid5_conf_t *conf, struct bio *bi, int disks, sector_t sector)
23 +{
24 +       int redo = 0;
25 +
26 +       if (likely(conf->expand_progress == MaxSector))
27 +               return 0;
28 +
29 +       spin_lock_irq(&conf->device_lock);
30 +       redo = (raid5_expanding_overlap(conf, bi) ||
31 +               (unlikely(sector < conf->expand_progress) &&
32 +               disks == conf->previous_raid_disks));
33 +       spin_unlock_irq(&conf->device_lock);
34 +       return redo;
35 +}
36 +
37  static int make_request(request_queue_t *q, struct bio * bi)
38  {
39         mddev_t *mddev = q->queuedata;
40 @@ -2636,6 +2665,14 @@ static int make_request(request_queue_t 
41         struct stripe_head *sh;
42         const int rw = bio_data_dir(bi);
43         int remaining;
44 +       sector_t stripe, sectors, block, r_sector, b_sector;
45 +       int sectors_per_chunk = conf->chunk_size >> 9;
46 +       int stripes_per_chunk, sectors_per_block;
47 +       int sectors_per_stripe;
48 +       int i, j;
49 +
50 +       DEFINE_WAIT(w);
51 +       int disks, data_disks;
52  
53         atomic_inc(&conf->in_reqs_in_queue);
54  
55 @@ -2653,105 +2690,136 @@ static int make_request(request_queue_t 
56         else
57                 atomic_inc(&conf->reads_in);
58  
59 -
60         logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
61         last_sector = bi->bi_sector + (bi->bi_size>>9);
62         bi->bi_next = NULL;
63         bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
64  
65 -       for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
66 -               DEFINE_WAIT(w);
67 -               int disks, data_disks;
68 -
69 -       retry:
70 -               prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
71 -               if (likely(conf->expand_progress == MaxSector))
72 -                       disks = conf->raid_disks;
73 -               else {
74 -                       /* spinlock is needed as expand_progress may be
75 -                        * 64bit on a 32bit platform, and so it might be
76 -                        * possible to see a half-updated value
77 -                        * Ofcourse expand_progress could change after
78 -                        * the lock is dropped, so once we get a reference
79 -                        * to the stripe that we think it is, we will have
80 -                        * to check again.
81 -                        */
82 -                       spin_lock_irq(&conf->device_lock);
83 -                       disks = conf->raid_disks;
84 -                       if (logical_sector >= conf->expand_progress)
85 -                               disks = conf->previous_raid_disks;
86 -                       else {
87 -                               if (logical_sector >= conf->expand_lo) {
88 -                                       spin_unlock_irq(&conf->device_lock);
89 -                                       schedule();
90 -                                       goto retry;
91 -                               }
92 -                       }
93 -                       spin_unlock_irq(&conf->device_lock);
94 -               }
95 -               data_disks = disks - conf->max_degraded;
96 +       sectors = bi->bi_size >> 9;
97 +       stripes_per_chunk = conf->chunk_size / STRIPE_SIZE;
98  
99 -               new_sector = raid5_compute_sector(logical_sector, disks, data_disks,
100 -                                                 &dd_idx, &pd_idx, conf);
101 -               PRINTK("raid5: make_request, sector %llu logical %llu\n",
102 -                       (unsigned long long)new_sector, 
103 -                       (unsigned long long)logical_sector);
104 +redo_bio:
105 +       /* Stripe-by-stripe handling needs a stable raid layout, so if this
106 +        * request covers the expanding region, wait until it is over.
107 +        * Furthermore, we may get here with the request partially handled, so
108 +        * wait for bi_phys_segments to be 1 as well. -jay */
109 +       spin_lock_irq(&conf->device_lock);
110 +       wait_event_lock_irq(conf->wait_for_overlap,
111 +                       (bi->bi_phys_segments == 1) &&
112 +                       !raid5_expanding_overlap(conf, bi),
113 +                       conf->device_lock,
114 +                       (unplug_slaves(conf->mddev), atomic_inc(&conf->expanding_overlap)));
115 +
116 +       disks = conf->raid_disks;
117 +       if (unlikely(logical_sector >= conf->expand_progress))
118 +               disks = conf->previous_raid_disks;
119 +       data_disks = disks - conf->max_degraded;
120 +       spin_unlock_irq(&conf->device_lock);
121  
122 -               sh = get_active_stripe(conf, new_sector, disks, pd_idx, (bi->bi_rw&RWA_MASK));
123 -               if (sh) {
124 -                       if (unlikely(conf->expand_progress != MaxSector)) {
125 -                               /* expansion might have moved on while waiting for a
126 -                                * stripe, so we must do the range check again.
127 -                                * Expansion could still move past after this
128 -                                * test, but as we are holding a reference to
129 -                                * 'sh', we know that if that happens,
130 -                                *  STRIPE_EXPANDING will get set and the expansion
131 -                                * won't proceed until we finish with the stripe.
132 -                                */
133 -                               int must_retry = 0;
134 -                               spin_lock_irq(&conf->device_lock);
135 -                               if (logical_sector <  conf->expand_progress &&
136 -                                   disks == conf->previous_raid_disks)
137 -                                       /* mismatch, need to try again */
138 -                                       must_retry = 1;
139 -                               spin_unlock_irq(&conf->device_lock);
140 -                               if (must_retry) {
141 -                                       release_stripe(sh);
142 -                                       goto retry;
143 +       /* compute the block # */
144 +       sectors_per_stripe = STRIPE_SECTORS * data_disks;
145 +       sectors_per_block = stripes_per_chunk * sectors_per_stripe;
146 +
147 +       block = logical_sector & ~((sector_t)sectors_per_block - 1);
148 +       sector_div(block, sectors_per_block);
149 +
150 +repeat:
151 +       stripe = block * (sectors_per_block / data_disks);
152 +       b_sector = stripe * data_disks;
153 +       /* iterate through all stripes in this block,
154 +        * where a block is the set of internal stripes
155 +        * that covers one chunk */
156 +
157 +       for (i = 0; i < stripes_per_chunk && sectors > 0; i++) {
158 +               r_sector = b_sector + (i * STRIPE_SECTORS);
159 +               sh = NULL;
160 +               /* iterate through all pages in the stripe */
161 +               for (j = 0; j < data_disks && sectors > 0; j++) {
162 +                       DEFINE_WAIT(w);
163 +
164 +                       if (r_sector + STRIPE_SECTORS <= bi->bi_sector ||
165 +                           r_sector >= last_sector) {
166 +                               r_sector += sectors_per_chunk;
167 +                               continue;
168 +                       }
169 +
170 +retry:
171 +                       prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
172 +                       new_sector = raid5_compute_sector(r_sector, disks,
173 +                                                       data_disks, &dd_idx,
174 +                                                       &pd_idx, conf);
175 +                       if (sh == NULL) {
176 +                               sh = get_active_stripe(conf, new_sector, disks, pd_idx,
177 +                                                       (bi->bi_rw&RWA_MASK));
178 +                               if (sh) {
179 +                                       /* we're handling the bio stripe by stripe, so when we find
180 +                                        * the raid layout has been changed, we have to redo the
181 +                                        * whole bio because we don't know which sectors in it have
182 +                                        * been done, and which have not. -jay */
183 +                                       if (raid5_redo_bio(conf, bi, disks, logical_sector))
184 +                                               goto redo_bio;
185 +
186 +                                       if (test_bit(STRIPE_EXPANDING, &sh->state)) {
187 +                                               /* Stripe is busy expanding or
188 +                                                * add failed due to overlap.  Flush everything
189 +                                                * and wait a while
190 +                                                */
191 +                                               release_stripe(sh);
192 +                                               sh = NULL;
193 +                                               raid5_unplug_device(mddev->queue);
194 +                                               schedule();
195 +                                               goto retry;
196 +                                       }
197 +                               } else {
198 +                                       /* cannot get stripe for read-ahead, just give-up */
199 +                                       finish_wait(&conf->wait_for_overlap, &w);
200 +                                       clear_bit(BIO_UPTODATE, &bi->bi_flags);
201 +                                       sectors = 0;
202 +                                       break;
203                                 }
204                         }
205 +
206                         /* FIXME what if we get a false positive because these
207                          * are being updated.
208                          */
209 -                       if (logical_sector >= mddev->suspend_lo &&
210 -                           logical_sector < mddev->suspend_hi) {
211 +                       if (r_sector >= mddev->suspend_lo &&
212 +                           r_sector < mddev->suspend_hi) {
213 +                               handle_stripe(sh, NULL);
214                                 release_stripe(sh);
215 +                               sh = NULL;
216                                 schedule();
217                                 goto retry;
218                         }
219  
220 -                       if (test_bit(STRIPE_EXPANDING, &sh->state) ||
221 -                           !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
222 -                               /* Stripe is busy expanding or
223 -                                * add failed due to overlap.  Flush everything
224 -                                * and wait a while
225 -                                */
226 -                               raid5_unplug_device(mddev->queue);
227 +                       if (!add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
228 +                               handle_stripe(sh, NULL);
229                                 release_stripe(sh);
230 +                               sh = NULL;
231 +                               raid5_unplug_device(mddev->queue);
232                                 schedule();
233                                 goto retry;
234                         }
235                         finish_wait(&conf->wait_for_overlap, &w);
236 +
237 +                       BUG_ON (new_sector != stripe);
238 +                       sectors -= STRIPE_SECTORS;
239 +                       if (bi->bi_sector > r_sector)
240 +                               sectors += bi->bi_sector - r_sector;
241 +                       if (r_sector + STRIPE_SECTORS > last_sector)
242 +                               sectors += r_sector + STRIPE_SECTORS - last_sector;
243 +                       r_sector += sectors_per_chunk;
244 +               }
245 +               if (sh) {
246                         handle_stripe(sh, NULL);
247                         release_stripe(sh);
248 -               } else {
249 -                       /* cannot get stripe for read-ahead, just give-up */
250 -                       clear_bit(BIO_UPTODATE, &bi->bi_flags);
251 -                       finish_wait(&conf->wait_for_overlap, &w);
252 -                       break;
253 +                       sh = NULL;
254                 }
255 -                       
256 +               stripe += STRIPE_SECTORS;
257         }
258 +       block++;
259 +       if (sectors > 0)
260 +               goto repeat;
261 +
262         spin_lock_irq(&conf->device_lock);
263         remaining = --bi->bi_phys_segments;
264         spin_unlock_irq(&conf->device_lock);
265 @@ -3439,6 +3507,8 @@ static void status (struct seq_file *seq
266                         atomic_read(&conf->active_stripes),
267                         atomic_read(&conf->in_reqs_in_queue),
268                         atomic_read(&conf->out_reqs_in_queue));
269 +       seq_printf (seq, "\t\t%u expanding overlap\n",
270 +                       atomic_read(&conf->expanding_overlap));
271  #if RAID5_DEBUG
272         seq_printf (seq, "\n");
273         printall(seq, conf);
274 diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
275 --- linux-2.6.18-53.orig/include/linux/raid/raid5.h     2007-12-28 14:55:08.000000000 +0800
276 +++ linux-2.6.18-53/include/linux/raid/raid5.h  2007-12-28 18:09:37.000000000 +0800
277 @@ -278,6 +278,7 @@ struct raid5_private_data {
278         atomic_t                bit_delayed;
279         atomic_t                in_reqs_in_queue;
280         atomic_t                out_reqs_in_queue;
281 +       atomic_t                expanding_overlap;
282  };
283  
284  typedef struct raid5_private_data raid5_conf_t;