lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch
diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
--- linux-2.6.18-53.orig/drivers/md/raid5.c     2007-12-28 19:09:20.000000000 +0800
+++ linux-2.6.18-53/drivers/md/raid5.c  2007-12-28 19:09:32.000000000 +0800
@@ -633,6 +633,7 @@ static int raid5_end_read_request(struct
                clear_buffer_uptodate(bh);
        }
 #endif
+       BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
        release_stripe(sh);
@@ -671,6 +672,10 @@ static int raid5_end_write_request (stru
 
        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 
+       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+       }
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
        __release_stripe(conf, sh);
@@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s
        return r_sector;
 }
 
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+       sector_t bi_sector = bio->bi_sector;
+       struct page *page = NULL;
+       struct bio_vec *bvl;
+       int i;
 
+       bio_for_each_segment(bvl, bio, i) {
+               if (sector == bi_sector)
+                       page = bio_iovec_idx(bio, i)->bv_page;
+               bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+               if (bi_sector >= sector + STRIPE_SECTORS) {
+                       /* check if the stripe is covered by one page */
+                       if (page == bio_iovec_idx(bio, i)->bv_page &&
+                           PageConstant(page))
+                               return page;
+                       return NULL;
+               }
+       }
+       return NULL;
+}
 
 /*
  * Copy data between a page in the stripe cache, and one or more bion
@@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip
 {
        raid5_conf_t *conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-       void *ptr[MAX_XOR_BLOCKS];
+       void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
        struct bio *chosen;
+       struct page *page;
 
        PRINTK("compute_parity5, stripe %llu, method %d\n",
                (unsigned long long)sh->sector, method);
@@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip
                count = 1;
        }
 
-       for (i = disks; i--;)
-               if (sh->dev[i].written) {
-                       sector_t sector = sh->dev[i].sector;
-                       struct bio *wbi = sh->dev[i].written;
-                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-                               copy_data(1, wbi, sh->dev[i].page, sector);
-                               wbi = r5_next_bio(wbi, sector);
+       for (i = disks; i--;) {
+               struct r5dev *dev = &sh->dev[i];
+               struct bio *wbi = dev->written;
+               sector_t sector;
+
+               if (!wbi)
+                       continue;
+
+               sector = dev->sector;
+               set_bit(R5_LOCKED, &sh->dev[i].flags);
+               BUG_ON(test_bit(R5_Direct, &dev->flags));
+               /* check if it's covered by a single page
+                * and the whole stripe is written at once;
+                * in this case we can avoid memcpy() */
+               if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
+                   test_bit(R5_Insync, &dev->flags)) {
+                       page = zero_copy_data(wbi, sector);
+                       if (page) {
+                               atomic_inc(&conf->writes_zcopy);
+                               dev->req.bi_io_vec[0].bv_page = page;
+                               set_bit(R5_Direct, &dev->flags);
+                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+                               continue;
                        }
+               }
 
-                       set_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
+               /* do copy write */
+               atomic_inc(&conf->writes_copied);
+               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+               set_bit(R5_UPTODATE, &sh->dev[i].flags);
+               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+                       copy_data(1, wbi, sh->dev[i].page, sector);
+                       wbi = r5_next_bio(wbi, sector);
                }
+       }
 
+       h_ptr[0] = ptr[0];
        switch(method) {
        case RECONSTRUCT_WRITE:
        case CHECK_PARITY:
-               for (i=disks; i--;)
-                       if (i != pd_idx) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               check_xor();
+               for (i=disks; i--;) {
+                       if (i == pd_idx)
+                               continue;
+                       if (test_bit(R5_Direct, &sh->dev[i].flags))
+                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
+                       else
+                               page = sh->dev[i].page;
+
+                       /* have to compute the parity immediately for
+                        * a highmem page; this can happen with zerocopy. -jay
+                        */
+                       if (PageHighMem(page)) {
+                               h_ptr[1] = kmap_atomic(page, KM_USER0);
+                               xor_block(2, STRIPE_SIZE, h_ptr);
+                               kunmap_atomic(page, KM_USER0);
+                       } else {
+                               ptr[count++] = page_address(page);
                        }
+                       check_xor();
+               }
                break;
        case READ_MODIFY_WRITE:
-               for (i = disks; i--;)
-                       if (sh->dev[i].written) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               check_xor();
+               for (i = disks; i--;) {
+                       if (!sh->dev[i].written)
+                               continue;
+                       if (test_bit(R5_Direct, &sh->dev[i].flags))
+                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
+                       else
+                               page = sh->dev[i].page;
+
+                       /* have to compute the parity immediately for
+                        * a highmem page; this can happen with zerocopy. -jay
+                        */
+                       if (PageHighMem(page)) {
+                               h_ptr[1] = kmap_atomic(page, KM_USER0);
+                               xor_block(2, STRIPE_SIZE, h_ptr);
+                               kunmap_atomic(page, KM_USER0);
+                       } else {
+                               ptr[count++] = page_address(page);
                        }
+                       check_xor();
+               }
        }
        if (count != 1)
                xor_block(count, STRIPE_SIZE, ptr);
@@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip
        raid6_conf_t *conf = sh->raid_conf;
        int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
        struct bio *chosen;
+       struct page *page;
        /**** FIX THIS: This could be very bad if disks is close to 256 ****/
        void *ptrs[disks];
 
@@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip
                BUG();          /* Not implemented yet */
        }
 
-       for (i = disks; i--;)
-               if (sh->dev[i].written) {
-                       sector_t sector = sh->dev[i].sector;
-                       struct bio *wbi = sh->dev[i].written;
-                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-                               copy_data(1, wbi, sh->dev[i].page, sector);
-                               wbi = r5_next_bio(wbi, sector);
+       for (i = disks; i--;) {
+               struct r5dev *dev = &sh->dev[i];
+               struct bio *wbi = dev->written;
+               sector_t sector;
+
+               if (!wbi)
+                       continue;
+
+               sector = sh->dev[i].sector;
+               set_bit(R5_LOCKED, &sh->dev[i].flags);
+               BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+               /* check if it's covered by a single page
+                * and the whole stripe is written at once;
+                * in this case we can avoid memcpy() */
+               if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
+                   test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+                       page = zero_copy_data(wbi, sector);
+                       /* we don't do zerocopy on a HighMem page. Raid6 tends
+                        * to prepare all of the pages' content to be accessed
+                        * before computing the PQ parity. If we need to support
+                        * HighMem pages as well, we have to modify the
+                        * gen_syndrome() algorithm. -jay */
+                       if (page && !PageHighMem(page)) {
+                               atomic_inc(&conf->writes_zcopy);
+                               sh->dev[i].req.bi_io_vec[0].bv_page = page;
+                               set_bit(R5_Direct, &sh->dev[i].flags);
+                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+                               continue;
                        }
+               }
 
-                       set_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
+               atomic_inc(&conf->writes_copied);
+               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+               set_bit(R5_UPTODATE, &sh->dev[i].flags);
+               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+                       copy_data(1, wbi, sh->dev[i].page, sector);
+                       wbi = r5_next_bio(wbi, sector);
                }
+       }
 
 //     switch(method) {
 //     case RECONSTRUCT_WRITE:
@@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip
                count = 0;
                i = d0_idx;
                do {
-                       ptrs[count++] = page_address(sh->dev[i].page);
-                       if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+                       if (test_bit(R5_Direct, &sh->dev[i].flags))
+                               ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
+                       else
+                               ptrs[count++] = page_address(sh->dev[i].page);
+                       if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
+                           !test_bit(R5_Direct, &sh->dev[i].flags))
                                printk("block %d/%d not uptodate on parity calc\n", i,count);
                        i = raid6_next_disk(i, disks);
                } while ( i != d0_idx );
@@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe
                if (sh->dev[i].written) {
                    dev = &sh->dev[i];
                    if (!test_bit(R5_LOCKED, &dev->flags) &&
-                        test_bit(R5_UPTODATE, &dev->flags) ) {
+                        (test_bit(R5_UPTODATE, &dev->flags) ||
+                         test_bit(R5_Direct, &dev->flags)) ) {
                        /* We can return any write requests */
                            struct bio *wbi, *wbi2;
                            int bitmap_end = 0;
@@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe
                            spin_lock_irq(&conf->device_lock);
                            wbi = dev->written;
                            dev->written = NULL;
+                           clear_bit(R5_Direct, &dev->flags);
                            while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
                                    wbi2 = r5_next_bio(wbi, dev->sector);
                                    if (--wbi->bi_phys_segments == 0) {
@@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe
                        if (sh->dev[i].written) {
                                dev = &sh->dev[i];
                                if (!test_bit(R5_LOCKED, &dev->flags) &&
-                                   test_bit(R5_UPTODATE, &dev->flags) ) {
+                                   (test_bit(R5_UPTODATE, &dev->flags) ||
+                                    test_bit(R5_Direct, &dev->flags)) ) {
                                        /* We can return any write requests */
                                        int bitmap_end = 0;
                                        struct bio *wbi, *wbi2;
@@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe
                                        spin_lock_irq(&conf->device_lock);
                                        wbi = dev->written;
                                        dev->written = NULL;
+                                       clear_bit(R5_Direct, &dev->flags);
                                        while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
                                                wbi2 = r5_next_bio(wbi, dev->sector);
                                                if (--wbi->bi_phys_segments == 0) {
@@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev)
        mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
        mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
 
+       /* raid5 device is able to do zcopy right now. */
+       mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
+
        return 0;
 abort:
        if (conf) {
@@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq
                        atomic_read(&conf->handled_in_raid5d),
                        atomic_read(&conf->out_of_stripes),
                        atomic_read(&conf->handle_called));
-       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
+       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
                        atomic_read(&conf->reads_for_rmw),
-                       atomic_read(&conf->reads_for_rcw));
+                       atomic_read(&conf->reads_for_rcw),
+                       atomic_read(&conf->writes_zcopy),
+                       atomic_read(&conf->writes_copied));
        seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
                        atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
                        atomic_read(&conf->active_stripes),
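
The heart of the raid5.c change is zero_copy_data(): a stripe unit is written directly from the submitter's memory only when one page covers the whole unit and that page has been marked constant. Below is a minimal userspace model of that eligibility walk; struct seg and zero_copy_page() are simplified stand-ins for the kernel's bio/bio_vec and are not part of the patch.

/* Userspace model of the zero_copy_data() walk above.  A stripe unit
 * qualifies for zerocopy only when a single constant page covers it
 * from its first sector to its last. */
#include <stdio.h>
#include <stdbool.h>

#define STRIPE_SECTORS 8	/* 4 KiB stripe unit in 512-byte sectors */

struct seg { int page_id; unsigned len_sectors; bool constant; };

/* Returns the page to write from directly, or -1 for the copy path. */
static int zero_copy_page(const struct seg *segs, int nsegs,
			  unsigned long bi_sector, unsigned long sector)
{
	int page = -1;

	for (int i = 0; i < nsegs; i++) {
		if (sector == bi_sector)
			page = segs[i].page_id;	/* segment starts the unit */
		bi_sector += segs[i].len_sectors;
		if (bi_sector >= sector + STRIPE_SECTORS) {
			/* unit ends here: same page, and marked constant? */
			if (page == segs[i].page_id && segs[i].constant)
				return page;
			return -1;
		}
	}
	return -1;
}

int main(void)
{
	struct seg whole = { 42, STRIPE_SECTORS, true };
	struct seg split[2] = { { 1, 4, true }, { 2, 4, true } };

	printf("%d\n", zero_copy_page(&whole, 1, 100, 100));	/* 42 */
	printf("%d\n", zero_copy_page(split, 2, 100, 100));	/* -1 */
	return 0;
}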
diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h
--- linux-2.6.18-53.orig/include/linux/backing-dev.h    2007-12-28 14:49:26.000000000 +0800
+++ linux-2.6.18-53/include/linux/backing-dev.h 2007-12-28 19:09:32.000000000 +0800
@@ -48,6 +48,7 @@ struct backing_dev_info {
 #define BDI_CAP_READ_MAP       0x00000010      /* Can be mapped for reading */
 #define BDI_CAP_WRITE_MAP      0x00000020      /* Can be mapped for writing */
 #define BDI_CAP_EXEC_MAP       0x00000040      /* Can be mapped for execution */
+#define BDI_CAP_PAGE_CONSTANT_WRITE    0x00000080      /* Zcopy write - for raid5 */
 #define BDI_CAP_VMFLAGS \
        (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
 
@@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
 #define bdi_cap_account_dirty(bdi) \
        (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
 
+#define bdi_cap_page_constant_write(bdi) \
+       ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
+
 #define mapping_cap_writeback_dirty(mapping) \
        bdi_cap_writeback_dirty((mapping)->backing_dev_info)
 
 #define mapping_cap_account_dirty(mapping) \
        bdi_cap_account_dirty((mapping)->backing_dev_info)
 
+#define mapping_cap_page_constant_write(mapping) \
+       bdi_cap_page_constant_write((mapping)->backing_dev_info)
+
+
 
 #endif         /* _LINUX_BACKING_DEV_H */
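
The new capability bit only advertises support; a client of this interface (Lustre, in this series) still has to check it before marking pages constant. A hedged sketch of such a caller follows; try_constant_write() is hypothetical and not part of the patch.

/* Hypothetical caller: attempt a constant-page (zerocopy) write only
 * when the backing device advertises BDI_CAP_PAGE_CONSTANT_WRITE,
 * i.e. it promises to restore the page state when the I/O completes. */
#include <linux/mm.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>		/* SWAP_SUCCESS / SWAP_FAIL */

static int try_constant_write(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (!mapping || !mapping_cap_page_constant_write(mapping))
		return SWAP_FAIL;	/* device won't honour PG_constant */

	return set_page_constant(page);	/* added in the mm/filemap.c hunk */
}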
diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h
--- linux-2.6.18-53.orig/include/linux/page-flags.h     2007-12-28 14:49:26.000000000 +0800
+++ linux-2.6.18-53/include/linux/page-flags.h  2007-12-28 19:09:32.000000000 +0800
@@ -86,6 +86,7 @@
 #define PG_reclaim             17      /* To be reclaimed asap */
 #define PG_nosave_free         18      /* Free, should not be written */
 #define PG_buddy               19      /* Page is free, on buddy lists */
+#define PG_constant            20      /* To mark if the page is constant */
 #define PG_xpmem               27      /* Testing for xpmem. */
 
 /* PG_owner_priv_1 users should have descriptive aliases */
@@ -252,6 +253,14 @@
 
 struct page;   /* forward declaration */
 
+#define PageConstant(page)     test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page)  set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+
+extern int set_page_constant(struct page *page);
+extern void clear_page_constant(struct page *);
+
 int test_clear_page_dirty(struct page *page);
 int test_clear_page_writeback(struct page *page);
 int test_set_page_writeback(struct page *page);
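
Taken together, the four macros and the two helpers define a short life cycle for PG_constant. The sketch below shows the intended use from a writeback path; submit_direct_write() and submit_copy_write() are hypothetical placeholders, and only set_page_constant()/clear_page_constant() come from this patch.

/* Illustrative PG_constant life cycle in a writepage-style path. */
static void writepage_zerocopy(struct page *page)
{
	lock_page(page);
	set_page_writeback(page);

	if (set_page_constant(page) == SWAP_SUCCESS) {
		/* The page is now unmapped and !PageUptodate, so any
		 * user access faults and waits; its contents stay
		 * stable while the device DMAs from it directly. */
		submit_direct_write(page);
		/* At I/O completion end_page_writeback() calls
		 * clear_page_constant(), which restores PG_uptodate
		 * and unlocks the page on our behalf. */
	} else {
		submit_copy_write(page);	/* ordinary copying path */
		unlock_page(page);
	}
}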
diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
--- linux-2.6.18-53.orig/include/linux/raid/raid5.h     2007-12-28 18:55:24.000000000 +0800
+++ linux-2.6.18-53/include/linux/raid/raid5.h  2007-12-28 19:09:32.000000000 +0800
@@ -156,8 +156,9 @@ struct stripe_head {
 #define        R5_Overlap      7       /* There is a pending overlapping request on this block */
 #define        R5_ReadError    8       /* seen a read error here recently */
 #define        R5_ReWrite      9       /* have tried to over-write the readerror */
-
 #define        R5_Expanded     10      /* This block now has post-expand data */
+#define        R5_Direct       11      /* Use the pages in bio to do the write directly. */
+
 /*
  * Write method
  */
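
R5_Direct records that dev->req.bi_io_vec[0].bv_page has been re-pointed from the stripe-cache page to the submitter's page; the raid5_end_write_request() hunk above restores it on completion. A toy model of that invariant, with simplified types that are not kernel code:

#include <assert.h>

struct r5dev_model {
	void *cache_page;	/* sh->dev[i].page, owned by the stripe cache */
	void *bv_page;		/* dev->req.bi_io_vec[0].bv_page */
	int direct;		/* stands in for R5_Direct */
};

/* Mirrors the completion logic: a direct write must have been issued
 * from a borrowed page, which is swapped back for the cache page so
 * the stripe can be reused. */
static void end_write(struct r5dev_model *dev)
{
	if (dev->direct) {
		assert(dev->bv_page != dev->cache_page);
		dev->bv_page = dev->cache_page;
	}
}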
diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c
--- linux-2.6.18-53.orig/mm/filemap.c   2007-12-28 14:49:26.000000000 +0800
+++ linux-2.6.18-53/mm/filemap.c        2007-12-28 19:09:32.000000000 +0800
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/rmap.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
                if (!test_clear_page_writeback(page))
                        BUG();
        }
+       clear_page_constant(page);
        smp_mb__after_clear_bit();
        wake_up_page(page, PG_writeback);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+/* Make a page constant: `constant' means any write to this page will
+ * be blocked until clear_page_constant() is called.
+ * The page lock must be held.
+ */
+int set_page_constant(struct page *page)
+{
+       BUG_ON(!PageLocked(page));
+
+       /* If it's an anonymous page that hasn't been added to the swap
+        * cache, return directly because we have no way to swap this page.
+        */
+       if (page_mapping(page) == NULL)
+               return SWAP_FAIL;
+
+       BUG_ON(!PageUptodate(page));
+
+       /* We have to clear the page's uptodate bit before trying to remove
+        * it from the user's page tables because otherwise the page may be
+        * reinstalled by a page access that happens between try_to_unmap()
+        * and ClearPageUptodate(). -jay
+        */
+       ClearPageUptodate(page);
+       if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
+               SetPageUptodate(page);
+               return SWAP_FAIL;
+       }
+       SetPageConstant(page);
+       return SWAP_SUCCESS;
+}
+
+void clear_page_constant(struct page *page)
+{
+       if (PageConstant(page)) {
+               BUG_ON(!PageLocked(page));
+               BUG_ON(PageUptodate(page));
+               ClearPageConstant(page);
+               SetPageUptodate(page);
+               unlock_page(page);
+       }
+}
+EXPORT_SYMBOL(set_page_constant);
+EXPORT_SYMBOL(clear_page_constant);
+
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
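
The ordering inside set_page_constant() carries the main subtlety: PG_uptodate must be cleared before try_to_unmap(), because the fault path only re-installs pages it sees as up to date; with the opposite order, a fault racing with the unmap could map the page back writable while the device is still reading it. A small model of that gate follows; page_model and map_into_user() are illustrative, not kernel code.

#include <stddef.h>

struct page_model { int uptodate; };

static void *map_into_user(struct page_model *p) { return p; }	/* stub */

/* Model of the fault path's gate: a page that is not uptodate cannot
 * be re-installed into user page tables, so a fault racing with
 * try_to_unmap() blocks until clear_page_constant() runs. */
static void *fault_in_page(struct page_model *page)
{
	if (!page->uptodate)
		return NULL;
	return map_into_user(page);
}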