Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid5-zerocopy-rhel5.patch
1 Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
2 ===================================================================
3 --- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c        2009-06-02 23:24:52.000000000 -0600
4 +++ linux-2.6.18-128.1.6/drivers/md/raid5.c     2009-06-02 23:24:55.000000000 -0600
5 @@ -633,6 +633,9 @@
6                 clear_buffer_uptodate(bh);
7         }
8  #endif
9 +       /* Read on a Direct write is allowable */
10 +       /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
11 +       BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
12         clear_bit(R5_LOCKED, &sh->dev[i].flags);
13         set_bit(STRIPE_HANDLE, &sh->state);
14         release_stripe(sh);
15 @@ -669,6 +672,10 @@
16  
17         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
18         
19 +       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
20 +               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
21 +               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
22 +       }
23         clear_bit(R5_LOCKED, &sh->dev[i].flags);
24         set_bit(STRIPE_HANDLE, &sh->state);
25         release_stripe(sh);
26 @@ -910,7 +917,27 @@
27         return r_sector;
28  }
29  
30 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
31 +{
32 +       sector_t bi_sector = bio->bi_sector;
33 +       struct page *page = NULL;
34 +       struct bio_vec *bvl;
35 +       int i;
36  
37 +       bio_for_each_segment(bvl, bio, i) {
38 +               if (sector == bi_sector)
39 +                       page = bio_iovec_idx(bio, i)->bv_page;
40 +               bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
41 +               if (bi_sector >= sector + STRIPE_SECTORS) {
42 +                       /* check if the stripe is covered by one page */
43 +                       if (page == bio_iovec_idx(bio, i)->bv_page &&
44 +                           PageConstant(page))
45 +                               return page;
46 +                       return NULL;
47 +               }
48 +       }
49 +       return NULL;
50 +}
51  
52  /*
53   * Copy data between a page in the stripe cache, and one or more bion
54 @@ -1002,8 +1029,9 @@
55  {
56         raid5_conf_t *conf = sh->raid_conf;
57         int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
58 -       void *ptr[MAX_XOR_BLOCKS];
59 +       void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
60         struct bio *chosen;
61 +       struct page *page;
62  
63         PRINTK("compute_parity5, stripe %llu, method %d\n",
64                 (unsigned long long)sh->sector, method);
65 @@ -1053,34 +1081,92 @@
66                 count = 1;
67         }
68         
69 -       for (i = disks; i--;)
70 -               if (sh->dev[i].written) {
71 -                       sector_t sector = sh->dev[i].sector;
72 -                       struct bio *wbi = sh->dev[i].written;
73 -                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
74 -                               copy_data(1, wbi, sh->dev[i].page, sector);
75 -                               wbi = r5_next_bio(wbi, sector);
76 +       for (i = disks; i--;) {
77 +               struct r5dev *dev = &sh->dev[i];
78 +               struct bio *wbi = dev->written;
79 +               sector_t sector;
80 +
81 +               if (!wbi)
82 +                       continue;
83 +
84 +               sector = dev->sector;
85 +               set_bit(R5_LOCKED, &sh->dev[i].flags);
86 +               BUG_ON(test_bit(R5_Direct, &dev->flags));
87 +
88 +               /* check if it's covered by a single page
89 +                * and whole stripe is written at once.
90 +                * in this case we can avoid memcpy() */
91 +               if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
92 +                   test_bit(R5_Insync, &dev->flags)) {
93 +                       page = zero_copy_data(wbi, sector);
94 +                       if (page) {
95 +                               atomic_inc(&conf->writes_zcopy);
96 +                               /* The pointer must be restored whenever R5_LOCKED
97 +                                * gets cleared. */
98 +                               dev->req.bi_io_vec[0].bv_page = page;
99 +                               set_bit(R5_Direct, &dev->flags);
100 +                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
101 +                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
102 +                               continue;
103                         }
104 +               }
105  
106 -                       set_bit(R5_LOCKED, &sh->dev[i].flags);
107 -                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
108 +               /* do copy write */
109 +               atomic_inc(&conf->writes_copied);
110 +               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
111 +               set_bit(R5_UPTODATE, &sh->dev[i].flags);
112 +               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
113 +                       copy_data(1, wbi, sh->dev[i].page, sector);
114 +                       wbi = r5_next_bio(wbi, sector);
115                 }
116 +       }
117  
118 +       h_ptr[0] = ptr[0];
119         switch(method) {
120         case RECONSTRUCT_WRITE:
121         case CHECK_PARITY:
122 -               for (i=disks; i--;)
123 -                       if (i != pd_idx) {
124 -                               ptr[count++] = page_address(sh->dev[i].page);
125 -                               check_xor();
126 +               for (i=disks; i--;) {
127 +                       if (i == pd_idx)
128 +                               continue;
129 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
130 +                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
131 +                       else
132 +                               page = sh->dev[i].page;
133 +
134 +                       /* have to compute the parity immediately for
135 +                        * a highmem page. it would happen for zerocopy. -jay
136 +                        */
137 +                       if (PageHighMem(page)) {
138 +                               h_ptr[1] = kmap_atomic(page, KM_USER0);
139 +                               xor_block(2, STRIPE_SIZE, h_ptr);
140 +                               kunmap_atomic(page, KM_USER0);
141 +                       } else {
142 +                               ptr[count++] = page_address(page);
143                         }
144 +                       check_xor();
145 +               }
146                 break;
147         case READ_MODIFY_WRITE:
148 -               for (i = disks; i--;)
149 -                       if (sh->dev[i].written) {
150 -                               ptr[count++] = page_address(sh->dev[i].page);
151 -                               check_xor();
152 +               for (i = disks; i--;) {
153 +                       if (!sh->dev[i].written)
154 +                               continue;
155 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
156 +                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
157 +                       else
158 +                               page = sh->dev[i].page;
159 +
160 +                       /* have to compute the parity immediately for
161 +                        * a highmem page. it would happen for zerocopy. -jay
162 +                        */
163 +                       if (PageHighMem(page)) {
164 +                               h_ptr[1] = kmap_atomic(page, KM_USER0);
165 +                               xor_block(2, STRIPE_SIZE, h_ptr);
166 +                               kunmap_atomic(page, KM_USER0);
167 +                       } else {
168 +                               ptr[count++] = page_address(page);
169                         }
170 +                       check_xor();
171 +               }
172         }
173         if (count != 1)
174                 xor_block(count, STRIPE_SIZE, ptr);
175 @@ -1097,6 +1183,7 @@
176         raid6_conf_t *conf = sh->raid_conf;
177         int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
178         struct bio *chosen;
179 +       struct page *page;
180         /**** FIX THIS: This could be very bad if disks is close to 256 ****/
181         void *ptrs[disks];
182  
183 @@ -1126,18 +1213,49 @@
184                 BUG();          /* Not implemented yet */
185         }
186  
187 -       for (i = disks; i--;)
188 -               if (sh->dev[i].written) {
189 -                       sector_t sector = sh->dev[i].sector;
190 -                       struct bio *wbi = sh->dev[i].written;
191 -                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
192 -                               copy_data(1, wbi, sh->dev[i].page, sector);
193 -                               wbi = r5_next_bio(wbi, sector);
194 +       for (i = disks; i--;) {
195 +               struct r5dev *dev = &sh->dev[i];
196 +               struct bio *wbi = dev->written;
197 +               sector_t sector;
198 +
199 +               if (!wbi)
200 +                       continue;
201 +
202 +               sector = sh->dev[i].sector;
203 +               set_bit(R5_LOCKED, &sh->dev[i].flags);
204 +               BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
205 +
206 +               /* check if it's covered by a single page
207 +                * and whole stripe is written at once.
208 +                * in this case we can avoid memcpy() */
209 +               if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
210 +                   test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
211 +                       page = zero_copy_data(wbi, sector);
212 +                       /* we don't do zerocopy on a HighMem page. Raid6 tends
213 +                        * to prepare all of the pages' content to be accessed
214 +                        * before computing PQ parity. If we need to support HighMem
215 +                        * page also, we have to modify the gen_syndrome()
216 +                        * algorithm. -jay */
217 +                       if (page && !PageHighMem(page)) {
218 +                               atomic_inc(&conf->writes_zcopy);
219 +                               /* The pointer must be restored whenever R5_LOCKED
220 +                                * gets cleared. */
221 +                               sh->dev[i].req.bi_io_vec[0].bv_page = page;
222 +                               set_bit(R5_Direct, &sh->dev[i].flags);
223 +                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
224 +                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
225 +                               continue;
226                         }
227 +               }
228  
229 -                       set_bit(R5_LOCKED, &sh->dev[i].flags);
230 -                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
231 +               atomic_inc(&conf->writes_copied);
232 +               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
233 +               set_bit(R5_UPTODATE, &sh->dev[i].flags);
234 +               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
235 +                       copy_data(1, wbi, sh->dev[i].page, sector);
236 +                       wbi = r5_next_bio(wbi, sector);
237                 }
238 +       }
239  
240  //     switch(method) {
241  //     case RECONSTRUCT_WRITE:
242 @@ -1148,8 +1266,12 @@
243                 count = 0;
244                 i = d0_idx;
245                 do {
246 -                       ptrs[count++] = page_address(sh->dev[i].page);
247 -                       if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
248 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
249 +                               ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
250 +                       else
251 +                               ptrs[count++] = page_address(sh->dev[i].page);
252 +                       if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
253 +                           !test_bit(R5_Direct, &sh->dev[i].flags))
254                                 printk("block %d/%d not uptodate on parity calc\n", i,count);
255                         i = raid6_next_disk(i, disks);
256                 } while ( i != d0_idx );
257 @@ -1596,7 +1718,8 @@
258                 if (sh->dev[i].written) {
259                     dev = &sh->dev[i];
260                     if (!test_bit(R5_LOCKED, &dev->flags) &&
261 -                        test_bit(R5_UPTODATE, &dev->flags) ) {
262 +                        (test_bit(R5_UPTODATE, &dev->flags) ||
263 +                         test_bit(R5_Direct, &dev->flags)) ) {
264                         /* We can return any write requests */
265                             struct bio *wbi, *wbi2;
266                             int bitmap_end = 0;
267 @@ -1604,6 +1727,7 @@
268                             spin_lock_irq(&conf->device_lock);
269                             wbi = dev->written;
270                             dev->written = NULL;
271 +                           clear_bit(R5_Direct, &dev->flags);
272                             while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
273                                     wbi2 = r5_next_bio(wbi, dev->sector);
274                                     if (--wbi->bi_phys_segments == 0) {
275 @@ -1967,6 +2091,15 @@
276                                 set_bit(STRIPE_DEGRADED, &sh->state);
277                         PRINTK("skip op %ld on disc %d for sector %llu\n",
278                                 bi->bi_rw, i, (unsigned long long)sh->sector);
279 +
280 +                       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
281 +                               /* restore the page pointer of req, otherwise,
282 +                                * no read is permitted on this stripe, which is
283 +                                * not what we want. -jay */
284 +                               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
285 +                               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
286 +                       }
287 +
288                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
289                         set_bit(STRIPE_HANDLE, &sh->state);
290                 }
291 @@ -2172,7 +2305,8 @@
292                         if (sh->dev[i].written) {
293                                 dev = &sh->dev[i];
294                                 if (!test_bit(R5_LOCKED, &dev->flags) &&
295 -                                   test_bit(R5_UPTODATE, &dev->flags) ) {
296 +                                   (test_bit(R5_UPTODATE, &dev->flags) ||
297 +                                    test_bit(R5_Direct, &dev->flags)) ) {
298                                         /* We can return any write requests */
299                                         int bitmap_end = 0;
300                                         struct bio *wbi, *wbi2;
301 @@ -2181,6 +2315,7 @@
302                                         spin_lock_irq(&conf->device_lock);
303                                         wbi = dev->written;
304                                         dev->written = NULL;
305 +                                       clear_bit(R5_Direct, &dev->flags);
306                                         while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
307                                                 wbi2 = r5_next_bio(wbi, dev->sector);
308                                                 if (--wbi->bi_phys_segments == 0) {
309 @@ -2532,6 +2667,15 @@
310                                 set_bit(STRIPE_DEGRADED, &sh->state);
311                         PRINTK("skip op %ld on disc %d for sector %llu\n",
312                                 bi->bi_rw, i, (unsigned long long)sh->sector);
313 +
314 +                       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
315 +                               /* restore the page pointer of req, otherwise,
316 +                                * no read is permitted on this stripe, which is
317 +                                * not what we want. -jay */
318 +                               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
319 +                               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
320 +                       }
321 +
322                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
323                         set_bit(STRIPE_HANDLE, &sh->state);
324                 }
325 @@ -3451,6 +3595,9 @@
326         mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
327         mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
328  
329 +       /* raid5 device is able to do zcopy right now. */
330 +       mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
331 +
332         return 0;
333  abort:
334         if (conf) {
335 @@ -3537,9 +3684,11 @@
336                         atomic_read(&conf->handled_in_raid5d),
337                         atomic_read(&conf->out_of_stripes),
338                         atomic_read(&conf->handle_called));
339 -       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
340 +       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
341                         atomic_read(&conf->reads_for_rmw),
342 -                       atomic_read(&conf->reads_for_rcw));
343 +                       atomic_read(&conf->reads_for_rcw),
344 +                       atomic_read(&conf->writes_zcopy),
345 +                       atomic_read(&conf->writes_copied));
346         seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
347                         atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
348                         atomic_read(&conf->active_stripes),
349 Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h
350 ===================================================================
351 --- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h       2006-09-19 21:42:06.000000000 -0600
352 +++ linux-2.6.18-128.1.6/include/linux/backing-dev.h    2009-06-02 23:24:55.000000000 -0600
353 @@ -48,6 +48,7 @@
354  #define BDI_CAP_READ_MAP       0x00000010      /* Can be mapped for reading */
355  #define BDI_CAP_WRITE_MAP      0x00000020      /* Can be mapped for writing */
356  #define BDI_CAP_EXEC_MAP       0x00000040      /* Can be mapped for execution */
357 +#define BDI_CAP_PAGE_CONSTANT_WRITE    0x00000080      /* Zcopy write - for raid5 */
358  #define BDI_CAP_VMFLAGS \
359         (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
360  
361 @@ -94,11 +95,18 @@
362  #define bdi_cap_account_dirty(bdi) \
363         (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
364  
365 +#define bdi_cap_page_constant_write(bdi) \
366 +       ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
367 +
368  #define mapping_cap_writeback_dirty(mapping) \
369         bdi_cap_writeback_dirty((mapping)->backing_dev_info)
370  
371  #define mapping_cap_account_dirty(mapping) \
372         bdi_cap_account_dirty((mapping)->backing_dev_info)
373  
374 +#define mapping_cap_page_constant_write(mapping) \
375 +       bdi_cap_page_constant_write((mapping)->backing_dev_info)
376 +
377 +
378  
379  #endif         /* _LINUX_BACKING_DEV_H */
380 Index: linux-2.6.18-128.1.6/include/linux/page-flags.h
381 ===================================================================
382 --- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h        2009-04-14 21:05:24.000000000 -0600
383 +++ linux-2.6.18-128.1.6/include/linux/page-flags.h     2009-06-02 23:24:55.000000000 -0600
384 @@ -86,6 +86,7 @@
385  #define PG_reclaim             17      /* To be reclaimed asap */
386  #define PG_nosave_free         18      /* Free, should not be written */
387  #define PG_buddy               19      /* Page is free, on buddy lists */
388  #define PG_gup                 20      /* Page pin may be because of gup */
389 +#define PG_constant            21      /* To mark if the page is constant */
390  #define PG_xpmem               27      /* Testing for xpmem. */
391  
392  /* PG_owner_priv_1 users should have descriptive aliases */
393 @@ -283,6 +284,14 @@
394  
395  struct page;   /* forward declaration */
396  
397 +#define PageConstant(page)     test_bit(PG_constant, &(page)->flags)
398 +#define SetPageConstant(page)  set_bit(PG_constant, &(page)->flags)
399 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
400 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
401 +
402 +extern int set_page_constant(struct page *page);
403 +extern void clear_page_constant(struct page *);
404 +
405  int test_clear_page_dirty(struct page *page);
406  int test_clear_page_writeback(struct page *page);
407  int test_set_page_writeback(struct page *page);
408 Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
409 ===================================================================
410 --- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h        2009-06-02 23:24:50.000000000 -0600
411 +++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h     2009-06-02 23:24:55.000000000 -0600
412 @@ -156,8 +156,9 @@
413  #define        R5_Overlap      7       /* There is a pending overlapping request on this block */
414  #define        R5_ReadError    8       /* seen a read error here recently */
415  #define        R5_ReWrite      9       /* have tried to over-write the readerror */
416 -
417  #define        R5_Expanded     10      /* This block now has post-expand data */
418 +#define        R5_Direct       11      /* Use the pages in bio to do the write directly. */
419 +
420  /*
421   * Write method
422   */
423 Index: linux-2.6.18-128.1.6/mm/filemap.c
424 ===================================================================
425 --- linux-2.6.18-128.1.6.orig/mm/filemap.c      2009-04-14 21:05:46.000000000 -0600
426 +++ linux-2.6.18-128.1.6/mm/filemap.c   2009-06-02 23:24:55.000000000 -0600
427 @@ -30,6 +30,7 @@
428  #include <linux/security.h>
429  #include <linux/syscalls.h>
430  #include <linux/cpuset.h>
431 +#include <linux/rmap.h>
432  #include "filemap.h"
433  #include "internal.h"
434  
435 @@ -567,11 +568,55 @@
436                 if (!test_clear_page_writeback(page))
437                         BUG();
438         }
439 +       clear_page_constant(page);
440         smp_mb__after_clear_bit();
441         wake_up_page(page, PG_writeback);
442  }
443  EXPORT_SYMBOL(end_page_writeback);
444  
445 +/* Make a page to be constant, `constant' means any write to this page will
446 + * be blocked until clear_page_constant is called.
447 + * The page lock must be held.
448 + */
449 +int set_page_constant(struct page *page)
450 +{
451 +       BUG_ON(!PageLocked(page));
452 +
453 +       /* If it's an anonymous page and haven't been added to swap cache,
454 +        * return directly because we have no way to swap this page.
455 +        */
456 +       if (page_mapping(page) == NULL)
457 +               return SWAP_FAIL;
458 +
459 +       BUG_ON(!PageUptodate(page));
460 +
461 +       /* I have to clear page uptodate before trying to remove
462 +        * it from user's page table because otherwise, the page may be
463 +        * reinstalled by a page access which happens between try_to_unmap()
464 +        * and ClearPageUptodate(). -jay
465 +        */
466 +       ClearPageUptodate(page);
467 +       if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
468 +               SetPageUptodate(page);
469 +               return SWAP_FAIL;
470 +       }
471 +       SetPageConstant(page);
472 +       return SWAP_SUCCESS;
473 +}
474 +
475 +void clear_page_constant(struct page *page)
476 +{
477 +       if (PageConstant(page)) {
478 +               BUG_ON(!PageLocked(page));
479 +               BUG_ON(PageUptodate(page));
480 +               ClearPageConstant(page);
481 +               SetPageUptodate(page);
482 +               unlock_page(page);
483 +       }
484 +}
485 +EXPORT_SYMBOL(set_page_constant);
486 +EXPORT_SYMBOL(clear_page_constant);
487 +
488  /**
489   * __lock_page - get a lock on the page, assuming we need to sleep to get it
490   * @page: the page to lock