Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid5-zerocopy.patch
1 diff -pru linux-2.6.9.orig/drivers/md/raid5.c linux-2.6.9/drivers/md/raid5.c
2 --- linux-2.6.9.orig/drivers/md/raid5.c 2007-07-09 02:43:33.000000000 -0600
3 +++ linux-2.6.9/drivers/md/raid5.c      2007-07-13 00:39:15.000000000 -0600
4 @@ -412,6 +412,7 @@ static int raid5_end_read_request (struc
5                 clear_buffer_uptodate(bh);
6         }
7  #endif
8 +       BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
9         clear_bit(R5_LOCKED, &sh->dev[i].flags);
10         set_bit(STRIPE_HANDLE, &sh->state);
11         release_stripe(sh);
12 @@ -450,6 +451,10 @@ static int raid5_end_write_request (stru
13  
14         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
15         
16 +       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
17 +               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
18 +               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
19 +       }
20         clear_bit(R5_LOCKED, &sh->dev[i].flags);
21         set_bit(STRIPE_HANDLE, &sh->state);
22         __release_stripe(conf, sh);
23 @@ -621,6 +626,25 @@ static sector_t compute_blocknr(struct s
24  }
25  
26  
27 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
28 +{
29 +       sector_t bi_sector = bio->bi_sector;
30 +       struct page *page;
31 +       struct bio_vec *bvl;
32 +       int i;
33 +
34 +       bio_for_each_segment(bvl, bio, i) {
35 +               if (sector > bi_sector) {
36 +                       bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
37 +                       continue;
38 +               }
39 +               BUG_ON(sector != bi_sector);
40 +               page = bio_iovec_idx(bio, i)->bv_page;
41 +               return PageConstant(page) ? page : NULL;
42 +       }
43 +       BUG();
44 +       return NULL;
45 +}
46  
47  /*
48   * Copy data between a page in the stripe cache, and one or more bion
49 @@ -716,8 +740,9 @@ static void compute_parity(struct stripe
50  {
51         raid5_conf_t *conf = sh->raid_conf;
52         int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
53 -       void *ptr[MAX_XOR_BLOCKS];
54 +       void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
55         struct bio *chosen;
56 +       struct page *page;
57  
58         PRINTK("compute_parity, stripe %llu, method %d\n",
59                 (unsigned long long)sh->sector, method);
60 @@ -744,13 +769,14 @@ static void compute_parity(struct stripe
61                 break;
62         case RECONSTRUCT_WRITE:
63                 memset(ptr[0], 0, STRIPE_SIZE);
64 -               for (i= disks; i-- ;)
65 +               for (i= disks; i-- ;) {
66                         if (i!=pd_idx && sh->dev[i].towrite) {
67                                 chosen = sh->dev[i].towrite;
68                                 sh->dev[i].towrite = NULL;
69                                 if (sh->dev[i].written) BUG();
70                                 sh->dev[i].written = chosen;
71                         }
72 +               }
73                 break;
74         case CHECK_PARITY:
75                 break;
76 @@ -760,34 +786,88 @@ static void compute_parity(struct stripe
77                 count = 1;
78         }
79         
80 -       for (i = disks; i--;)
81 -               if (sh->dev[i].written) {
82 -                       sector_t sector = sh->dev[i].sector;
83 -                       struct bio *wbi = sh->dev[i].written;
84 -                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
85 -                               copy_data(1, wbi, sh->dev[i].page, sector);
86 -                               wbi = r5_next_bio(wbi, sector);
87 +       for (i = disks; i--;) {
88 +               struct bio *wbi = sh->dev[i].written;
89 +               sector_t sector;
90 +
91 +               if (!wbi)
92 +                       continue;
93 +
94 +               sector = sh->dev[i].sector;
95 +               set_bit(R5_LOCKED, &sh->dev[i].flags);
96 +               BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
97 +
98 +               /* check if it's covered by a single page
99 +                * and whole stripe is written at once.
100 +                * in this case we can avoid memcpy() */
101 +               if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) &&
102 +                   test_bit(R5_Insync, &sh->dev[i].flags)) {
103 +                       page = zero_copy_data(wbi, sector);
104 +                       if (page) {
105 +                               atomic_inc(&conf->writes_zcopy);
106 +                               sh->dev[i].req.bi_io_vec[0].bv_page = page;
107 +                               set_bit(R5_Direct, &sh->dev[i].flags);
108 +                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
109 +                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
110 +                               continue;
111                         }
112 +               }
113  
114 -                       set_bit(R5_LOCKED, &sh->dev[i].flags);
115 -                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
116 +               atomic_inc(&conf->writes_copied);
117 +               test_and_clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
118 +               set_bit(R5_UPTODATE, &sh->dev[i].flags);
119 +               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
120 +                       copy_data(1, wbi, sh->dev[i].page, sector);
121 +                       wbi = r5_next_bio(wbi, sector);
122                 }
123 +       }
124  
125 +       h_ptr[0] = ptr[0];
126         switch(method) {
127         case RECONSTRUCT_WRITE:
128         case CHECK_PARITY:
129 -               for (i=disks; i--;)
130 -                       if (i != pd_idx) {
131 -                               ptr[count++] = page_address(sh->dev[i].page);
132 -                               check_xor();
133 +               for (i=disks; i--;) {
134 +                       if (i == pd_idx)
135 +                               continue;
136 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
137 +                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
138 +                       else
139 +                               page = sh->dev[i].page;
140 +
141 +                       /* have to compute the parity immediately for
142 +                        * a highmem page. it would happen for zerocopy. -jay
143 +                        */
144 +                       if (PageHighMem(page)) {
145 +                               h_ptr[1] = kmap_atomic(page, KM_USER0);
146 +                               xor_block(2, STRIPE_SIZE, h_ptr);
147 +                               kunmap_atomic(page, KM_USER0);
148 +                       } else {
149 +                               ptr[count++] = page_address(page);
150                         }
151 +                       check_xor();
152 +               }
153                 break;
154         case READ_MODIFY_WRITE:
155 -               for (i = disks; i--;)
156 -                       if (sh->dev[i].written) {
157 -                               ptr[count++] = page_address(sh->dev[i].page);
158 -                               check_xor();
159 +               for (i = disks; i--;) {
160 +                       if (!sh->dev[i].written)
161 +                               continue;
162 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
163 +                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
164 +                       else
165 +                               page = sh->dev[i].page;
166 +
167 +                       /* have to compute the parity immediately for
168 +                        * a highmem page. it would happen for zerocopy. -jay
169 +                        */
170 +                       if (PageHighMem(page)) {
171 +                               h_ptr[1] = kmap_atomic(page, KM_USER0);
172 +                               xor_block(2, STRIPE_SIZE, h_ptr);
173 +                               kunmap_atomic(page, KM_USER0);
174 +                       } else {
175 +                               ptr[count++] = page_address(page);
176                         }
177 +                       check_xor();
178 +               }
179         }
180         if (count != 1)
181                 xor_block(count, STRIPE_SIZE, ptr);
182 @@ -1059,13 +1139,15 @@ static void handle_stripe(struct stripe_
183                 if (sh->dev[i].written) {
184                     dev = &sh->dev[i];
185                     if (!test_bit(R5_LOCKED, &dev->flags) &&
186 -                        test_bit(R5_UPTODATE, &dev->flags) ) {
187 +                        (test_bit(R5_UPTODATE, &dev->flags) ||
188 +                               test_bit(R5_Direct, &dev->flags)) ) {
189                         /* We can return any write requests */
190                             struct bio *wbi, *wbi2;
191                             PRINTK("Return write for disc %d\n", i);
192                             spin_lock_irq(&conf->device_lock);
193                             wbi = dev->written;
194                             dev->written = NULL;
195 +                           test_and_clear_bit(R5_Direct, &dev->flags);
196                             while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
197                                     wbi2 = r5_next_bio(wbi, dev->sector);
198                                     if (--wbi->bi_phys_segments == 0) {
199 @@ -1831,6 +1913,7 @@ memory = conf->max_nr_stripes * (sizeof(
200                 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
201                         mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
202         }
203 +       mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
204  
205         /* Ok, everything is just fine now */
206         mddev->array_size =  mddev->size * (mddev->raid_disks - 1);
207 @@ -1918,9 +2001,11 @@ static void status (struct seq_file *seq
208                         atomic_read(&conf->handled_in_raid5d),
209                         atomic_read(&conf->out_of_stripes),
210                         atomic_read(&conf->handle_called));
211 -       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
212 +       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
213                         atomic_read(&conf->reads_for_rmw),
214 -                       atomic_read(&conf->reads_for_rcw));
215 +                       atomic_read(&conf->reads_for_rcw),
216 +                       atomic_read(&conf->writes_zcopy),
217 +                       atomic_read(&conf->writes_copied));
218         seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
219                         atomic_read(&conf->delayed),
220                         atomic_read(&conf->active_stripes),
221 diff -pru linux-2.6.9.orig/include/linux/backing-dev.h linux-2.6.9/include/linux/backing-dev.h
222 --- linux-2.6.9.orig/include/linux/backing-dev.h        2004-10-18 15:53:46.000000000 -0600
223 +++ linux-2.6.9/include/linux/backing-dev.h     2007-07-13 00:12:46.000000000 -0600
224 @@ -30,8 +30,11 @@ struct backing_dev_info {
225         void *congested_data;   /* Pointer to aux data for congested func */
226         void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
227         void *unplug_io_data;
228 +       unsigned int capabilities;
229  };
230  
231 +#define BDI_CAP_PAGE_CONST_WRITE      0x00000001
232 +
233  extern struct backing_dev_info default_backing_dev_info;
234  void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
235  
236 @@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc
237                                   (1 << BDI_write_congested));
238  }
239  
240 +#define mapping_cap_page_constant_write(mapping) \
241 +       ((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE)
242 +
243  #endif         /* _LINUX_BACKING_DEV_H */
244 diff -pru linux-2.6.9.orig/include/linux/page-flags.h linux-2.6.9/include/linux/page-flags.h
245 --- linux-2.6.9.orig/include/linux/page-flags.h 2004-10-18 15:54:39.000000000 -0600
246 +++ linux-2.6.9/include/linux/page-flags.h      2007-07-13 00:12:46.000000000 -0600
247 @@ -74,6 +74,7 @@
248  #define PG_swapcache           16      /* Swap page: swp_entry_t in private */
249  #define PG_mappedtodisk                17      /* Has blocks allocated on-disk */
250  #define PG_reclaim             18      /* To be reclaimed asap */
251 +#define PG_constant            19  /* To mark the page as constant */
252  
253  
254  /*
255 @@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
256  #define PageSwapCache(page)    0
257  #endif
258  
259 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
260 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
261 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
262 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
263 +
264  struct page;   /* forward declaration */
265  
266  int test_clear_page_dirty(struct page *page);
267 diff -pru linux-2.6.9.orig/include/linux/pagemap.h linux-2.6.9/include/linux/pagemap.h
268 --- linux-2.6.9.orig/include/linux/pagemap.h    2004-10-18 15:53:06.000000000 -0600
269 +++ linux-2.6.9/include/linux/pagemap.h 2007-07-13 00:12:46.000000000 -0600
270 @@ -191,6 +191,19 @@ static inline void wait_on_page_writebac
271  
272  extern void end_page_writeback(struct page *page);
273  
274 +extern int set_page_constant(struct page *page);
275 +extern void clear_page_constant(struct page *);
276 +static inline int set_page_constant_lock(struct page *page)
277 +{
278 +        BUG_ON(PageLocked(page));
279 +        lock_page(page);
280 +        if (set_page_constant(page)) {
281 +                unlock_page(page);
282 +                return 1;
283 +        }
284 +        return 0;
285 +}
286 +
287  /*
288   * Fault a userspace page into pagetables.  Return non-zero on a fault.
289   *
290 diff -pru linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
291 --- linux-2.6.9.orig/include/linux/raid/raid5.h 2007-07-09 02:43:33.000000000 -0600
292 +++ linux-2.6.9/include/linux/raid/raid5.h      2007-07-13 00:39:15.000000000 -0600
293 @@ -153,6 +153,7 @@ struct stripe_head {
294  #define        R5_Wantread     4       /* want to schedule a read */
295  #define        R5_Wantwrite    5
296  #define        R5_Syncio       6       /* this io need to be accounted as resync io */
297 +#define        R5_Direct       7       /* use page from passed bio to avoid memcpy */
298  
299  /*
300   * Write method
301 @@ -234,6 +235,8 @@ struct raid5_private_data {
302         atomic_t                out_of_stripes;
303         atomic_t                reads_for_rmw;
304         atomic_t                reads_for_rcw;
305 +       atomic_t                writes_zcopy;
306 +       atomic_t                writes_copied;
307         atomic_t                handle_called;
308         atomic_t                delayed;
309         atomic_t                in_reqs_in_queue;
310 diff -pru linux-2.6.9.orig/mm/filemap.c linux-2.6.9/mm/filemap.c
311 --- linux-2.6.9.orig/mm/filemap.c       2007-07-09 02:43:33.000000000 -0600
312 +++ linux-2.6.9/mm/filemap.c    2007-07-13 00:12:46.000000000 -0600
313 @@ -27,6 +27,8 @@
314  #include <linux/pagevec.h>
315  #include <linux/blkdev.h>
316  #include <linux/security.h>
317 +#include <linux/rmap.h>
318 +
319  /*
320   * This is needed for the following functions:
321   *  - try_to_release_page
322 @@ -486,11 +488,52 @@ void end_page_writeback(struct page *pag
323                         BUG();
324                 smp_mb__after_clear_bit();
325         }
326 +       clear_page_constant(page);
327         wake_up_page(page);
328  }
329  
330  EXPORT_SYMBOL(end_page_writeback);
331  
332 +/* Mark a page in bio to be constant, page must be locked */
333 +int set_page_constant(struct page *page)
334 +{
335 +       BUG_ON(!PageLocked(page));
336 +
337 +       /* If it's an anonymous page and it hasn't been added to swap cache,
338 +        * do it here.
339 +        */
340 +       if (PageAnon(page) && !PageSwapCache(page))
341 +               return 1;
342 +
343 +       BUG_ON(!PageUptodate(page));
344 +
345 +       /* I have to clear page uptodate before trying to remove
346 +        * it from user's page table because otherwise, the page may be
347 +        * reinstalled by a page access which happens between try_to_unmap()
348 +        * and ClearPageUptodate(). -jay
349 +        */
350 +       ClearPageUptodate(page);
351 +       if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) {
352 +               SetPageUptodate(page);
353 +               return 1;
354 +       }
355 +       SetPageConstant(page);
356 +       return 0;
357 +}
358 +
359 +void clear_page_constant(struct page *page)
360 +{
361 +       if (PageConstant(page)) {
362 +               BUG_ON(!PageLocked(page));
363 +               BUG_ON(PageUptodate(page));
364 +               ClearPageConstant(page);
365 +               SetPageUptodate(page);
366 +               unlock_page(page);
367 +       }
368 +}
369 +EXPORT_SYMBOL(set_page_constant);
370 +EXPORT_SYMBOL(clear_page_constant);
371 +
372  /*
373   * Get a lock on the page, assuming we need to sleep to get it.
374   *