Whamcloud - gitweb
b=18649 set wait_recovery_complete() MAX value to max recovery time estimated
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid5-zerocopy.patch
1 diff -pur linux-2.6.9-67.orig/drivers/md/raid5.c linux-2.6.9-67/drivers/md/raid5.c
2 --- linux-2.6.9-67.orig/drivers/md/raid5.c      2009-02-15 10:11:54.000000000 +0800
3 +++ linux-2.6.9-67/drivers/md/raid5.c   2009-02-15 10:22:51.000000000 +0800
4 @@ -412,6 +412,9 @@ static int raid5_end_read_request (struc
5                 clear_buffer_uptodate(bh);
6         }
7  #endif
8 +       /* Read on a Direct write is allowable */
9 +       /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
10 +       BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
11         clear_bit(R5_LOCKED, &sh->dev[i].flags);
12         set_bit(STRIPE_HANDLE, &sh->state);
13         release_stripe(sh);
14 @@ -450,6 +453,10 @@ static int raid5_end_write_request (stru
15  
16         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
17         
18 +       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
19 +               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
20 +               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
21 +       }
22         clear_bit(R5_LOCKED, &sh->dev[i].flags);
23         set_bit(STRIPE_HANDLE, &sh->state);
24         __release_stripe(conf, sh);
25 @@ -620,7 +627,27 @@ static sector_t compute_blocknr(struct s
26         return r_sector;
27  }
28  
29 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
30 +{
31 +       sector_t bi_sector = bio->bi_sector;
32 +       struct page *page = NULL;
33 +       struct bio_vec *bvl;
34 +       int i;
35  
36 +       bio_for_each_segment(bvl, bio, i) {
37 +               if (sector == bi_sector)
38 +                       page = bio_iovec_idx(bio, i)->bv_page;
39 +               bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
40 +               if (bi_sector >= sector + STRIPE_SECTORS) {
41 +                       /* check if the stripe is covered by one page */
42 +                       if (page == bio_iovec_idx(bio, i)->bv_page &&
43 +                           PageConstant(page))
44 +                               return page;
45 +                       return NULL;
46 +               }
47 +       }
48 +       return NULL;
49 +}
50  
51  /*
52   * Copy data between a page in the stripe cache, and one or more bion
53 @@ -716,8 +743,9 @@ static void compute_parity(struct stripe
54  {
55         raid5_conf_t *conf = sh->raid_conf;
56         int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
57 -       void *ptr[MAX_XOR_BLOCKS];
58 +       void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
59         struct bio *chosen;
60 +       struct page *page;
61  
62         PRINTK("compute_parity, stripe %llu, method %d\n",
63                 (unsigned long long)sh->sector, method);
64 @@ -744,13 +772,14 @@ static void compute_parity(struct stripe
65                 break;
66         case RECONSTRUCT_WRITE:
67                 memset(ptr[0], 0, STRIPE_SIZE);
68 -               for (i= disks; i-- ;)
69 +               for (i= disks; i-- ;) {
70                         if (i!=pd_idx && sh->dev[i].towrite) {
71                                 chosen = sh->dev[i].towrite;
72                                 sh->dev[i].towrite = NULL;
73                                 if (sh->dev[i].written) BUG();
74                                 sh->dev[i].written = chosen;
75                         }
76 +               }
77                 break;
78         case CHECK_PARITY:
79                 break;
80 @@ -760,34 +789,90 @@ static void compute_parity(struct stripe
81                 count = 1;
82         }
83         
84 -       for (i = disks; i--;)
85 -               if (sh->dev[i].written) {
86 -                       sector_t sector = sh->dev[i].sector;
87 -                       struct bio *wbi = sh->dev[i].written;
88 -                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
89 -                               copy_data(1, wbi, sh->dev[i].page, sector);
90 -                               wbi = r5_next_bio(wbi, sector);
91 +       for (i = disks; i--;) {
92 +               struct bio *wbi = sh->dev[i].written;
93 +               sector_t sector;
94 +
95 +               if (!wbi)
96 +                       continue;
97 +
98 +               sector = sh->dev[i].sector;
99 +               set_bit(R5_LOCKED, &sh->dev[i].flags);
100 +               BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
101 +
102 +               /* check if it's covered by a single page
103 +                * and the whole stripe is written at once;
104 +                * in this case we can avoid memcpy() */
105 +               if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) &&
106 +                   test_bit(R5_Insync, &sh->dev[i].flags)) {
107 +                       page = zero_copy_data(wbi, sector);
108 +                       if (page) {
109 +                               atomic_inc(&conf->writes_zcopy);
110 +                               /* The pointer must be restored whenever the LOCKED
111 +                                * gets cleared. */
112 +                               sh->dev[i].req.bi_io_vec[0].bv_page = page;
113 +                               set_bit(R5_Direct, &sh->dev[i].flags);
114 +                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
115 +                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
116 +                               continue;
117                         }
118 +               }
119  
120 -                       set_bit(R5_LOCKED, &sh->dev[i].flags);
121 -                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
122 +               atomic_inc(&conf->writes_copied);
123 +               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
124 +               set_bit(R5_UPTODATE, &sh->dev[i].flags);
125 +               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
126 +                       copy_data(1, wbi, sh->dev[i].page, sector);
127 +                       wbi = r5_next_bio(wbi, sector);
128                 }
129 +       }
130  
131 +       h_ptr[0] = ptr[0];
132         switch(method) {
133         case RECONSTRUCT_WRITE:
134         case CHECK_PARITY:
135 -               for (i=disks; i--;)
136 -                       if (i != pd_idx) {
137 -                               ptr[count++] = page_address(sh->dev[i].page);
138 -                               check_xor();
139 +               for (i=disks; i--;) {
140 +                       if (i == pd_idx)
141 +                               continue;
142 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
143 +                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
144 +                       else
145 +                               page = sh->dev[i].page;
146 +
147 +                       /* have to compute the parity immediately for
148 +                        * a highmem page. it would happen for zerocopy. -jay
149 +                        */
150 +                       if (PageHighMem(page)) {
151 +                               h_ptr[1] = kmap_atomic(page, KM_USER0);
152 +                               xor_block(2, STRIPE_SIZE, h_ptr);
153 +                               kunmap_atomic(page, KM_USER0);
154 +                       } else {
155 +                               ptr[count++] = page_address(page);
156                         }
157 +                       check_xor();
158 +               }
159                 break;
160         case READ_MODIFY_WRITE:
161 -               for (i = disks; i--;)
162 -                       if (sh->dev[i].written) {
163 -                               ptr[count++] = page_address(sh->dev[i].page);
164 -                               check_xor();
165 +               for (i = disks; i--;) {
166 +                       if (!sh->dev[i].written)
167 +                               continue;
168 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
169 +                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
170 +                       else
171 +                               page = sh->dev[i].page;
172 +
173 +                       /* have to compute the parity immediately for
174 +                        * a highmem page. it would happen for zerocopy. -jay
175 +                        */
176 +                       if (PageHighMem(page)) {
177 +                               h_ptr[1] = kmap_atomic(page, KM_USER0);
178 +                               xor_block(2, STRIPE_SIZE, h_ptr);
179 +                               kunmap_atomic(page, KM_USER0);
180 +                       } else {
181 +                               ptr[count++] = page_address(page);
182                         }
183 +                       check_xor();
184 +               }
185         }
186         if (count != 1)
187                 xor_block(count, STRIPE_SIZE, ptr);
188 @@ -1061,13 +1146,15 @@ static void handle_stripe(struct stripe_
189                 if (sh->dev[i].written) {
190                     dev = &sh->dev[i];
191                     if (!test_bit(R5_LOCKED, &dev->flags) &&
192 -                        test_bit(R5_UPTODATE, &dev->flags) ) {
193 +                        (test_bit(R5_UPTODATE, &dev->flags) ||
194 +                               test_bit(R5_Direct, &dev->flags)) ) {
195                         /* We can return any write requests */
196                             struct bio *wbi, *wbi2;
197                             PRINTK("Return write for disc %d\n", i);
198                             spin_lock_irq(&conf->device_lock);
199                             wbi = dev->written;
200                             dev->written = NULL;
201 +                           clear_bit(R5_Direct, &dev->flags);
202                             while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
203                                     wbi2 = r5_next_bio(wbi, dev->sector);
204                                     if (--wbi->bi_phys_segments == 0) {
205 @@ -1337,6 +1424,15 @@ static void handle_stripe(struct stripe_
206                 } else {
207                         PRINTK("skip op %ld on disc %d for sector %llu\n",
208                                 bi->bi_rw, i, (unsigned long long)sh->sector);
209 +
210 +                       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
211 +                       /* restore the page pointer of req; otherwise
212 +                        * no read would be permitted on this stripe, which
213 +                        * is not what we want. -jay */
214 +                               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
215 +                               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
216 +                       }
217 +
218                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
219                         set_bit(STRIPE_HANDLE, &sh->state);
220                 }
221 @@ -1835,6 +1931,7 @@ memory = conf->max_nr_stripes * (sizeof(
222                 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
223                         mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
224         }
225 +       mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
226  
227         /* Ok, everything is just fine now */
228         mddev->array_size =  mddev->size * (mddev->raid_disks - 1);
229 @@ -1922,9 +2019,11 @@ static void status (struct seq_file *seq
230                         atomic_read(&conf->handled_in_raid5d),
231                         atomic_read(&conf->out_of_stripes),
232                         atomic_read(&conf->handle_called));
233 -       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
234 +       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
235                         atomic_read(&conf->reads_for_rmw),
236 -                       atomic_read(&conf->reads_for_rcw));
237 +                       atomic_read(&conf->reads_for_rcw),
238 +                       atomic_read(&conf->writes_zcopy),
239 +                       atomic_read(&conf->writes_copied));
240         seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
241                         atomic_read(&conf->delayed),
242                         atomic_read(&conf->active_stripes),
243 diff -pur linux-2.6.9-67.orig/include/linux/backing-dev.h linux-2.6.9-67/include/linux/backing-dev.h
244 --- linux-2.6.9-67.orig/include/linux/backing-dev.h     2009-02-15 10:11:54.000000000 +0800
245 +++ linux-2.6.9-67/include/linux/backing-dev.h  2009-02-15 10:22:40.000000000 +0800
246 @@ -30,8 +30,11 @@ struct backing_dev_info {
247         void *congested_data;   /* Pointer to aux data for congested func */
248         void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
249         void *unplug_io_data;
250 +       unsigned int capabilities;
251  };
252  
253 +#define BDI_CAP_PAGE_CONST_WRITE      0x00000001
254 +
255  extern struct backing_dev_info default_backing_dev_info;
256  void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
257  
258 @@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc
259                                   (1 << BDI_write_congested));
260  }
261  
262 +#define mapping_cap_page_constant_write(mapping) \
263 +       ((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE)
264 +
265  #endif         /* _LINUX_BACKING_DEV_H */
266 diff -pur linux-2.6.9-67.orig/include/linux/page-flags.h linux-2.6.9-67/include/linux/page-flags.h
267 --- linux-2.6.9-67.orig/include/linux/page-flags.h      2009-02-15 10:11:54.000000000 +0800
268 +++ linux-2.6.9-67/include/linux/page-flags.h   2009-02-15 10:22:40.000000000 +0800
269 @@ -74,6 +74,7 @@
270  #define PG_swapcache           16      /* Swap page: swp_entry_t in private */
271  #define PG_mappedtodisk                17      /* Has blocks allocated on-disk */
272  #define PG_reclaim             18      /* To be reclaimed asap */
273 +#define PG_constant            19  /* To mark the page is constant */
274  
275  
276  /*
277 @@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
278  #define PageSwapCache(page)    0
279  #endif
280  
281 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
282 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
283 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
284 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
285 +
286  struct page;   /* forward declaration */
287  
288  int test_clear_page_dirty(struct page *page);
289 diff -pur linux-2.6.9-67.orig/include/linux/pagemap.h linux-2.6.9-67/include/linux/pagemap.h
290 --- linux-2.6.9-67.orig/include/linux/pagemap.h 2009-02-15 10:11:54.000000000 +0800
291 +++ linux-2.6.9-67/include/linux/pagemap.h      2009-02-15 10:22:40.000000000 +0800
292 @@ -191,6 +191,19 @@ static inline void wait_on_page_writebac
293  
294  extern void end_page_writeback(struct page *page);
295  
296 +extern int set_page_constant(struct page *page);
297 +extern void clear_page_constant(struct page *);
298 +static inline int set_page_constant_lock(struct page *page)
299 +{
300 +        BUG_ON(PageLocked(page));
301 +        lock_page(page);
302 +        if (set_page_constant(page)) {
303 +                unlock_page(page);
304 +                return 1;
305 +        }
306 +        return 0;
307 +}
308 +
309  /*
310   * Fault a userspace page into pagetables.  Return non-zero on a fault.
311   *
312 diff -pur linux-2.6.9-67.orig/include/linux/raid/raid5.h linux-2.6.9-67/include/linux/raid/raid5.h
313 --- linux-2.6.9-67.orig/include/linux/raid/raid5.h      2009-02-15 10:11:54.000000000 +0800
314 +++ linux-2.6.9-67/include/linux/raid/raid5.h   2009-02-15 10:22:40.000000000 +0800
315 @@ -153,6 +153,7 @@ struct stripe_head {
316  #define        R5_Wantread     4       /* want to schedule a read */
317  #define        R5_Wantwrite    5
318  #define        R5_Syncio       6       /* this io need to be accounted as resync io */
319 +#define        R5_Direct       7       /* use page from passed bio to avoid memcpy */
320  
321  /*
322   * Write method
323 @@ -234,6 +235,8 @@ struct raid5_private_data {
324         atomic_t                out_of_stripes;
325         atomic_t                reads_for_rmw;
326         atomic_t                reads_for_rcw;
327 +       atomic_t                writes_zcopy;
328 +       atomic_t                writes_copied;
329         atomic_t                handle_called;
330         atomic_t                delayed;
331         atomic_t                in_reqs_in_queue;
332 diff -pur linux-2.6.9-67.orig/mm/filemap.c linux-2.6.9-67/mm/filemap.c
333 --- linux-2.6.9-67.orig/mm/filemap.c    2009-02-15 10:11:55.000000000 +0800
334 +++ linux-2.6.9-67/mm/filemap.c 2009-02-15 10:22:40.000000000 +0800
335 @@ -27,6 +27,8 @@
336  #include <linux/pagevec.h>
337  #include <linux/blkdev.h>
338  #include <linux/security.h>
339 +#include <linux/rmap.h>
340 +
341  /*
342   * This is needed for the following functions:
343   *  - try_to_release_page
344 @@ -485,11 +487,52 @@ void end_page_writeback(struct page *pag
345                         BUG();
346                 smp_mb__after_clear_bit();
347         }
348 +       clear_page_constant(page);
349         wake_up_page(page);
350  }
351  
352  EXPORT_SYMBOL(end_page_writeback);
353  
354 +/* Mark a page in bio to be constant, page must be locked */
355 +int set_page_constant(struct page *page)
356 +{
357 +       BUG_ON(!PageLocked(page));
358 +
359 +       /* If it's an anonymous page and hasn't been added to the swap cache,
360 +        * do it here.
361 +        */
362 +       if (PageAnon(page) && !PageSwapCache(page))
363 +               return 1;
364 +
365 +       BUG_ON(!PageUptodate(page));
366 +
367 +       /* I have to clear page uptodate before trying to remove
368 +        * it from user's page table because otherwise, the page may be
369 +        * reinstalled by a page access which happens between try_to_unmap()
370 +        * and ClearPageUptodate(). -jay
371 +        */
372 +       ClearPageUptodate(page);
373 +       if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) {
374 +               SetPageUptodate(page);
375 +               return 1;
376 +       }
377 +       SetPageConstant(page);
378 +       return 0;
379 +}
380 +
381 +void clear_page_constant(struct page *page)
382 +{
383 +       if (PageConstant(page)) {
384 +               BUG_ON(!PageLocked(page));
385 +               BUG_ON(PageUptodate(page));
386 +               ClearPageConstant(page);
387 +               SetPageUptodate(page);
388 +               unlock_page(page);
389 +       }
390 +}
391 +EXPORT_SYMBOL(set_page_constant);
392 +EXPORT_SYMBOL(clear_page_constant);
393 +
394  /*
395   * Get a lock on the page, assuming we need to sleep to get it.
396   *