Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
===================================================================
--- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c	2009-06-02 23:24:52.000000000 -0600
+++ linux-2.6.18-128.1.6/drivers/md/raid5.c	2009-06-02 23:24:55.000000000 -0600
 		clear_buffer_uptodate(bh);
+	/* Read on a Direct write is allowable */
+	/* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
+	BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+	if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+	sector_t bi_sector = bio->bi_sector;
+	struct page *page = NULL;
+	struct bio_vec *bvl;
+	bio_for_each_segment(bvl, bio, i) {
+		if (sector == bi_sector)
+			page = bio_iovec_idx(bio, i)->bv_page;
+		bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+		if (bi_sector >= sector + STRIPE_SECTORS) {
+			/* check if the stripe is covered by one page */
+			if (page == bio_iovec_idx(bio, i)->bv_page &&
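
Note: the zero_copy_data() hunk above is excerpted with several lines elided. As a reading aid, here is a minimal sketch of the complete check; the helper name is illustrative, and the requirement that the covering page also be marked constant (PageConstant(), added in page-flags.h below) is an assumption here, not verbatim patch text.

static struct page *zero_copy_data_sketch(struct bio *bio, sector_t sector)
{
	sector_t bi_sector = bio->bi_sector;
	struct page *page = NULL;
	struct bio_vec *bvl;
	int i;

	bio_for_each_segment(bvl, bio, i) {
		if (sector == bi_sector)
			page = bio_iovec_idx(bio, i)->bv_page;
		bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
		if (bi_sector >= sector + STRIPE_SECTORS) {
			/* the whole stripe must sit inside this single,
			 * constant page; otherwise fall back to copying */
			if (page == bio_iovec_idx(bio, i)->bv_page &&
			    PageConstant(page))
				return page;
			return NULL;
		}
	}
	return NULL;
}
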
  * Copy data between a page in the stripe cache, and one or more bion
 	raid5_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-	void *ptr[MAX_XOR_BLOCKS];
+	void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
 	PRINTK("compute_parity5, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -1053,34 +1081,92 @@
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
+	for (i = disks; i--;) {
+		struct r5dev *dev = &sh->dev[i];
+		struct bio *wbi = dev->written;
+		sector = dev->sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &dev->flags));
+		/* Check if the write is covered by a single page
+		 * and the whole stripe is written at once;
+		 * in that case we can avoid the memcpy(). */
+		if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
+		    test_bit(R5_Insync, &dev->flags)) {
+			page = zero_copy_data(wbi, sector);
+				atomic_inc(&conf->writes_zcopy);
+				/* The pointer must be restored whenever the LOCKED
+				dev->req.bi_io_vec[0].bv_page = page;
+				set_bit(R5_Direct, &dev->flags);
+				clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+				clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-		set_bit(R5_LOCKED, &sh->dev[i].flags);
-		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			/* do copy write */
+			atomic_inc(&conf->writes_copied);
+			clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+				copy_data(1, wbi, sh->dev[i].page, sector);
+				wbi = r5_next_bio(wbi, sector);
 	case RECONSTRUCT_WRITE:
-		for (i=disks; i--;)
-			ptr[count++] = page_address(sh->dev[i].page);
+		for (i=disks; i--;) {
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+				page = sh->dev[i].page;
+			/* We have to compute the parity immediately for
+			 * a highmem page; this can happen with zerocopy. -jay
+			if (PageHighMem(page)) {
+				h_ptr[1] = kmap_atomic(page, KM_USER0);
+				xor_block(2, STRIPE_SIZE, h_ptr);
+				kunmap_atomic(h_ptr[1], KM_USER0);
+				ptr[count++] = page_address(page);
 	case READ_MODIFY_WRITE:
-		for (i = disks; i--;)
-			if (sh->dev[i].written) {
-				ptr[count++] = page_address(sh->dev[i].page);
+		for (i = disks; i--;) {
+			if (!sh->dev[i].written)
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+				page = sh->dev[i].page;
+			/* We have to compute the parity immediately for
+			 * a highmem page; this can happen with zerocopy. -jay
+			if (PageHighMem(page)) {
+				h_ptr[1] = kmap_atomic(page, KM_USER0);
+				xor_block(2, STRIPE_SIZE, h_ptr);
+				kunmap_atomic(h_ptr[1], KM_USER0);
+				ptr[count++] = page_address(page);
 	xor_block(count, STRIPE_SIZE, ptr);
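
Note: the h_ptr[] handling above exists because page_address() is only valid for lowmem pages; a highmem data page has to be mapped with kmap_atomic() and folded into the parity right away, while lowmem pages can still be batched into ptr[] and folded in by the final xor_block() call. A hedged illustration of that fold-in follows; xor_one_page() is a hypothetical helper, and h_parity is assumed to be page_address() of the (lowmem) parity page.

/* Illustration only: xor one data page into the parity buffer. */
static void xor_one_page(void *h_parity, struct page *data)
{
	void *p[2];

	p[0] = h_parity;
	if (PageHighMem(data)) {
		/* temporary atomic mapping, so the xor must happen now */
		p[1] = kmap_atomic(data, KM_USER0);
		xor_block(2, STRIPE_SIZE, p);
		kunmap_atomic(p[1], KM_USER0);
	} else {
		p[1] = page_address(data);
		xor_block(2, STRIPE_SIZE, p);
	}
}
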
@@ -1097,6 +1183,7 @@
 	raid6_conf_t *conf = sh->raid_conf;
 	int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
 	/**** FIX THIS: This could be very bad if disks is close to 256 ****/
@@ -1126,18 +1213,49 @@
 		BUG();		/* Not implemented yet */
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
+	for (i = disks; i--;) {
+		struct r5dev *dev = &sh->dev[i];
+		struct bio *wbi = dev->written;
+		sector = sh->dev[i].sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+		/* Check if the write is covered by a single page
+		 * and the whole stripe is written at once;
+		 * in that case we can avoid the memcpy(). */
+		if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
+		    test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+			page = zero_copy_data(wbi, sector);
+			/* We don't do zerocopy on a HighMem page. RAID-6 tends
+			 * to prepare the contents of all pages for access
+			 * before computing the P/Q parity. To support HighMem
+			 * pages as well, we would have to modify the
+			 * gen_syndrome() algorithm. -jay */
+			if (page && !PageHighMem(page)) {
+				atomic_inc(&conf->writes_zcopy);
+				/* The pointer must be restored whenever the LOCKED
+				sh->dev[i].req.bi_io_vec[0].bv_page = page;
+				set_bit(R5_Direct, &sh->dev[i].flags);
+				clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+				clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-		set_bit(R5_LOCKED, &sh->dev[i].flags);
-		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			atomic_inc(&conf->writes_copied);
+			clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+				copy_data(1, wbi, sh->dev[i].page, sector);
+				wbi = r5_next_bio(wbi, sector);
 	// case RECONSTRUCT_WRITE:
@@ -1148,8 +1266,12 @@
-		ptrs[count++] = page_address(sh->dev[i].page);
-		if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
+		if (test_bit(R5_Direct, &sh->dev[i].flags))
+			ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
+			ptrs[count++] = page_address(sh->dev[i].page);
+		if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
+		    !test_bit(R5_Direct, &sh->dev[i].flags))
 			printk("block %d/%d not uptodate on parity calc\n", i,count);
 		i = raid6_next_disk(i, disks);
 	} while ( i != d0_idx );
@@ -1596,7 +1718,8 @@
 		if (sh->dev[i].written) {
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-			    test_bit(R5_UPTODATE, &dev->flags) ) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			     test_bit(R5_Direct, &dev->flags)) ) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
@@ -1604,6 +1727,7 @@
 				spin_lock_irq(&conf->device_lock);
+				clear_bit(R5_Direct, &dev->flags);
 				while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (--wbi->bi_phys_segments == 0) {
@@ -1967,6 +2091,15 @@
 			set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+			if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+				/* Restore the page pointer of req; otherwise
+				 * no read is permitted on this stripe, which
+				 * is not what we want. -jay */
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+				sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
@@ -2172,7 +2305,8 @@
 		if (sh->dev[i].written) {
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-			    test_bit(R5_UPTODATE, &dev->flags) ) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			     test_bit(R5_Direct, &dev->flags)) ) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
@@ -2181,6 +2315,7 @@
 				spin_lock_irq(&conf->device_lock);
+				clear_bit(R5_Direct, &dev->flags);
 				while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (--wbi->bi_phys_segments == 0) {
@@ -2532,6 +2667,15 @@
 			set_bit(STRIPE_DEGRADED, &sh->state);
 			PRINTK("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
+			if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+				/* Restore the page pointer of req; otherwise
+				 * no read is permitted on this stripe, which
+				 * is not what we want. -jay */
+				BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+				sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
@@ -3451,6 +3595,9 @@
 	mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
 	mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
+	/* The raid5 device is able to do zero-copy writes now. */
+	mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
@@ -3537,9 +3684,11 @@
 		atomic_read(&conf->handled_in_raid5d),
 		atomic_read(&conf->out_of_stripes),
 		atomic_read(&conf->handle_called));
-	seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
+	seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
 		atomic_read(&conf->reads_for_rmw),
-		atomic_read(&conf->reads_for_rcw));
+		atomic_read(&conf->reads_for_rcw),
+		atomic_read(&conf->writes_zcopy),
+		atomic_read(&conf->writes_copied));
 	seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
 		atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
 		atomic_read(&conf->active_stripes),
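
Note: with the two counters wired into the status output above, the raid5 stats section (reported through /proc/mdstat) would read roughly as follows; the figures are illustrative, only the format string comes from the patch.

		reads: 1520 for rmw, 8630 for rcw. zcopy writes: 41210, copied writes: 318
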
Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h	2006-09-19 21:42:06.000000000 -0600
+++ linux-2.6.18-128.1.6/include/linux/backing-dev.h	2009-06-02 23:24:55.000000000 -0600
 #define BDI_CAP_READ_MAP	0x00000010	/* Can be mapped for reading */
 #define BDI_CAP_WRITE_MAP	0x00000020	/* Can be mapped for writing */
 #define BDI_CAP_EXEC_MAP	0x00000040	/* Can be mapped for execution */
+#define BDI_CAP_PAGE_CONSTANT_WRITE	0x00000080	/* Zcopy write - for raid5 */
 #define BDI_CAP_VMFLAGS \
 	(BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
 #define bdi_cap_account_dirty(bdi) \
 	(!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
+#define bdi_cap_page_constant_write(bdi) \
+	((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
 #define mapping_cap_writeback_dirty(mapping) \
 	bdi_cap_writeback_dirty((mapping)->backing_dev_info)
 #define mapping_cap_account_dirty(mapping) \
 	bdi_cap_account_dirty((mapping)->backing_dev_info)
+#define mapping_cap_page_constant_write(mapping) \
+	bdi_cap_page_constant_write((mapping)->backing_dev_info)
 #endif	/* _LINUX_BACKING_DEV_H */
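
Note: a submitter would test this capability on the page's mapping before attempting a constant-page write. A minimal sketch, where the helper name is illustrative and not part of the patch:

static int backing_dev_allows_constant_write(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	/* only devices advertising BDI_CAP_PAGE_CONSTANT_WRITE
	 * (raid5 does, per the md hunk above) may see constant pages */
	return mapping && mapping_cap_page_constant_write(mapping);
}
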
Index: linux-2.6.18-128.1.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h	2009-04-14 21:05:24.000000000 -0600
+++ linux-2.6.18-128.1.6/include/linux/page-flags.h	2009-06-02 23:24:55.000000000 -0600
 #define PG_nosave_free		18	/* Free, should not be written */
 #define PG_buddy		19	/* Page is free, on buddy lists */
 #define PG_gup			20	/* Page pin may be because of gup */
+#define PG_constant		21	/* To mark if the page is constant */
 #define PG_xpmem		27	/* Testing for xpmem. */
 /* PG_owner_priv_1 users should have descriptive aliases */
 struct page;	/* forward declaration */
+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+extern int set_page_constant(struct page *page);
+extern void clear_page_constant(struct page *);
 int test_clear_page_dirty(struct page *page);
 int test_clear_page_writeback(struct page *page);
 int test_set_page_writeback(struct page *page);
Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h	2009-06-02 23:24:50.000000000 -0600
+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h	2009-06-02 23:24:55.000000000 -0600
 #define R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define R5_ReadError	8	/* seen a read error here recently */
 #define R5_ReWrite	9	/* have tried to over-write the readerror */
 #define R5_Expanded	10	/* This block now has post-expand data */
+#define R5_Direct	11	/* Use the pages in bio to do the write directly. */
Index: linux-2.6.18-128.1.6/mm/filemap.c
===================================================================
--- linux-2.6.18-128.1.6.orig/mm/filemap.c	2009-04-14 21:05:46.000000000 -0600
+++ linux-2.6.18-128.1.6/mm/filemap.c	2009-06-02 23:24:55.000000000 -0600
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/rmap.h>
 #include "internal.h"
@@ -567,11 +568,55 @@
 		if (!test_clear_page_writeback(page))
+	clear_page_constant(page);
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_writeback);
 EXPORT_SYMBOL(end_page_writeback);
+/* Make a page constant: `constant' means any write to this page will
+ * be blocked until clear_page_constant() is called.
+ * The page lock must be held.
+int set_page_constant(struct page *page)
+	BUG_ON(!PageLocked(page));
+	/* If it's an anonymous page and it hasn't been added to the swap
+	 * cache, return directly because we have no way to swap this page.
+	if (page_mapping(page) == NULL)
+	BUG_ON(!PageUptodate(page));
+	/* We have to clear PageUptodate before trying to remove the page
+	 * from the user's page tables; otherwise the page could be
+	 * reinstalled by an access that happens between try_to_unmap()
+	 * and ClearPageUptodate(). -jay
+	ClearPageUptodate(page);
+	if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
+		SetPageUptodate(page);
+	SetPageConstant(page);
+	return SWAP_SUCCESS;
+void clear_page_constant(struct page *page)
+	if (PageConstant(page)) {
+		BUG_ON(!PageLocked(page));
+		BUG_ON(PageUptodate(page));
+		ClearPageConstant(page);
+		SetPageUptodate(page);
+EXPORT_SYMBOL(set_page_constant);
+EXPORT_SYMBOL(clear_page_constant);
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock
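
Note: taken together, a writer that wants the zero-copy path checks the backing device's capability, pins the page contents with set_page_constant() under the page lock before the bio goes down, raid5 redirects the stripe's pre-built request at the caller's page (R5_Direct) instead of copying into the stripe cache, and end_page_writeback() drops the pin via clear_page_constant(). The sketch below stitches these pieces together; the function name is hypothetical, the failure return values of set_page_constant() are assumed to differ from SWAP_SUCCESS (those lines are elided above), and the page is kept locked across the write because clear_page_constant() insists on PageLocked.

/* Sketch only; assumes the usual mm/block headers
 * (<linux/fs.h>, <linux/pagemap.h>, <linux/rmap.h>). */
static int submit_constant_write(struct page *page, struct bio *bio)
{
	struct address_space *mapping = page_mapping(page);
	int constant = 0;

	BUG_ON(!PageLocked(page));

	if (mapping && mapping_cap_page_constant_write(mapping))
		constant = (set_page_constant(page) == SWAP_SUCCESS);

	set_page_writeback(page);
	/* page stays locked; the completion path is expected to unlock it
	 * after end_page_writeback() has cleared the constant state and
	 * re-marked the page Uptodate */
	submit_bio(WRITE, bio);
	return constant;
}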