Index: linux-2.6.18-128.1.6/drivers/md/raid5.c =================================================================== --- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c 2009-06-02 23:24:52.000000000 -0600 +++ linux-2.6.18-128.1.6/drivers/md/raid5.c 2009-06-02 23:24:55.000000000 -0600 @@ -633,6 +633,9 @@ clear_buffer_uptodate(bh); } #endif + /* Read on a Directing write is allowable */ + /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */ + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -669,6 +672,10 @@ rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + if (test_bit(R5_Direct, &sh->dev[i].flags)) { + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; + } clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); @@ -910,7 +917,27 @@ return r_sector; } +static struct page *zero_copy_data(struct bio *bio, sector_t sector) +{ + sector_t bi_sector = bio->bi_sector; + struct page *page = NULL; + struct bio_vec *bvl; + int i; + bio_for_each_segment(bvl, bio, i) { + if (sector == bi_sector) + page = bio_iovec_idx(bio, i)->bv_page; + bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; + if (bi_sector >= sector + STRIPE_SECTORS) { + /* check if the stripe is covered by one page */ + if (page == bio_iovec_idx(bio, i)->bv_page && + PageConstant(page)) + return page; + return NULL; + } + } + return NULL; +} /* * Copy data between a page in the stripe cache, and one or more bion @@ -1002,8 +1029,9 @@ { raid5_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS]; + void *ptr[MAX_XOR_BLOCKS], *h_ptr[2]; struct bio *chosen; + struct page *page; PRINTK("compute_parity5, stripe %llu, method %d\n", (unsigned long long)sh->sector, method); @@ -1053,34 +1081,92 @@ count = 1; } - for (i = disks; i--;) - if 
(sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); + for (i = disks; i--;) { + struct r5dev *dev = &sh->dev[i]; + struct bio *wbi = dev->written; + sector_t sector; + + if (!wbi) + continue; + + sector = dev->sector; + set_bit(R5_LOCKED, &sh->dev[i].flags); + BUG_ON(test_bit(R5_Direct, &dev->flags)); + + /* check if it's covered by a single page + and whole stripe is written at once. + * in this case we can avoid memcpy() */ + if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) && + test_bit(R5_Insync, &dev->flags)) { + page = zero_copy_data(wbi, sector); + if (page) { + atomic_inc(&conf->writes_zcopy); + /* The pointer must be restored whenever the LOCKED + * gets cleared. */ + dev->req.bi_io_vec[0].bv_page = page; + set_bit(R5_Direct, &dev->flags); + clear_bit(R5_UPTODATE, &sh->dev[i].flags); + clear_bit(R5_OVERWRITE, &sh->dev[i].flags); + continue; } + } - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); + /* do copy write */ + atomic_inc(&conf->writes_copied); + clear_bit(R5_OVERWRITE, &sh->dev[i].flags); + set_bit(R5_UPTODATE, &sh->dev[i].flags); + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { + copy_data(1, wbi, sh->dev[i].page, sector); + wbi = r5_next_bio(wbi, sector); } + } + h_ptr[0] = ptr[0]; switch(method) { case RECONSTRUCT_WRITE: case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); + for (i=disks; i--;) { + if (i == pd_idx) + continue; + if (test_bit(R5_Direct, &sh->dev[i].flags)) + page = sh->dev[i].req.bi_io_vec[0].bv_page; + else + page = sh->dev[i].page; + + /* have to compute the parity immediately for + * a highmem page. it would happen for zerocopy. 
-jay + */ + if (PageHighMem(page)) { + h_ptr[1] = kmap_atomic(page, KM_USER0); + xor_block(2, STRIPE_SIZE, h_ptr); + kunmap_atomic(page, KM_USER0); + } else { + ptr[count++] = page_address(page); } + check_xor(); + } break; case READ_MODIFY_WRITE: - for (i = disks; i--;) - if (sh->dev[i].written) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); + for (i = disks; i--;) { + if (!sh->dev[i].written) + continue; + if (test_bit(R5_Direct, &sh->dev[i].flags)) + page = sh->dev[i].req.bi_io_vec[0].bv_page; + else + page = sh->dev[i].page; + + /* have to compute the parity immediately for + * a highmem page. it would happen for zerocopy. -jay + */ + if (PageHighMem(page)) { + h_ptr[1] = kmap_atomic(page, KM_USER0); + xor_block(2, STRIPE_SIZE, h_ptr); + kunmap_atomic(page, KM_USER0); + } else { + ptr[count++] = page_address(page); } + check_xor(); + } } if (count != 1) xor_block(count, STRIPE_SIZE, ptr); @@ -1097,6 +1183,7 @@ raid6_conf_t *conf = sh->raid_conf; int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count; struct bio *chosen; + struct page *page; /**** FIX THIS: This could be very bad if disks is close to 256 ****/ void *ptrs[disks]; @@ -1126,18 +1213,49 @@ BUG(); /* Not implemented yet */ } - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); + for (i = disks; i--;) { + struct r5dev *dev = &sh->dev[i]; + struct bio *wbi = dev->written; + sector_t sector; + + if (!wbi) + continue; + + sector = sh->dev[i].sector; + set_bit(R5_LOCKED, &sh->dev[i].flags); + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); + + /* check if it's covered by a single page + * and whole stripe is written at once. 
+ * in this case we can avoid memcpy() */ + if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) && + test_bit(R5_OVERWRITE, &sh->dev[i].flags)) { + page = zero_copy_data(wbi, sector); + /* we don't do zerocopy on a HighMem page. Raid6 tend + * to prepare all of the pages' content to be accessed + * before computing PQ parity. If we need to support HighMem + * page also, we have to modify the gen_syndrome() + * algorithm. -jay */ + if (page && !PageHighMem(page)) { + atomic_inc(&conf->writes_zcopy); + /* The pointer must be restored whenever the LOCKED + * gets cleared. */ + sh->dev[i].req.bi_io_vec[0].bv_page = page; + set_bit(R5_Direct, &sh->dev[i].flags); + clear_bit(R5_UPTODATE, &sh->dev[i].flags); + clear_bit(R5_OVERWRITE, &sh->dev[i].flags); + continue; } + } - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); + atomic_inc(&conf->writes_copied); + clear_bit(R5_OVERWRITE, &sh->dev[i].flags); + set_bit(R5_UPTODATE, &sh->dev[i].flags); + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { + copy_data(1, wbi, sh->dev[i].page, sector); + wbi = r5_next_bio(wbi, sector); } + } // switch(method) { // case RECONSTRUCT_WRITE: @@ -1148,8 +1266,12 @@ count = 0; i = d0_idx; do { - ptrs[count++] = page_address(sh->dev[i].page); - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags)) + if (test_bit(R5_Direct, &sh->dev[i].flags)) + ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page); + else + ptrs[count++] = page_address(sh->dev[i].page); + if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) && + !test_bit(R5_Direct, &sh->dev[i].flags)) printk("block %d/%d not uptodate on parity calc\n", i,count); i = raid6_next_disk(i, disks); } while ( i != d0_idx ); @@ -1596,7 +1718,8 @@ if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags) ) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Direct, &dev->flags)) ) { /* We can 
return any write requests */ struct bio *wbi, *wbi2; int bitmap_end = 0; @@ -1604,6 +1727,7 @@ spin_lock_irq(&conf->device_lock); wbi = dev->written; dev->written = NULL; + clear_bit(R5_Direct, &dev->flags); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (--wbi->bi_phys_segments == 0) { @@ -1967,6 +2091,15 @@ set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); + + if (test_bit(R5_Direct, &sh->dev[i].flags)) { + /* restore the page pointer of req, otherwise, + * no any read is permitted on this stripe, this is + * not what we want. -jay */ + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; + } + clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } @@ -2172,7 +2305,8 @@ if (sh->dev[i].written) { dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && - test_bit(R5_UPTODATE, &dev->flags) ) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Direct, &dev->flags)) ) { /* We can return any write requests */ int bitmap_end = 0; struct bio *wbi, *wbi2; @@ -2181,6 +2315,7 @@ spin_lock_irq(&conf->device_lock); wbi = dev->written; dev->written = NULL; + clear_bit(R5_Direct, &dev->flags); while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { wbi2 = r5_next_bio(wbi, dev->sector); if (--wbi->bi_phys_segments == 0) { @@ -2532,6 +2667,15 @@ set_bit(STRIPE_DEGRADED, &sh->state); PRINTK("skip op %ld on disc %d for sector %llu\n", bi->bi_rw, i, (unsigned long long)sh->sector); + + if (test_bit(R5_Direct, &sh->dev[i].flags)) { + /* restore the page pointer of req, otherwise, + * no any read is permitted on this stripe, this is + * not what we want. 
-jay */ + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; + } + clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); } @@ -3451,6 +3595,9 @@ mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT; mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;; + /* raid5 device is able to do zcopy right now. */ + mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE; + return 0; abort: if (conf) { @@ -3537,9 +3684,11 @@ atomic_read(&conf->handled_in_raid5d), atomic_read(&conf->out_of_stripes), atomic_read(&conf->handle_called)); - seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", + seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u", atomic_read(&conf->reads_for_rmw), - atomic_read(&conf->reads_for_rcw)); + atomic_read(&conf->reads_for_rcw), + atomic_read(&conf->writes_zcopy), + atomic_read(&conf->writes_copied)); seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n", atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed), atomic_read(&conf->active_stripes), Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h =================================================================== --- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h 2006-09-19 21:42:06.000000000 -0600 +++ linux-2.6.18-128.1.6/include/linux/backing-dev.h 2009-06-02 23:24:55.000000000 -0600 @@ -48,6 +48,7 @@ #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */ #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */ #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */ +#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */ #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -94,11 +95,18 @@ #define bdi_cap_account_dirty(bdi) \ 
(!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY)) +#define bdi_cap_page_constant_write(bdi) \ + ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE) + #define mapping_cap_writeback_dirty(mapping) \ bdi_cap_writeback_dirty((mapping)->backing_dev_info) #define mapping_cap_account_dirty(mapping) \ bdi_cap_account_dirty((mapping)->backing_dev_info) +#define mapping_cap_page_constant_write(mapping) \ + bdi_cap_page_constant_write((mapping)->backing_dev_info) + + #endif /* _LINUX_BACKING_DEV_H */ Index: linux-2.6.18-128.1.6/include/linux/page-flags.h =================================================================== --- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h 2009-04-14 21:05:24.000000000 -0600 +++ linux-2.6.18-128.1.6/include/linux/page-flags.h 2009-06-02 23:24:55.000000000 -0600 @@ -86,6 +86,7 @@ #define PG_reclaim 17 /* To be reclaimed asap */ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_buddy 19 /* Page is free, on buddy lists */ #define PG_gup 20 /* Page pin may be because of gup */ +#define PG_constant 21 /* To mark if the page is constant */ #define PG_xpmem 27 /* Testing for xpmem. 
*/
 /* PG_owner_priv_1 users should have descriptive aliases */
@@ -283,6 +284,14 @@
 struct page;	/* forward declaration */
 
+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+
+extern int set_page_constant(struct page *page);
+extern void clear_page_constant(struct page *);
+
 int test_clear_page_dirty(struct page *page);
 int test_clear_page_writeback(struct page *page);
 int test_set_page_writeback(struct page *page);
 
Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h	2009-06-02 23:24:50.000000000 -0600
+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h	2009-06-02 23:24:55.000000000 -0600
@@ -156,8 +156,9 @@
 #define	R5_Overlap	7	/* There is a pending overlapping request on this block */
 #define	R5_ReadError	8	/* seen a read error here recently */
 #define	R5_ReWrite	9	/* have tried to over-write the readerror */
-
 #define	R5_Expanded	10	/* This block now has post-expand data */
+#define	R5_Direct	11	/* Use the pages in bio to do the write directly. 
*/
+
 /*
  * Write method
  */
Index: linux-2.6.18-128.1.6/mm/filemap.c
===================================================================
--- linux-2.6.18-128.1.6.orig/mm/filemap.c	2009-04-14 21:05:46.000000000 -0600
+++ linux-2.6.18-128.1.6/mm/filemap.c	2009-06-02 23:24:55.000000000 -0600
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/rmap.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -567,11 +568,55 @@
 		if (!test_clear_page_writeback(page))
 			BUG();
 	}
+	clear_page_constant(page);
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_writeback);
 }
 EXPORT_SYMBOL(end_page_writeback);
 
+/* Make a page to be constant, `constant' means any write to this page will
+ * be blocked until clear_page_constant is called.
+ * The page lock must be held.
+ */
+int set_page_constant(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	/* If it's an anonymous page and haven't been added to swap cache,
+	 * return directly because we have no way to swap this page.
+	 */
+	if (page_mapping(page) == NULL)
+		return SWAP_FAIL;
+
+	BUG_ON(!PageUptodate(page));
+
+	/* I have to clear page uptodate before trying to remove
+	 * it from user's page table because otherwise, the page may be
+	 * reinstalled by a page access which happens between try_to_unmap()
+	 * and ClearPageUptodate(). -jay
+	 */
+	ClearPageUptodate(page);
+	if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
+		SetPageUptodate(page);
+		return SWAP_FAIL;
+	}
+	SetPageConstant(page);
+	return SWAP_SUCCESS;
+}
+
+void clear_page_constant(struct page *page)
+{
+	if (PageConstant(page)) {
+		BUG_ON(!PageLocked(page));
+		BUG_ON(PageUptodate(page));
+		ClearPageConstant(page);
+		SetPageUptodate(page);
+		unlock_page(page);
+	}
+}
+EXPORT_SYMBOL(set_page_constant);
+EXPORT_SYMBOL(clear_page_constant);
+
 /**
  * __lock_page - get a lock on the page, assuming we need to sleep to get it
  * @page: the page to lock