+++ /dev/null
-Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c 2009-06-02 23:24:52.000000000 -0600
-+++ linux-2.6.18-128.1.6/drivers/md/raid5.c 2009-06-02 23:24:55.000000000 -0600
-@@ -633,6 +633,9 @@
- clear_buffer_uptodate(bh);
- }
- #endif
-+ /* Read on a Directing write is allowable */
-+ /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
-@@ -669,6 +672,10 @@
-
- rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-
-+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+ }
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- release_stripe(sh);
-@@ -910,7 +917,27 @@
- return r_sector;
- }
-
-+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
-+{
-+ sector_t bi_sector = bio->bi_sector;
-+ struct page *page = NULL;
-+ struct bio_vec *bvl;
-+ int i;
-
-+ bio_for_each_segment(bvl, bio, i) {
-+ if (sector == bi_sector)
-+ page = bio_iovec_idx(bio, i)->bv_page;
-+ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
-+ if (bi_sector >= sector + STRIPE_SECTORS) {
-+ /* check if the stripe is covered by one page */
-+ if (page == bio_iovec_idx(bio, i)->bv_page &&
-+ PageConstant(page))
-+ return page;
-+ return NULL;
-+ }
-+ }
-+ return NULL;
-+}
-
- /*
- * Copy data between a page in the stripe cache, and one or more bion
-@@ -1002,8 +1029,9 @@
- {
- raid5_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-- void *ptr[MAX_XOR_BLOCKS];
-+ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
- struct bio *chosen;
-+ struct page *page;
-
- PRINTK("compute_parity5, stripe %llu, method %d\n",
- (unsigned long long)sh->sector, method);
-@@ -1053,34 +1081,92 @@
- count = 1;
- }
-
-- for (i = disks; i--;)
-- if (sh->dev[i].written) {
-- sector_t sector = sh->dev[i].sector;
-- struct bio *wbi = sh->dev[i].written;
-- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-- copy_data(1, wbi, sh->dev[i].page, sector);
-- wbi = r5_next_bio(wbi, sector);
-+ for (i = disks; i--;) {
-+ struct r5dev *dev = &sh->dev[i];
-+ struct bio *wbi = dev->written;
-+ sector_t sector;
-+
-+ if (!wbi)
-+ continue;
-+
-+ sector = dev->sector;
-+ set_bit(R5_LOCKED, &sh->dev[i].flags);
-+ BUG_ON(test_bit(R5_Direct, &dev->flags));
-+
-+ /* check if it's covered by a single page
-+ and whole stripe is written at once.
-+ * in this case we can avoid memcpy() */
-+ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
-+ test_bit(R5_Insync, &dev->flags)) {
-+ page = zero_copy_data(wbi, sector);
-+ if (page) {
-+ atomic_inc(&conf->writes_zcopy);
-+ /* The pointer must be restored whenever the LOCKED
-+ * gets cleared. */
-+ dev->req.bi_io_vec[0].bv_page = page;
-+ set_bit(R5_Direct, &dev->flags);
-+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ continue;
- }
-+ }
-
-- set_bit(R5_LOCKED, &sh->dev[i].flags);
-- set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ /* do copy write */
-+ atomic_inc(&conf->writes_copied);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-+ copy_data(1, wbi, sh->dev[i].page, sector);
-+ wbi = r5_next_bio(wbi, sector);
- }
-+ }
-
-+ h_ptr[0] = ptr[0];
- switch(method) {
- case RECONSTRUCT_WRITE:
- case CHECK_PARITY:
-- for (i=disks; i--;)
-- if (i != pd_idx) {
-- ptr[count++] = page_address(sh->dev[i].page);
-- check_xor();
-+ for (i=disks; i--;) {
-+ if (i == pd_idx)
-+ continue;
-+ if (test_bit(R5_Direct, &sh->dev[i].flags))
-+ page = sh->dev[i].req.bi_io_vec[0].bv_page;
-+ else
-+ page = sh->dev[i].page;
-+
-+ /* have to compute the parity immediately for
-+ * a highmem page. it would happen for zerocopy. -jay
-+ */
-+ if (PageHighMem(page)) {
-+ h_ptr[1] = kmap_atomic(page, KM_USER0);
-+ xor_block(2, STRIPE_SIZE, h_ptr);
-+ kunmap_atomic(page, KM_USER0);
-+ } else {
-+ ptr[count++] = page_address(page);
- }
-+ check_xor();
-+ }
- break;
- case READ_MODIFY_WRITE:
-- for (i = disks; i--;)
-- if (sh->dev[i].written) {
-- ptr[count++] = page_address(sh->dev[i].page);
-- check_xor();
-+ for (i = disks; i--;) {
-+ if (!sh->dev[i].written)
-+ continue;
-+ if (test_bit(R5_Direct, &sh->dev[i].flags))
-+ page = sh->dev[i].req.bi_io_vec[0].bv_page;
-+ else
-+ page = sh->dev[i].page;
-+
-+ /* have to compute the parity immediately for
-+ * a highmem page. it would happen for zerocopy. -jay
-+ */
-+ if (PageHighMem(page)) {
-+ h_ptr[1] = kmap_atomic(page, KM_USER0);
-+ xor_block(2, STRIPE_SIZE, h_ptr);
-+ kunmap_atomic(page, KM_USER0);
-+ } else {
-+ ptr[count++] = page_address(page);
- }
-+ check_xor();
-+ }
- }
- if (count != 1)
- xor_block(count, STRIPE_SIZE, ptr);
-@@ -1097,6 +1183,7 @@
- raid6_conf_t *conf = sh->raid_conf;
- int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
- struct bio *chosen;
-+ struct page *page;
- /**** FIX THIS: This could be very bad if disks is close to 256 ****/
- void *ptrs[disks];
-
-@@ -1126,18 +1213,49 @@
- BUG(); /* Not implemented yet */
- }
-
-- for (i = disks; i--;)
-- if (sh->dev[i].written) {
-- sector_t sector = sh->dev[i].sector;
-- struct bio *wbi = sh->dev[i].written;
-- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-- copy_data(1, wbi, sh->dev[i].page, sector);
-- wbi = r5_next_bio(wbi, sector);
-+ for (i = disks; i--;) {
-+ struct r5dev *dev = &sh->dev[i];
-+ struct bio *wbi = dev->written;
-+ sector_t sector;
-+
-+ if (!wbi)
-+ continue;
-+
-+ sector = sh->dev[i].sector;
-+ set_bit(R5_LOCKED, &sh->dev[i].flags);
-+ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
-+
-+ /* check if it's covered by a single page
-+ * and whole stripe is written at once.
-+ * in this case we can avoid memcpy() */
-+ if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
-+ test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
-+ page = zero_copy_data(wbi, sector);
-+ /* we don't do zerocopy on a HighMem page. Raid6 tend
-+ * to prepare all of the pages' content to be accessed
-+ * before computing PQ parity. If we need to support HighMem
-+ * page also, we have to modify the gen_syndrome()
-+ * algorithm. -jay */
-+ if (page && !PageHighMem(page)) {
-+ atomic_inc(&conf->writes_zcopy);
-+ /* The pointer must be restored whenever the LOCKED
-+ * gets cleared. */
-+ sh->dev[i].req.bi_io_vec[0].bv_page = page;
-+ set_bit(R5_Direct, &sh->dev[i].flags);
-+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ continue;
- }
-+ }
-
-- set_bit(R5_LOCKED, &sh->dev[i].flags);
-- set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ atomic_inc(&conf->writes_copied);
-+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-+ copy_data(1, wbi, sh->dev[i].page, sector);
-+ wbi = r5_next_bio(wbi, sector);
- }
-+ }
-
- // switch(method) {
- // case RECONSTRUCT_WRITE:
-@@ -1148,8 +1266,12 @@
- count = 0;
- i = d0_idx;
- do {
-- ptrs[count++] = page_address(sh->dev[i].page);
-- if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-+ if (test_bit(R5_Direct, &sh->dev[i].flags))
-+ ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
-+ else
-+ ptrs[count++] = page_address(sh->dev[i].page);
-+ if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
-+ !test_bit(R5_Direct, &sh->dev[i].flags))
- printk("block %d/%d not uptodate on parity calc\n", i,count);
- i = raid6_next_disk(i, disks);
- } while ( i != d0_idx );
-@@ -1596,7 +1718,8 @@
- if (sh->dev[i].written) {
- dev = &sh->dev[i];
- if (!test_bit(R5_LOCKED, &dev->flags) &&
-- test_bit(R5_UPTODATE, &dev->flags) ) {
-+ (test_bit(R5_UPTODATE, &dev->flags) ||
-+ test_bit(R5_Direct, &dev->flags)) ) {
- /* We can return any write requests */
- struct bio *wbi, *wbi2;
- int bitmap_end = 0;
-@@ -1604,6 +1727,7 @@
- spin_lock_irq(&conf->device_lock);
- wbi = dev->written;
- dev->written = NULL;
-+ clear_bit(R5_Direct, &dev->flags);
- while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
- if (--wbi->bi_phys_segments == 0) {
-@@ -1967,6 +2091,15 @@
- set_bit(STRIPE_DEGRADED, &sh->state);
- PRINTK("skip op %ld on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
-+
-+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+ /* restore the page pointer of req, otherwise,
-+ * no any read is permitted on this stripe, this is
-+ * not what we want. -jay */
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+ }
-+
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
-@@ -2172,7 +2305,8 @@
- if (sh->dev[i].written) {
- dev = &sh->dev[i];
- if (!test_bit(R5_LOCKED, &dev->flags) &&
-- test_bit(R5_UPTODATE, &dev->flags) ) {
-+ (test_bit(R5_UPTODATE, &dev->flags) ||
-+ test_bit(R5_Direct, &dev->flags)) ) {
- /* We can return any write requests */
- int bitmap_end = 0;
- struct bio *wbi, *wbi2;
-@@ -2181,6 +2315,7 @@
- spin_lock_irq(&conf->device_lock);
- wbi = dev->written;
- dev->written = NULL;
-+ clear_bit(R5_Direct, &dev->flags);
- while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
- wbi2 = r5_next_bio(wbi, dev->sector);
- if (--wbi->bi_phys_segments == 0) {
-@@ -2532,6 +2667,15 @@
- set_bit(STRIPE_DEGRADED, &sh->state);
- PRINTK("skip op %ld on disc %d for sector %llu\n",
- bi->bi_rw, i, (unsigned long long)sh->sector);
-+
-+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+ /* restore the page pointer of req, otherwise,
-+ * no any read is permitted on this stripe, this is
-+ * not what we want. -jay */
-+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+ }
-+
- clear_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(STRIPE_HANDLE, &sh->state);
- }
-@@ -3451,6 +3595,9 @@
- mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
- mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
-
-+ /* raid5 device is able to do zcopy right now. */
-+ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
-+
- return 0;
- abort:
- if (conf) {
-@@ -3537,9 +3684,11 @@
- atomic_read(&conf->handled_in_raid5d),
- atomic_read(&conf->out_of_stripes),
- atomic_read(&conf->handle_called));
-- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
-+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
- atomic_read(&conf->reads_for_rmw),
-- atomic_read(&conf->reads_for_rcw));
-+ atomic_read(&conf->reads_for_rcw),
-+ atomic_read(&conf->writes_zcopy),
-+ atomic_read(&conf->writes_copied));
- seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
- atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
- atomic_read(&conf->active_stripes),
-Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h 2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/backing-dev.h 2009-06-02 23:24:55.000000000 -0600
-@@ -48,6 +48,7 @@
- #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
- #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
- #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
-+#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */
- #define BDI_CAP_VMFLAGS \
- (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
-
-@@ -94,11 +95,18 @@
- #define bdi_cap_account_dirty(bdi) \
- (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
-
-+#define bdi_cap_page_constant_write(bdi) \
-+ ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
-+
- #define mapping_cap_writeback_dirty(mapping) \
- bdi_cap_writeback_dirty((mapping)->backing_dev_info)
-
- #define mapping_cap_account_dirty(mapping) \
- bdi_cap_account_dirty((mapping)->backing_dev_info)
-
-+#define mapping_cap_page_constant_write(mapping) \
-+ bdi_cap_page_constant_write((mapping)->backing_dev_info)
-+
-+
-
- #endif /* _LINUX_BACKING_DEV_H */
-Index: linux-2.6.18-128.1.6/include/linux/page-flags.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h 2009-04-14 21:05:24.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/page-flags.h 2009-06-02 23:24:55.000000000 -0600
-@@ -86,6 +86,7 @@
- #define PG_reclaim 17 /* To be reclaimed asap */
- #define PG_nosave_free 18 /* Free, should not be written */
- #define PG_buddy 19 /* Page is free, on buddy lists */
-+#define PG_constant 21 /* To mark if the page is constant */
- #define PG_xpmem 27 /* Testing for xpmem. */
-
- /* PG_owner_priv_1 users should have descriptive aliases */
-@@ -283,6 +284,14 @@
-
- struct page; /* forward declaration */
-
-+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
-+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
-+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
-+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
-+
-+extern int set_page_constant(struct page *page);
-+extern void clear_page_constant(struct page *);
-+
- int test_clear_page_dirty(struct page *page);
- int test_clear_page_writeback(struct page *page);
- int test_set_page_writeback(struct page *page);
-Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h 2009-06-02 23:24:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h 2009-06-02 23:24:55.000000000 -0600
-@@ -156,8 +156,9 @@
- #define R5_Overlap 7 /* There is a pending overlapping request on this block */
- #define R5_ReadError 8 /* seen a read error here recently */
- #define R5_ReWrite 9 /* have tried to over-write the readerror */
--
- #define R5_Expanded 10 /* This block now has post-expand data */
-+#define R5_Direct 11 /* Use the pages in bio to do the write directly. */
-+
- /*
- * Write method
- */
-Index: linux-2.6.18-128.1.6/mm/filemap.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/mm/filemap.c 2009-04-14 21:05:46.000000000 -0600
-+++ linux-2.6.18-128.1.6/mm/filemap.c 2009-06-02 23:24:55.000000000 -0600
-@@ -30,6 +30,7 @@
- #include <linux/security.h>
- #include <linux/syscalls.h>
- #include <linux/cpuset.h>
-+#include <linux/rmap.h>
- #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
- #include <trace/mm.h>
- #include "internal.h"
-@@ -567,11 +568,55 @@
- if (!test_clear_page_writeback(page))
- BUG();
- }
-+ clear_page_constant(page);
- smp_mb__after_clear_bit();
- wake_up_page(page, PG_writeback);
- }
- EXPORT_SYMBOL(end_page_writeback);
-
-+/* Make a page to be constant, `constant' means any write to this page will
-+ * be blocked until clear_page_constant is called.
-+ * The page lock must be held.
-+ */
-+int set_page_constant(struct page *page)
-+{
-+ BUG_ON(!PageLocked(page));
-+
-+ /* If it's an anonymous page and haven't been added to swap cache,
-+ * return directly because we have no way to swap this page.
-+ */
-+ if (page_mapping(page) == NULL)
-+ return SWAP_FAIL;
-+
-+ BUG_ON(!PageUptodate(page));
-+
-+ /* I have to clear page uptodate before trying to remove
-+ * it from user's page table because otherwise, the page may be
-+ * reinstalled by a page access which happens between try_to_unmap()
-+ * and ClearPageUptodate(). -jay
-+ */
-+ ClearPageUptodate(page);
-+ if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
-+ SetPageUptodate(page);
-+ return SWAP_FAIL;
-+ }
-+ SetPageConstant(page);
-+ return SWAP_SUCCESS;
-+}
-+
-+void clear_page_constant(struct page *page)
-+{
-+ if (PageConstant(page)) {
-+ BUG_ON(!PageLocked(page));
-+ BUG_ON(PageUptodate(page));
-+ ClearPageConstant(page);
-+ SetPageUptodate(page);
-+ unlock_page(page);
-+ }
-+}
-+EXPORT_SYMBOL(set_page_constant);
-+EXPORT_SYMBOL(clear_page_constant);
-+
- /**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @page: the page to lock