Whamcloud - gitweb
LU-992 kernel: deprecate RHEL5 server support for master
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid5-zerocopy-rhel5.patch
diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch
deleted file mode 100644 (file)
index 06db94d..0000000
+++ /dev/null
@@ -1,489 +0,0 @@
-Index: linux-2.6.18-128.1.6/drivers/md/raid5.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/drivers/md/raid5.c       2009-06-02 23:24:52.000000000 -0600
-+++ linux-2.6.18-128.1.6/drivers/md/raid5.c    2009-06-02 23:24:55.000000000 -0600
-@@ -633,6 +633,9 @@
-               clear_buffer_uptodate(bh);
-       }
- #endif
-+      /* Read on a Directing write is allowable */
-+      /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
-+      BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
-       clear_bit(R5_LOCKED, &sh->dev[i].flags);
-       set_bit(STRIPE_HANDLE, &sh->state);
-       release_stripe(sh);
-@@ -669,6 +672,10 @@
-       rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
-       
-+      if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+              BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+              sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+      }
-       clear_bit(R5_LOCKED, &sh->dev[i].flags);
-       set_bit(STRIPE_HANDLE, &sh->state);
-       release_stripe(sh);
-@@ -910,7 +917,27 @@
-       return r_sector;
- }
-+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
-+{
-+      sector_t bi_sector = bio->bi_sector;
-+      struct page *page = NULL;
-+      struct bio_vec *bvl;
-+      int i;
-+      bio_for_each_segment(bvl, bio, i) {
-+              if (sector == bi_sector)
-+                      page = bio_iovec_idx(bio, i)->bv_page;
-+              bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
-+              if (bi_sector >= sector + STRIPE_SECTORS) {
-+                      /* check if the stripe is covered by one page */
-+                      if (page == bio_iovec_idx(bio, i)->bv_page &&
-+                          PageConstant(page))
-+                              return page;
-+                      return NULL;
-+              }
-+      }
-+      return NULL;
-+}
- /*
-  * Copy data between a page in the stripe cache, and one or more bion
-@@ -1002,8 +1029,9 @@
- {
-       raid5_conf_t *conf = sh->raid_conf;
-       int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
--      void *ptr[MAX_XOR_BLOCKS];
-+      void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
-       struct bio *chosen;
-+      struct page *page;
-       PRINTK("compute_parity5, stripe %llu, method %d\n",
-               (unsigned long long)sh->sector, method);
-@@ -1053,34 +1081,92 @@
-               count = 1;
-       }
-       
--      for (i = disks; i--;)
--              if (sh->dev[i].written) {
--                      sector_t sector = sh->dev[i].sector;
--                      struct bio *wbi = sh->dev[i].written;
--                      while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
--                              copy_data(1, wbi, sh->dev[i].page, sector);
--                              wbi = r5_next_bio(wbi, sector);
-+      for (i = disks; i--;) {
-+              struct r5dev *dev = &sh->dev[i];
-+              struct bio *wbi = dev->written;
-+              sector_t sector;
-+
-+              if (!wbi)
-+                      continue;
-+
-+              sector = dev->sector;
-+              set_bit(R5_LOCKED, &sh->dev[i].flags);
-+              BUG_ON(test_bit(R5_Direct, &dev->flags));
-+
-+              /* check if it's covered by a single page
-+                 and whole stripe is written at once.
-+               * in this case we can avoid memcpy() */
-+              if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
-+                  test_bit(R5_Insync, &dev->flags)) {
-+                      page = zero_copy_data(wbi, sector);
-+                      if (page) {
-+                              atomic_inc(&conf->writes_zcopy);
-+                              /* The pointer must be restored whenever the LOCKED
-+                               * gets cleared. */
-+                              dev->req.bi_io_vec[0].bv_page = page;
-+                              set_bit(R5_Direct, &dev->flags);
-+                              clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-+                              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+                              continue;
-                       }
-+              }
--                      set_bit(R5_LOCKED, &sh->dev[i].flags);
--                      set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+              /* do copy write */
-+              atomic_inc(&conf->writes_copied);
-+              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+              set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+              while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-+                      copy_data(1, wbi, sh->dev[i].page, sector);
-+                      wbi = r5_next_bio(wbi, sector);
-               }
-+      }
-+      h_ptr[0] = ptr[0];
-       switch(method) {
-       case RECONSTRUCT_WRITE:
-       case CHECK_PARITY:
--              for (i=disks; i--;)
--                      if (i != pd_idx) {
--                              ptr[count++] = page_address(sh->dev[i].page);
--                              check_xor();
-+              for (i=disks; i--;) {
-+                      if (i == pd_idx)
-+                              continue;
-+                      if (test_bit(R5_Direct, &sh->dev[i].flags))
-+                              page = sh->dev[i].req.bi_io_vec[0].bv_page;
-+                      else
-+                              page = sh->dev[i].page;
-+
-+                      /* have to compute the parity immediately for
-+                       * a highmem page. it would happen for zerocopy. -jay
-+                       */
-+                      if (PageHighMem(page)) {
-+                              h_ptr[1] = kmap_atomic(page, KM_USER0);
-+                              xor_block(2, STRIPE_SIZE, h_ptr);
-+                              kunmap_atomic(page, KM_USER0);
-+                      } else {
-+                              ptr[count++] = page_address(page);
-                       }
-+                      check_xor();
-+              }
-               break;
-       case READ_MODIFY_WRITE:
--              for (i = disks; i--;)
--                      if (sh->dev[i].written) {
--                              ptr[count++] = page_address(sh->dev[i].page);
--                              check_xor();
-+              for (i = disks; i--;) {
-+                      if (!sh->dev[i].written)
-+                              continue;
-+                      if (test_bit(R5_Direct, &sh->dev[i].flags))
-+                              page = sh->dev[i].req.bi_io_vec[0].bv_page;
-+                      else
-+                              page = sh->dev[i].page;
-+
-+                      /* have to compute the parity immediately for
-+                       * a highmem page. it would happen for zerocopy. -jay
-+                       */
-+                      if (PageHighMem(page)) {
-+                              h_ptr[1] = kmap_atomic(page, KM_USER0);
-+                              xor_block(2, STRIPE_SIZE, h_ptr);
-+                              kunmap_atomic(page, KM_USER0);
-+                      } else {
-+                              ptr[count++] = page_address(page);
-                       }
-+                      check_xor();
-+              }
-       }
-       if (count != 1)
-               xor_block(count, STRIPE_SIZE, ptr);
-@@ -1097,6 +1183,7 @@
-       raid6_conf_t *conf = sh->raid_conf;
-       int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
-       struct bio *chosen;
-+      struct page *page;
-       /**** FIX THIS: This could be very bad if disks is close to 256 ****/
-       void *ptrs[disks];
-@@ -1126,18 +1213,49 @@
-               BUG();          /* Not implemented yet */
-       }
--      for (i = disks; i--;)
--              if (sh->dev[i].written) {
--                      sector_t sector = sh->dev[i].sector;
--                      struct bio *wbi = sh->dev[i].written;
--                      while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
--                              copy_data(1, wbi, sh->dev[i].page, sector);
--                              wbi = r5_next_bio(wbi, sector);
-+      for (i = disks; i--;) {
-+              struct r5dev *dev = &sh->dev[i];
-+              struct bio *wbi = dev->written;
-+              sector_t sector;
-+
-+              if (!wbi)
-+                      continue;
-+
-+              sector = sh->dev[i].sector;
-+              set_bit(R5_LOCKED, &sh->dev[i].flags);
-+              BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
-+
-+              /* check if it's covered by a single page
-+               * and whole stripe is written at once.
-+               * in this case we can avoid memcpy() */
-+              if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
-+                  test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
-+                      page = zero_copy_data(wbi, sector);
-+                      /* we don't do zerocopy on a HighMem page. Raid6 tend
-+                       * to prepare all of the pages' content to be accessed
-+                       * before computing PQ parity. If we need to support HighMem
-+                       * page also, we have to modify the gen_syndrome()
-+                       * algorithm. -jay */
-+                      if (page && !PageHighMem(page)) {
-+                              atomic_inc(&conf->writes_zcopy);
-+                              /* The pointer must be restored whenever the LOCKED
-+                               * gets cleared. */
-+                              sh->dev[i].req.bi_io_vec[0].bv_page = page;
-+                              set_bit(R5_Direct, &sh->dev[i].flags);
-+                              clear_bit(R5_UPTODATE, &sh->dev[i].flags);
-+                              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+                              continue;
-                       }
-+              }
--                      set_bit(R5_LOCKED, &sh->dev[i].flags);
--                      set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+              atomic_inc(&conf->writes_copied);
-+              clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
-+              set_bit(R5_UPTODATE, &sh->dev[i].flags);
-+              while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-+                      copy_data(1, wbi, sh->dev[i].page, sector);
-+                      wbi = r5_next_bio(wbi, sector);
-               }
-+      }
- //    switch(method) {
- //    case RECONSTRUCT_WRITE:
-@@ -1148,8 +1266,12 @@
-               count = 0;
-               i = d0_idx;
-               do {
--                      ptrs[count++] = page_address(sh->dev[i].page);
--                      if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
-+                      if (test_bit(R5_Direct, &sh->dev[i].flags))
-+                              ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
-+                      else
-+                              ptrs[count++] = page_address(sh->dev[i].page);
-+                      if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
-+                          !test_bit(R5_Direct, &sh->dev[i].flags))
-                               printk("block %d/%d not uptodate on parity calc\n", i,count);
-                       i = raid6_next_disk(i, disks);
-               } while ( i != d0_idx );
-@@ -1596,7 +1718,8 @@
-               if (sh->dev[i].written) {
-                   dev = &sh->dev[i];
-                   if (!test_bit(R5_LOCKED, &dev->flags) &&
--                       test_bit(R5_UPTODATE, &dev->flags) ) {
-+                       (test_bit(R5_UPTODATE, &dev->flags) ||
-+                        test_bit(R5_Direct, &dev->flags)) ) {
-                       /* We can return any write requests */
-                           struct bio *wbi, *wbi2;
-                           int bitmap_end = 0;
-@@ -1604,6 +1727,7 @@
-                           spin_lock_irq(&conf->device_lock);
-                           wbi = dev->written;
-                           dev->written = NULL;
-+                          clear_bit(R5_Direct, &dev->flags);
-                           while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-                                   wbi2 = r5_next_bio(wbi, dev->sector);
-                                   if (--wbi->bi_phys_segments == 0) {
-@@ -1967,6 +2091,15 @@
-                               set_bit(STRIPE_DEGRADED, &sh->state);
-                       PRINTK("skip op %ld on disc %d for sector %llu\n",
-                               bi->bi_rw, i, (unsigned long long)sh->sector);
-+
-+                      if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+                              /* restore the page pointer of req, otherwise,
-+                               * no any read is permitted on this stripe, this is
-+                               * not what we want. -jay */
-+                              BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+                              sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+                      }
-+
-                       clear_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(STRIPE_HANDLE, &sh->state);
-               }
-@@ -2172,7 +2305,8 @@
-                       if (sh->dev[i].written) {
-                               dev = &sh->dev[i];
-                               if (!test_bit(R5_LOCKED, &dev->flags) &&
--                                  test_bit(R5_UPTODATE, &dev->flags) ) {
-+                                  (test_bit(R5_UPTODATE, &dev->flags) ||
-+                                   test_bit(R5_Direct, &dev->flags)) ) {
-                                       /* We can return any write requests */
-                                       int bitmap_end = 0;
-                                       struct bio *wbi, *wbi2;
-@@ -2181,6 +2315,7 @@
-                                       spin_lock_irq(&conf->device_lock);
-                                       wbi = dev->written;
-                                       dev->written = NULL;
-+                                      clear_bit(R5_Direct, &dev->flags);
-                                       while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
-                                               wbi2 = r5_next_bio(wbi, dev->sector);
-                                               if (--wbi->bi_phys_segments == 0) {
-@@ -2532,6 +2667,15 @@
-                               set_bit(STRIPE_DEGRADED, &sh->state);
-                       PRINTK("skip op %ld on disc %d for sector %llu\n",
-                               bi->bi_rw, i, (unsigned long long)sh->sector);
-+
-+                      if (test_bit(R5_Direct, &sh->dev[i].flags)) {
-+                              /* restore the page pointer of req, otherwise,
-+                               * no any read is permitted on this stripe, this is
-+                               * not what we want. -jay */
-+                              BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
-+                              sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
-+                      }
-+
-                       clear_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(STRIPE_HANDLE, &sh->state);
-               }
-@@ -3451,6 +3595,9 @@
-       mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
-       mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
-+      /* raid5 device is able to do zcopy right now. */
-+      mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
-+
-       return 0;
- abort:
-       if (conf) {
-@@ -3537,9 +3684,11 @@
-                       atomic_read(&conf->handled_in_raid5d),
-                       atomic_read(&conf->out_of_stripes),
-                       atomic_read(&conf->handle_called));
--      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
-+      seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
-                       atomic_read(&conf->reads_for_rmw),
--                      atomic_read(&conf->reads_for_rcw));
-+                      atomic_read(&conf->reads_for_rcw),
-+                      atomic_read(&conf->writes_zcopy),
-+                      atomic_read(&conf->writes_copied));
-       seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
-                       atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
-                       atomic_read(&conf->active_stripes),
-Index: linux-2.6.18-128.1.6/include/linux/backing-dev.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/backing-dev.h      2006-09-19 21:42:06.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/backing-dev.h   2009-06-02 23:24:55.000000000 -0600
-@@ -48,6 +48,7 @@
- #define BDI_CAP_READ_MAP      0x00000010      /* Can be mapped for reading */
- #define BDI_CAP_WRITE_MAP     0x00000020      /* Can be mapped for writing */
- #define BDI_CAP_EXEC_MAP      0x00000040      /* Can be mapped for execution */
-+#define BDI_CAP_PAGE_CONSTANT_WRITE   0x00000080      /* Zcopy write - for raid5 */
- #define BDI_CAP_VMFLAGS \
-       (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
-@@ -94,11 +95,18 @@
- #define bdi_cap_account_dirty(bdi) \
-       (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
-+#define bdi_cap_page_constant_write(bdi) \
-+      ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
-+
- #define mapping_cap_writeback_dirty(mapping) \
-       bdi_cap_writeback_dirty((mapping)->backing_dev_info)
- #define mapping_cap_account_dirty(mapping) \
-       bdi_cap_account_dirty((mapping)->backing_dev_info)
-+#define mapping_cap_page_constant_write(mapping) \
-+      bdi_cap_page_constant_write((mapping)->backing_dev_info)
-+      
-+
- #endif                /* _LINUX_BACKING_DEV_H */
-Index: linux-2.6.18-128.1.6/include/linux/page-flags.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/page-flags.h       2009-04-14 21:05:24.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/page-flags.h    2009-06-02 23:24:55.000000000 -0600
-@@ -86,6 +86,7 @@
- #define PG_reclaim            17      /* To be reclaimed asap */
- #define PG_nosave_free                18      /* Free, should not be written */
- #define PG_buddy              19      /* Page is free, on buddy lists */
-+#define PG_constant           21      /* To mark if the page is constant */
- #define PG_xpmem              27      /* Testing for xpmem. */
- /* PG_owner_priv_1 users should have descriptive aliases */
-@@ -283,6 +284,14 @@
- struct page;  /* forward declaration */
-+#define PageConstant(page)    test_bit(PG_constant, &(page)->flags)
-+#define SetPageConstant(page)         set_bit(PG_constant, &(page)->flags)
-+#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
-+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
-+
-+extern int set_page_constant(struct page *page);
-+extern void clear_page_constant(struct page *);
-+
- int test_clear_page_dirty(struct page *page);
- int test_clear_page_writeback(struct page *page);
- int test_set_page_writeback(struct page *page);
-Index: linux-2.6.18-128.1.6/include/linux/raid/raid5.h
-===================================================================
---- linux-2.6.18-128.1.6.orig/include/linux/raid/raid5.h       2009-06-02 23:24:50.000000000 -0600
-+++ linux-2.6.18-128.1.6/include/linux/raid/raid5.h    2009-06-02 23:24:55.000000000 -0600
-@@ -156,8 +156,9 @@
- #define       R5_Overlap      7       /* There is a pending overlapping request on this block */
- #define       R5_ReadError    8       /* seen a read error here recently */
- #define       R5_ReWrite      9       /* have tried to over-write the readerror */
--
- #define       R5_Expanded     10      /* This block now has post-expand data */
-+#define       R5_Direct       11      /* Use the pages in bio to do the write directly. */
-+
- /*
-  * Write method
-  */
-Index: linux-2.6.18-128.1.6/mm/filemap.c
-===================================================================
---- linux-2.6.18-128.1.6.orig/mm/filemap.c     2009-04-14 21:05:46.000000000 -0600
-+++ linux-2.6.18-128.1.6/mm/filemap.c  2009-06-02 23:24:55.000000000 -0600
-@@ -30,6 +30,7 @@
- #include <linux/security.h>
- #include <linux/syscalls.h>
- #include <linux/cpuset.h>
-+#include <linux/rmap.h>
- #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
- #include <trace/mm.h>
- #include "internal.h"
-@@ -567,11 +568,55 @@
-               if (!test_clear_page_writeback(page))
-                       BUG();
-       }
-+      clear_page_constant(page);
-       smp_mb__after_clear_bit();
-       wake_up_page(page, PG_writeback);
- }
- EXPORT_SYMBOL(end_page_writeback);
-+/* Make a page to be constant, `constant' means any write to this page will
-+ * be blocked until clear_page_constant is called.
-+ * The page lock must be held.
-+ */
-+int set_page_constant(struct page *page)
-+{
-+      BUG_ON(!PageLocked(page));
-+
-+      /* If it's an anonymous page and haven't been added to swap cache,
-+       * return directly because we have no way to swap this page.
-+       */
-+      if (page_mapping(page) == NULL)
-+              return SWAP_FAIL;
-+
-+      BUG_ON(!PageUptodate(page));
-+
-+      /* I have to clear page uptodate before trying to remove
-+       * it from user's page table because otherwise, the page may be
-+       * reinstalled by a page access which happens between try_to_unmap()
-+       * and ClearPageUptodate(). -jay
-+       */
-+      ClearPageUptodate(page);
-+      if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
-+              SetPageUptodate(page);
-+              return SWAP_FAIL;
-+      }
-+      SetPageConstant(page);
-+      return SWAP_SUCCESS;
-+}
-+
-+void clear_page_constant(struct page *page)
-+{
-+      if (PageConstant(page)) {
-+              BUG_ON(!PageLocked(page));
-+              BUG_ON(PageUptodate(page));
-+              ClearPageConstant(page);
-+              SetPageUptodate(page);
-+              unlock_page(page);
-+      }
-+}
-+EXPORT_SYMBOL(set_page_constant);
-+EXPORT_SYMBOL(clear_page_constant);
-+
- /**
-  * __lock_page - get a lock on the page, assuming we need to sleep to get it
-  * @page: the page to lock