Whamcloud - gitweb
Branch b1_8
[fs/lustre-release.git] / lustre / kernel_patches / patches / raid6-zerocopy.patch
1 diff -pur linux-2.6.9-67.orig/drivers/md/raid6main.c linux-2.6.9-67/drivers/md/raid6main.c
2 --- linux-2.6.9-67.orig/drivers/md/raid6main.c  2009-02-15 10:24:30.000000000 +0800
3 +++ linux-2.6.9-67/drivers/md/raid6main.c       2009-02-15 10:26:17.000000000 +0800
4 @@ -430,6 +430,9 @@ static int raid6_end_read_request (struc
5                 clear_buffer_uptodate(bh);
6         }
7  #endif
8 +       /* Read on a Direct write is allowable */
9 +       /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
10 +       BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
11         clear_bit(R5_LOCKED, &sh->dev[i].flags);
12         set_bit(STRIPE_HANDLE, &sh->state);
13         release_stripe(sh);
14 @@ -468,6 +471,10 @@ static int raid6_end_write_request (stru
15  
16         rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
17  
18 +       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
19 +               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
20 +               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
21 +       }
22         clear_bit(R5_LOCKED, &sh->dev[i].flags);
23         set_bit(STRIPE_HANDLE, &sh->state);
24         __release_stripe(conf, sh);
25 @@ -664,7 +671,27 @@ static sector_t compute_blocknr(struct s
26         return r_sector;
27  }
28  
29 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
30 +{
31 +       sector_t bi_sector = bio->bi_sector;
32 +       struct page *page = NULL;
33 +       struct bio_vec *bvl;
34 +       int i;
35  
36 +       bio_for_each_segment(bvl, bio, i) {
37 +               if (sector == bi_sector)
38 +                       page = bio_iovec_idx(bio, i)->bv_page;
39 +               bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
40 +               if (bi_sector >= sector + STRIPE_SECTORS) {
41 +                       /* check if the stripe is covered by one page */
42 +                       if (page == bio_iovec_idx(bio, i)->bv_page &&
43 +                           PageConstant(page))
44 +                               return page;
45 +                       return NULL;
46 +               }
47 +       }
48 +       return NULL;
49 +}
50  
51  /*
52   * Copy data between a page in the stripe cache, and one or more bion
53 @@ -731,6 +758,7 @@ static void compute_parity(struct stripe
54         raid6_conf_t *conf = sh->raid_conf;
55         int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
56         struct bio *chosen;
57 +       struct page *page;
58         /**** FIX THIS: This could be very bad if disks is close to 256 ****/
59         void *ptrs[disks];
60  
61 @@ -761,18 +789,46 @@ static void compute_parity(struct stripe
62                 BUG();          /* Not implemented yet */
63         }
64  
65 -       for (i = disks; i--;)
66 -               if (sh->dev[i].written) {
67 -                       sector_t sector = sh->dev[i].sector;
68 -                       struct bio *wbi = sh->dev[i].written;
69 -                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
70 -                               copy_data(1, wbi, sh->dev[i].page, sector);
71 -                               wbi = r5_next_bio(wbi, sector);
72 +       for (i = disks; i--;) {
73 +               struct bio *wbi = sh->dev[i].written;
74 +               sector_t sector;
75 +
76 +               if (!wbi)
77 +                       continue;
78 +
79 +               sector = sh->dev[i].sector;
80 +               set_bit(R5_LOCKED, &sh->dev[i].flags);
81 +               BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
82 +
83 +               /* check if it's covered by a single page
84 +                * and whole stripe is written at once.
85 +                * in this case we can avoid memcpy() */
86 +               if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
87 +                   test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
88 +                       page = zero_copy_data(wbi, sector);
89 +                       /* we don't do zerocopy on a HighMem page. RAID6 tends
90 +                        * to prepare all of the pages' content to be accessed
91 +                        * before computing PQ parity. If we ever need to support
92 +                        * HighMem pages as well, we must modify the gen_syndrome()
93 +                        * algorithm. -jay */
94 +                       if (page && !PageHighMem(page)) {
95 +                               atomic_inc(&conf->writes_zcopy);
96 +                               sh->dev[i].req.bi_io_vec[0].bv_page = page;
97 +                               set_bit(R5_Direct, &sh->dev[i].flags);
98 +                               clear_bit(R5_UPTODATE, &sh->dev[i].flags);
99 +                               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
100 +                               continue;
101                         }
102 +               }
103  
104 -                       set_bit(R5_LOCKED, &sh->dev[i].flags);
105 -                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
106 +               atomic_inc(&conf->writes_copied);
107 +               clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
108 +               set_bit(R5_UPTODATE, &sh->dev[i].flags);
109 +               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
110 +                       copy_data(1, wbi, sh->dev[i].page, sector);
111 +                       wbi = r5_next_bio(wbi, sector);
112                 }
113 +       }
114  
115  //     switch(method) {
116  //     case RECONSTRUCT_WRITE:
117 @@ -783,7 +839,10 @@ static void compute_parity(struct stripe
118                 count = 0;
119                 i = d0_idx;
120                 do {
121 -                       ptrs[count++] = page_address(sh->dev[i].page);
122 +                       if (test_bit(R5_Direct, &sh->dev[i].flags))
123 +                               ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
124 +                       else
125 +                               ptrs[count++] = page_address(sh->dev[i].page);
126  
127                         i = raid6_next_disk(i, disks);
128                 } while ( i != d0_idx );
129 @@ -1185,7 +1244,8 @@ static void handle_stripe(struct stripe_
130                         if (sh->dev[i].written) {
131                                 dev = &sh->dev[i];
132                                 if (!test_bit(R5_LOCKED, &dev->flags) &&
133 -                                   test_bit(R5_UPTODATE, &dev->flags) ) {
134 +                                   (test_bit(R5_UPTODATE, &dev->flags) ||
135 +                                        test_bit(R5_Direct, &dev->flags)) ) {
136                                         /* We can return any write requests */
137                                         struct bio *wbi, *wbi2;
138                                         PRINTK("Return write for stripe %llu disc %d\n",
139 @@ -1193,6 +1253,7 @@ static void handle_stripe(struct stripe_
140                                         spin_lock_irq(&conf->device_lock);
141                                         wbi = dev->written;
142                                         dev->written = NULL;
143 +                                       clear_bit(R5_Direct, &dev->flags);
144                                         while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
145                                                 wbi2 = r5_next_bio(wbi, dev->sector);
146                                                 if (--wbi->bi_phys_segments == 0) {
147 @@ -1503,6 +1564,15 @@ static void handle_stripe(struct stripe_
148                 } else {
149                         PRINTK("skip op %ld on disc %d for sector %llu\n",
150                                 bi->bi_rw, i, (unsigned long long)sh->sector);
151 +
152 +                       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
153 +                               /* restore the page pointer of req; otherwise
154 +                                * no read would be permitted on this stripe,
155 +                                * which is not what we want. -jay */
156 +                               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
157 +                               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
158 +                       }
159 +
160                         clear_bit(R5_LOCKED, &sh->dev[i].flags);
161                         set_bit(STRIPE_HANDLE, &sh->state);
162                         atomic_dec(&conf->delayed);
163 @@ -2008,6 +2078,7 @@ static int run (mddev_t *mddev)
164                 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
165                         mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
166         }
167 +       mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
168  
169         /* Ok, everything is just fine now */
170         mddev->array_size =  mddev->size * (mddev->raid_disks - 2);
171 @@ -2095,9 +2166,11 @@ static void status (struct seq_file *seq
172                 atomic_read(&conf->handled_in_raid5d),
173                 atomic_read(&conf->out_of_stripes),
174                 atomic_read(&conf->handle_called));
175 -       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
176 +       seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
177                 atomic_read(&conf->reads_for_rmw),
178 -               atomic_read(&conf->reads_for_rcw));
179 +               atomic_read(&conf->reads_for_rcw),
180 +               atomic_read(&conf->writes_zcopy),
181 +               atomic_read(&conf->writes_copied));
182         seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
183                 atomic_read(&conf->delayed),
184                 atomic_read(&conf->active_stripes),