1 diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
2 --- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 19:09:20.000000000 +0800
3 +++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:09:32.000000000 +0800
4 @@ -633,6 +633,7 @@ static int raid5_end_read_request(struct
5 clear_buffer_uptodate(bh);
8 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
9 clear_bit(R5_LOCKED, &sh->dev[i].flags);
10 set_bit(STRIPE_HANDLE, &sh->state);
12 @@ -671,6 +672,10 @@ static int raid5_end_write_request (stru
14 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
16 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
17 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
18 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
20 clear_bit(R5_LOCKED, &sh->dev[i].flags);
21 set_bit(STRIPE_HANDLE, &sh->state);
22 __release_stripe(conf, sh);
23 @@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s
27 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
29 + sector_t bi_sector = bio->bi_sector;
30 + struct page *page = NULL;
31 + struct bio_vec *bvl;
34 + bio_for_each_segment(bvl, bio, i) {
35 + if (sector == bi_sector)
36 + page = bio_iovec_idx(bio, i)->bv_page;
37 + bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
38 + if (bi_sector >= sector + STRIPE_SECTORS) {
39 + /* check if the stripe is covered by one page */
40 + if (page == bio_iovec_idx(bio, i)->bv_page &&
50 * Copy data between a page in the stripe cache, and one or more bion
51 @@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip
53 raid5_conf_t *conf = sh->raid_conf;
54 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
55 - void *ptr[MAX_XOR_BLOCKS];
56 + void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
60 PRINTK("compute_parity5, stripe %llu, method %d\n",
61 (unsigned long long)sh->sector, method);
62 @@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip
66 - for (i = disks; i--;)
67 - if (sh->dev[i].written) {
68 - sector_t sector = sh->dev[i].sector;
69 - struct bio *wbi = sh->dev[i].written;
70 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
71 - copy_data(1, wbi, sh->dev[i].page, sector);
72 - wbi = r5_next_bio(wbi, sector);
73 + for (i = disks; i--;) {
74 + struct r5dev *dev = &sh->dev[i];
75 + struct bio *wbi = dev->written;
81 + sector = dev->sector;
82 + set_bit(R5_LOCKED, &sh->dev[i].flags);
83 + BUG_ON(test_bit(R5_Direct, &dev->flags));
85 + /* check if it's covered by a single page
86 + * and whole stripe is written at once.
87 + * in this case we can avoid memcpy() */
88 + if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
89 + test_bit(R5_Insync, &dev->flags)) {
90 + page = zero_copy_data(wbi, sector);
92 + atomic_inc(&conf->writes_zcopy);
93 + dev->req.bi_io_vec[0].bv_page = page;
94 + set_bit(R5_Direct, &dev->flags);
95 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
96 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
101 - set_bit(R5_LOCKED, &sh->dev[i].flags);
102 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
103 + /* do copy write */
104 + atomic_inc(&conf->writes_copied);
105 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
106 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
107 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
108 + copy_data(1, wbi, sh->dev[i].page, sector);
109 + wbi = r5_next_bio(wbi, sector);
115 case RECONSTRUCT_WRITE:
117 - for (i=disks; i--;)
119 - ptr[count++] = page_address(sh->dev[i].page);
121 + for (i=disks; i--;) {
124 + if (test_bit(R5_Direct, &sh->dev[i].flags))
125 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
127 + page = sh->dev[i].page;
129 + /* have to compute the parity immediately for
130 + * a highmem page. it would happen for zerocopy. -jay
132 + if (PageHighMem(page)) {
133 + h_ptr[1] = kmap_atomic(page, KM_USER0);
134 + xor_block(2, STRIPE_SIZE, h_ptr);
135 + kunmap_atomic(page, KM_USER0);
137 + ptr[count++] = page_address(page);
142 case READ_MODIFY_WRITE:
143 - for (i = disks; i--;)
144 - if (sh->dev[i].written) {
145 - ptr[count++] = page_address(sh->dev[i].page);
147 + for (i = disks; i--;) {
148 + if (!sh->dev[i].written)
150 + if (test_bit(R5_Direct, &sh->dev[i].flags))
151 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
153 + page = sh->dev[i].page;
155 + /* have to compute the parity immediately for
156 + * a highmem page. it would happen for zerocopy. -jay
158 + if (PageHighMem(page)) {
159 + h_ptr[1] = kmap_atomic(page, KM_USER0);
160 + xor_block(2, STRIPE_SIZE, h_ptr);
161 + kunmap_atomic(page, KM_USER0);
163 + ptr[count++] = page_address(page);
169 xor_block(count, STRIPE_SIZE, ptr);
170 @@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip
171 raid6_conf_t *conf = sh->raid_conf;
172 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
175 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
178 @@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip
179 BUG(); /* Not implemented yet */
182 - for (i = disks; i--;)
183 - if (sh->dev[i].written) {
184 - sector_t sector = sh->dev[i].sector;
185 - struct bio *wbi = sh->dev[i].written;
186 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
187 - copy_data(1, wbi, sh->dev[i].page, sector);
188 - wbi = r5_next_bio(wbi, sector);
189 + for (i = disks; i--;) {
190 + struct r5dev *dev = &sh->dev[i];
191 + struct bio *wbi = dev->written;
197 + sector = sh->dev[i].sector;
198 + set_bit(R5_LOCKED, &sh->dev[i].flags);
199 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
201 + /* check if it's covered by a single page
202 + * and whole stripe is written at once.
203 + * in this case we can avoid memcpy() */
204 + if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
205 + test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
206 + page = zero_copy_data(wbi, sector);
207 + /* we don't do zerocopy on a HighMem page. Raid6 tends
208 + * to prepare all of the pages' content to be accessed
209 + * before computing PQ parity. If we need to support HighMem
210 + * page also, we have to modify the gen_syndrome()
211 + * algorithm. -jay */
212 + if (page && !PageHighMem(page)) {
213 + atomic_inc(&conf->writes_zcopy);
214 + sh->dev[i].req.bi_io_vec[0].bv_page = page;
215 + set_bit(R5_Direct, &sh->dev[i].flags);
216 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
217 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
222 - set_bit(R5_LOCKED, &sh->dev[i].flags);
223 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
224 + atomic_inc(&conf->writes_copied);
225 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
226 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
227 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
228 + copy_data(1, wbi, sh->dev[i].page, sector);
229 + wbi = r5_next_bio(wbi, sector);
234 // case RECONSTRUCT_WRITE:
235 @@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip
239 - ptrs[count++] = page_address(sh->dev[i].page);
240 - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
241 + if (test_bit(R5_Direct, &sh->dev[i].flags))
242 + ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
244 + ptrs[count++] = page_address(sh->dev[i].page);
245 + if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
246 + !test_bit(R5_Direct, &sh->dev[i].flags))
247 printk("block %d/%d not uptodate on parity calc\n", i,count);
248 i = raid6_next_disk(i, disks);
249 } while ( i != d0_idx );
250 @@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe
251 if (sh->dev[i].written) {
253 if (!test_bit(R5_LOCKED, &dev->flags) &&
254 - test_bit(R5_UPTODATE, &dev->flags) ) {
255 + (test_bit(R5_UPTODATE, &dev->flags) ||
256 + test_bit(R5_Direct, &dev->flags)) ) {
257 /* We can return any write requests */
258 struct bio *wbi, *wbi2;
260 @@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe
261 spin_lock_irq(&conf->device_lock);
264 + clear_bit(R5_Direct, &dev->flags);
265 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
266 wbi2 = r5_next_bio(wbi, dev->sector);
267 if (--wbi->bi_phys_segments == 0) {
268 @@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe
269 if (sh->dev[i].written) {
271 if (!test_bit(R5_LOCKED, &dev->flags) &&
272 - test_bit(R5_UPTODATE, &dev->flags) ) {
273 + (test_bit(R5_UPTODATE, &dev->flags) ||
274 + test_bit(R5_Direct, &dev->flags)) ) {
275 /* We can return any write requests */
277 struct bio *wbi, *wbi2;
278 @@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe
279 spin_lock_irq(&conf->device_lock);
282 + clear_bit(R5_Direct, &dev->flags);
283 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
284 wbi2 = r5_next_bio(wbi, dev->sector);
285 if (--wbi->bi_phys_segments == 0) {
286 @@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev)
287 mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
288 mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
290 + /* raid5 device is able to do zcopy right now. */
291 + mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
296 @@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq
297 atomic_read(&conf->handled_in_raid5d),
298 atomic_read(&conf->out_of_stripes),
299 atomic_read(&conf->handle_called));
300 - seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
301 + seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
302 atomic_read(&conf->reads_for_rmw),
303 - atomic_read(&conf->reads_for_rcw));
304 + atomic_read(&conf->reads_for_rcw),
305 + atomic_read(&conf->writes_zcopy),
306 + atomic_read(&conf->writes_copied));
307 seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
308 atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
309 atomic_read(&conf->active_stripes),
310 diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h
311 --- linux-2.6.18-53.orig/include/linux/backing-dev.h 2007-12-28 14:49:26.000000000 +0800
312 +++ linux-2.6.18-53/include/linux/backing-dev.h 2007-12-28 19:09:32.000000000 +0800
313 @@ -48,6 +48,7 @@ struct backing_dev_info {
314 #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
315 #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
316 #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
317 +#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */
318 #define BDI_CAP_VMFLAGS \
319 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
321 @@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
322 #define bdi_cap_account_dirty(bdi) \
323 (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
325 +#define bdi_cap_page_constant_write(bdi) \
326 + ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
328 #define mapping_cap_writeback_dirty(mapping) \
329 bdi_cap_writeback_dirty((mapping)->backing_dev_info)
331 #define mapping_cap_account_dirty(mapping) \
332 bdi_cap_account_dirty((mapping)->backing_dev_info)
334 +#define mapping_cap_page_constant_write(mapping) \
335 + bdi_cap_page_constant_write((mapping)->backing_dev_info)
339 #endif /* _LINUX_BACKING_DEV_H */
340 diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h
341 --- linux-2.6.18-53.orig/include/linux/page-flags.h 2007-12-28 14:49:26.000000000 +0800
342 +++ linux-2.6.18-53/include/linux/page-flags.h 2007-12-28 19:09:32.000000000 +0800
344 #define PG_reclaim 17 /* To be reclaimed asap */
345 #define PG_nosave_free 18 /* Free, should not be written */
346 #define PG_buddy 19 /* Page is free, on buddy lists */
347 +#define PG_constant 20 /* To mark if the page is constant */
349 /* PG_owner_priv_1 users should have descriptive aliases */
350 #define PG_checked PG_owner_priv_1 /* Used by some filesystems */
353 struct page; /* forward declaration */
355 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
356 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
357 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
358 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
360 +extern int set_page_constant(struct page *page);
361 +extern void clear_page_constant(struct page *);
363 int test_clear_page_dirty(struct page *page);
364 int test_clear_page_writeback(struct page *page);
365 int test_set_page_writeback(struct page *page);
366 diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
367 --- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 18:55:24.000000000 +0800
368 +++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 19:09:32.000000000 +0800
369 @@ -156,8 +156,9 @@ struct stripe_head {
370 #define R5_Overlap 7 /* There is a pending overlapping request on this block */
371 #define R5_ReadError 8 /* seen a read error here recently */
372 #define R5_ReWrite 9 /* have tried to over-write the readerror */
374 #define R5_Expanded 10 /* This block now has post-expand data */
375 +#define R5_Direct 11 /* Use the pages in bio to do the write directly. */
380 diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c
381 --- linux-2.6.18-53.orig/mm/filemap.c 2007-12-28 14:49:26.000000000 +0800
382 +++ linux-2.6.18-53/mm/filemap.c 2007-12-28 19:09:32.000000000 +0800
384 #include <linux/security.h>
385 #include <linux/syscalls.h>
386 #include <linux/cpuset.h>
387 +#include <linux/rmap.h>
389 #include "internal.h"
391 @@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
392 if (!test_clear_page_writeback(page))
395 + clear_page_constant(page);
396 smp_mb__after_clear_bit();
397 wake_up_page(page, PG_writeback);
399 EXPORT_SYMBOL(end_page_writeback);
401 +/* Make a page to be constant, `constant' means any write to this page will
402 + * be blocked until clear_page_constant is called.
403 + * The page lock must be held.
405 +int set_page_constant(struct page *page)
407 + BUG_ON(!PageLocked(page));
409 + /* If it's an anonymous page and haven't been added to swap cache,
410 + * return directly because we have no way to swap this page.
412 + if (page_mapping(page) == NULL)
415 + BUG_ON(!PageUptodate(page));
417 + /* I have to clear page uptodate before trying to remove
418 + * it from user's page table because otherwise, the page may be
419 + * reinstalled by a page access which happens between try_to_unmap()
420 + * and ClearPageUptodate(). -jay
422 + ClearPageUptodate(page);
423 + if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
424 + SetPageUptodate(page);
427 + SetPageConstant(page);
428 + return SWAP_SUCCESS;
431 +void clear_page_constant(struct page *page)
433 + if (PageConstant(page)) {
434 + BUG_ON(!PageLocked(page));
435 + BUG_ON(PageUptodate(page));
436 + ClearPageConstant(page);
437 + SetPageUptodate(page);
441 +EXPORT_SYMBOL(set_page_constant);
442 +EXPORT_SYMBOL(clear_page_constant);
445 * __lock_page - get a lock on the page, assuming we need to sleep to get it
446 * @page: the page to lock