In case of full-stripe writes, don't copy the data into the internal stripe cache: compute parity directly from the pages attached to the incoming bios. This optimization reduces CPU load by roughly 30%.

Index: linux-2.6.9/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.9.orig/include/linux/raid/raid5.h	2006-05-21 17:57:25.000000000 +0400
+++ linux-2.6.9/include/linux/raid/raid5.h	2006-05-22 00:10:04.000000000 +0400
@@ -152,6 +152,7 @@ struct stripe_head {
 #define	R5_Wantread	4	/* want to schedule a read */
 #define	R5_Wantwrite	5
 #define	R5_Syncio	6	/* this io need to be accounted as resync io */
+#define	R5_Direct	7	/* use page from passed bio to avoid memcpy */
 
 /*
  * Write method
Index: linux-2.6.9/drivers/md/raid5.c
===================================================================
--- linux-2.6.9.orig/drivers/md/raid5.c	2006-05-22 00:10:01.000000000 +0400
+++ linux-2.6.9/drivers/md/raid5.c	2006-05-22 00:10:04.000000000 +0400
@@ -411,6 +411,8 @@ static int raid5_end_read_request (struc
 		clear_buffer_uptodate(bh);
 	}
 #endif
+	if (test_bit(R5_Direct, &sh->dev[i].flags))
+		printk("R5_Direct for READ ?!\n");
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -449,6 +451,10 @@ static int raid5_end_write_request (stru
 
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 
+	if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	__release_stripe(conf, sh);
@@ -673,6 +679,49 @@ static void copy_data(int frombio, struc
 	}
 }
 
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+	struct bio_vec *bvl;
+	int i;
+
+	for (; bio && bio->bi_sector < sector + STRIPE_SECTORS;
+	     bio = r5_next_bio(bio, sector)) {
+		int page_offset;
+		if (bio->bi_sector >= sector)
+			page_offset = (signed)(bio->bi_sector - sector) * 512;
+		else
+			page_offset = (signed)(sector - bio->bi_sector) * -512;
+		bio_for_each_segment(bvl, bio, i) {
+			int len = bio_iovec_idx(bio, i)->bv_len;
+			int clen;
+			int b_offset = 0;
+
+			if (page_offset < 0) {
+				b_offset = -page_offset;
+				page_offset += b_offset;
+				len -= b_offset;
+			}
+
+			if (len > 0 && page_offset + len > STRIPE_SIZE)
+				clen = STRIPE_SIZE - page_offset;
+			else clen = len;
+
+			if (clen > 0) {
+				BUG_ON(clen < STRIPE_SIZE);
+				/*printk(" sector %lu: page %p from index %u\n",
+					(unsigned long) sector,
+					bio_iovec_idx(bio, i)->bv_page, i);*/
+				return bio_iovec_idx(bio, i)->bv_page;
+			}
+			if (clen < len) /* hit end of page */
+				break;
+			page_offset += len;
+		}
+	}
+	BUG();
+	return NULL;
+}
+
 #define check_xor()	do {						\
 			      if (count == MAX_XOR_BLOCKS) {		\
 				xor_block(count, STRIPE_SIZE, ptr);	\
@@ -717,6 +766,8 @@ static void compute_parity(struct stripe
 	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
 	void *ptr[MAX_XOR_BLOCKS];
 	struct bio *chosen;
+	struct page *page;
+	int zerocopy = 0;
 
 	PRINTK("compute_parity, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -743,13 +794,17 @@ static void compute_parity(struct stripe
 		break;
 	case RECONSTRUCT_WRITE:
 		memset(ptr[0], 0, STRIPE_SIZE);
-		for (i= disks; i-- ;)
+		zerocopy = 1;
+		for (i= disks; i-- ;) {
+			if (i != pd_idx && !sh->dev[i].towrite)
+				zerocopy = 0;
 			if (i!=pd_idx && sh->dev[i].towrite) {
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 			}
+		}
 		break;
 	case CHECK_PARITY:
 		break;
@@ -759,34 +814,62 @@ static void compute_parity(struct stripe
 		count = 1;
 	}
 
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
-			}
+	for (i = disks; i--;) {
+		struct bio *wbi = sh->dev[i].written;
+		sector_t sector;
+
+		if (!wbi)
+			continue;
+
+		sector = sh->dev[i].sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+		/* check if it's covered by a single page
+		 * and whole stripe is written at once.
+		 * in this case we can avoid memcpy() */
+		if (zerocopy && wbi && wbi->bi_next == NULL &&
+		    test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+			page = zero_copy_data(wbi, sector);
+			BUG_ON(PageHighMem(page));
+			sh->dev[i].req.bi_io_vec[0].bv_page = page;
+			set_bit(R5_Direct, &sh->dev[i].flags);
+			clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+			continue;
+		}
 
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+			copy_data(1, wbi, sh->dev[i].page, sector);
+			wbi = r5_next_bio(wbi, sector);
 		}
+	}
 
 	switch(method) {
 	case RECONSTRUCT_WRITE:
 	case CHECK_PARITY:
-		for (i=disks; i--;)
-			if (i != pd_idx) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
-			}
+		for (i=disks; i--;) {
+			if (i == pd_idx)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+			ptr[count++] = page_address(page);
+			check_xor();
+		}
 		break;
 	case READ_MODIFY_WRITE:
-		for (i = disks; i--;)
-			if (sh->dev[i].written) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
-			}
+		for (i = disks; i--;) {
+			if (!sh->dev[i].written)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+			ptr[count++] = page_address(page);
+			check_xor();
+		}
 	}
 	if (count != 1)
 		xor_block(count, STRIPE_SIZE, ptr);
@@ -1012,7 +1094,7 @@ static void handle_stripe(struct stripe_
 	dev = &sh->dev[sh->pd_idx];
 	if ( written &&
 	     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-		test_bit(R5_UPTODATE, &dev->flags))
+		(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Direct, &dev->flags)))
 	       || (failed == 1 && failed_num == sh->pd_idx))
 	    ) {
 	    /* any written block on an uptodate or failed drive can be returned.
@@ -1023,13 +1105,16 @@ static void handle_stripe(struct stripe_
 	    if (sh->dev[i].written) {
 		dev = &sh->dev[i];
 		if (!test_bit(R5_LOCKED, &dev->flags) &&
-		    test_bit(R5_UPTODATE, &dev->flags) ) {
+		    (test_bit(R5_UPTODATE, &dev->flags) ||
+		     test_bit(R5_Direct, &dev->flags)) ) {
 		    /* We can return any write requests */
 		    struct bio *wbi, *wbi2;
 		    PRINTK("Return write for disc %d\n", i);
 		    spin_lock_irq(&conf->device_lock);
 		    wbi = dev->written;
 		    dev->written = NULL;
+		    if (test_bit(R5_Direct, &dev->flags))
+			    clear_bit(R5_Direct, &dev->flags);
 		    while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 			    wbi2 = r5_next_bio(wbi, dev->sector);
 			    if (--wbi->bi_phys_segments == 0) {
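
For readers who want the idea without the diff context, here is a minimal, self-contained userspace sketch (hypothetical names such as NDATA, CHUNK and xor_parity(); this is not the kernel code, and it ignores locking, highmem and bio segment layout): when every data unit of the stripe is fully overwritten, i.e. a full-stripe write, parity can be XORed directly from the caller's buffers and the staging memcpy() into the stripe cache is skipped.

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	#define NDATA	3	/* data disks (parity disk excluded) */
	#define CHUNK	4096	/* bytes per disk per stripe, i.e. STRIPE_SIZE */

	/* XOR the NDATA source buffers together into 'parity'. */
	static void xor_parity(uint8_t *parity, uint8_t *const src[NDATA])
	{
		memset(parity, 0, CHUNK);
		for (int d = 0; d < NDATA; d++)
			for (int b = 0; b < CHUNK; b++)
				parity[b] ^= src[d][b];
	}

	int main(void)
	{
		static uint8_t caller[NDATA][CHUNK];	/* pages owned by the writer's bios */
		static uint8_t cache[NDATA][CHUNK];	/* internal stripe cache pages */
		static uint8_t parity[CHUNK];
		uint8_t *src[NDATA];
		int full_stripe = 1;			/* every data unit overwritten? */

		for (int d = 0; d < NDATA; d++)
			memset(caller[d], d + 1, CHUNK);

		if (full_stripe) {
			/* zero-copy path: XOR straight from the caller's buffers */
			for (int d = 0; d < NDATA; d++)
				src[d] = caller[d];
		} else {
			/* fallback path: stage the data in the cache, then XOR */
			for (int d = 0; d < NDATA; d++) {
				memcpy(cache[d], caller[d], CHUNK);
				src[d] = cache[d];
			}
		}

		xor_parity(parity, src);
		printf("parity[0] = 0x%02x\n", parity[0]);	/* 0x01^0x02^0x03 = 0x00 */
		return 0;
	}

Note the two costs the real patch pays for this path: raid5_end_write_request() must restore req.bi_io_vec[0].bv_page to the internal cache page once the write completes, and the zero-copy path refuses highmem pages (BUG_ON(PageHighMem(page))) because xor_block() reaches the data through page_address().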