For full-stripe writes, don't copy the data into the internal stripe cache.
This optimization reduces CPU load by roughly 30%.
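The core idea, as a standalone sketch outside the kernel context: when a write
overwrites an entire stripe-cache page from a single buffer, the disk I/O can be
pointed at the caller's buffer directly instead of staging the data through the
cache. The names below (stripe_unit, io_page, queue_write) are illustrative
stand-ins, not the raid5 structures the patch modifies:

	/*
	 * Standalone sketch of the zero-copy decision; types and names are
	 * hypothetical. STRIPE_SIZE stands in for the stripe-cache page size.
	 */
	#include <stddef.h>
	#include <string.h>

	#define STRIPE_SIZE 4096

	struct stripe_unit {
		char cache_page[STRIPE_SIZE];	/* private stripe-cache buffer */
		const char *io_page;		/* buffer actually sent to disk */
		int direct;			/* analogue of the R5_Direct flag */
	};

	/* Queue a write of len bytes at offset within one stripe unit. */
	static void queue_write(struct stripe_unit *su, const char *buf,
				size_t offset, size_t len)
	{
		if (offset == 0 && len == STRIPE_SIZE) {
			/* full overwrite from one buffer: point the I/O at
			 * the caller's buffer and skip the copy */
			su->io_page = buf;
			su->direct = 1;
			return;
		}
		/* partial write: stage the data through the cache page */
		memcpy(su->cache_page + offset, buf, len);
		su->io_page = su->cache_page;
		su->direct = 0;
	}

The patch applies the same test per stripe device in compute_parity(): the
direct path is taken only when the whole stripe is written at once.
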
Index: linux-2.6.9/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.9.orig/include/linux/raid/raid5.h	2006-05-21 17:57:25.000000000 +0400
+++ linux-2.6.9/include/linux/raid/raid5.h	2006-05-22 00:10:04.000000000 +0400
@@ -152,6 +152,7 @@ struct stripe_head {
 #define	R5_Wantread	4	/* want to schedule a read */
 #define	R5_Wantwrite	5
 #define	R5_Syncio	6	/* this io need to be accounted as resync io */
+#define	R5_Direct	7	/* use page from passed bio to avoid memcpy */
Index: linux-2.6.9/drivers/md/raid5.c
===================================================================
--- linux-2.6.9.orig/drivers/md/raid5.c	2006-05-22 00:10:01.000000000 +0400
+++ linux-2.6.9/drivers/md/raid5.c	2006-05-22 00:10:04.000000000 +0400
@@ -411,6 +411,8 @@ static int raid5_end_read_request (struc
 		clear_buffer_uptodate(bh);
+	if (test_bit(R5_Direct, &sh->dev[i].flags))
+		printk("R5_Direct for READ ?!\n");
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -449,6 +451,10 @@ static int raid5_end_write_request (stru
 	rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 
+	if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+		BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+		sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+	}
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	__release_stripe(conf, sh);
@@ -673,6 +679,49 @@ static void copy_data(int frombio, struc
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+	struct bio_vec *bvl;
+	int i;
+
+	for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
+	      bio = r5_next_bio(bio, sector) ) {
+		int page_offset;
+		if (bio->bi_sector >= sector)
+			page_offset = (signed)(bio->bi_sector - sector) * 512;
+		else
+			page_offset = (signed)(sector - bio->bi_sector) * -512;
+		bio_for_each_segment(bvl, bio, i) {
+			int len = bio_iovec_idx(bio,i)->bv_len;
+			int clen;
+			int b_offset = 0;
+
+			if (page_offset < 0) {
+				b_offset = -page_offset;
+				page_offset += b_offset;
+				len -= b_offset;
+			}
+			if (len > 0 && page_offset + len > STRIPE_SIZE)
+				clen = STRIPE_SIZE - page_offset;
+			else
+				clen = len;
+			if (clen > 0) {
+				/* the segment must cover the whole stripe page */
+				BUG_ON(clen < STRIPE_SIZE);
+				/*printk(" sector %lu: page %p from index %u\n",
+				       (unsigned long) sector,
+				       bio_iovec_idx(bio, i)->bv_page, i);*/
+				return bio_iovec_idx(bio, i)->bv_page;
+			}
+			if (clen < len) /* hit end of page */
+				break;
+			page_offset += len;
+		}
+	}
+	BUG();
+	return NULL;
+}
+
 #define check_xor()	do {						\
 			   if (count == MAX_XOR_BLOCKS) {		\
 				xor_block(count, STRIPE_SIZE, ptr);	\
@@ -717,6 +766,9 @@ static void compute_parity(struct stripe
 	int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
 	void *ptr[MAX_XOR_BLOCKS];
 	struct bio *chosen;
+	struct page *page;
+	sector_t sector;
+	int zerocopy = 1;
 
 	PRINTK("compute_parity, stripe %llu, method %d\n",
 		(unsigned long long)sh->sector, method);
@@ -743,13 +794,17 @@ static void compute_parity(struct stripe
 	case RECONSTRUCT_WRITE:
 		memset(ptr[0], 0, STRIPE_SIZE);
-		for (i= disks; i-- ;)
+		for (i= disks; i-- ;) {
+			if (i != pd_idx && !sh->dev[i].towrite)
+				zerocopy = 0;
 			if (i!=pd_idx && sh->dev[i].towrite) {
 				chosen = sh->dev[i].towrite;
 				sh->dev[i].towrite = NULL;
 				if (sh->dev[i].written) BUG();
 				sh->dev[i].written = chosen;
 			}
+		}
 		break;
@@ -759,34 +814,62 @@ static void compute_parity(struct stripe
-	for (i = disks; i--;)
-		if (sh->dev[i].written) {
-			sector_t sector = sh->dev[i].sector;
-			struct bio *wbi = sh->dev[i].written;
-			while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-				copy_data(1, wbi, sh->dev[i].page, sector);
-				wbi = r5_next_bio(wbi, sector);
-			}
+	for (i = disks; i--;) {
+		struct bio *wbi = sh->dev[i].written;
+
+		if (!wbi)
+			continue;
+
+		sector = sh->dev[i].sector;
+		set_bit(R5_LOCKED, &sh->dev[i].flags);
+		BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+		/* check if it's covered by a single page
+		 * and the whole stripe is written at once;
+		 * in this case we can avoid memcpy() */
+		if (zerocopy && wbi && wbi->bi_next == NULL &&
+		    test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+			page = zero_copy_data(wbi, sector);
+			BUG_ON(PageHighMem(page));
+			sh->dev[i].req.bi_io_vec[0].bv_page = page;
+			set_bit(R5_Direct, &sh->dev[i].flags);
+			clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+			continue;
+		}
 
-			set_bit(R5_LOCKED, &sh->dev[i].flags);
-			set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		set_bit(R5_UPTODATE, &sh->dev[i].flags);
+		while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+			copy_data(1, wbi, sh->dev[i].page, sector);
+			wbi = r5_next_bio(wbi, sector);
 		}
+	}
 
 	switch(method) {
 	case RECONSTRUCT_WRITE:
-		for (i=disks; i--;)
-			if (i != pd_idx) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
-			}
+		for (i=disks; i--;) {
+			if (i == pd_idx)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+			ptr[count++] = page_address(page);
+			check_xor();
+		}
 		break;
 	case READ_MODIFY_WRITE:
-		for (i = disks; i--;)
-			if (sh->dev[i].written) {
-				ptr[count++] = page_address(sh->dev[i].page);
-				check_xor();
-			}
+		for (i = disks; i--;) {
+			if (!sh->dev[i].written)
+				continue;
+			if (test_bit(R5_Direct, &sh->dev[i].flags))
+				page = sh->dev[i].req.bi_io_vec[0].bv_page;
+			else
+				page = sh->dev[i].page;
+			ptr[count++] = page_address(page);
+			check_xor();
+		}
+		break;
 	}
 	xor_block(count, STRIPE_SIZE, ptr);
@@ -1012,7 +1094,7 @@ static void handle_stripe(struct stripe_
 		dev = &sh->dev[sh->pd_idx];
 		if ( written &&
 		     ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-			test_bit(R5_UPTODATE, &dev->flags))
+			(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Direct, &dev->flags)))
 		       || (failed == 1 && failed_num == sh->pd_idx))
 	/* any written block on an uptodate or failed drive can be returned.
@@ -1023,13 +1105,16 @@ static void handle_stripe(struct stripe_
 		if (sh->dev[i].written) {
 			dev = &sh->dev[i];
 			if (!test_bit(R5_LOCKED, &dev->flags) &&
-			    test_bit(R5_UPTODATE, &dev->flags) ) {
+			    (test_bit(R5_UPTODATE, &dev->flags) ||
+			     test_bit(R5_Direct, &dev->flags)) ) {
 				/* We can return any write requests */
 				struct bio *wbi, *wbi2;
 				PRINTK("Return write for disc %d\n", i);
 				spin_lock_irq(&conf->device_lock);
 				wbi = dev->written;
 				dev->written = NULL;
+				if (test_bit(R5_Direct, &dev->flags))
+					clear_bit(R5_Direct, &dev->flags);
 				while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (--wbi->bi_phys_segments == 0) {
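
A note on the design, as visible in the patch: the direct path is attempted only
when the whole stripe unit is overwritten by a single bio (R5_OVERWRITE set and
bi_next == NULL), and the donor page must not be a highmem page (hence the
BUG_ON(PageHighMem(page))), because parity computation accesses it through
page_address(). While a device is in this mode, R5_Direct is set, R5_UPTODATE is
cleared so the stale cache page is not treated as valid data, and the request's
bi_io_vec points at the caller's page; the write-completion handler restores the
original cache page pointer, and handle_stripe clears R5_Direct once the write
requests are returned.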