1 diff -pur linux-2.6.18-92.1.22.orig/drivers/md/raid5.c linux-2.6.18-92.1.22/drivers/md/raid5.c
2 --- linux-2.6.18-92.1.22.orig/drivers/md/raid5.c 2009-02-10 13:47:54.000000000 +0800
3 +++ linux-2.6.18-92.1.22/drivers/md/raid5.c 2009-02-10 14:44:24.000000000 +0800
4 @@ -633,6 +633,9 @@ static int raid5_end_read_request(struct
5 clear_buffer_uptodate(bh);
8 + /* A read racing with a Direct (zero-copy) write is allowable */
9 + /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
10 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
11 clear_bit(R5_LOCKED, &sh->dev[i].flags);
12 set_bit(STRIPE_HANDLE, &sh->state);
14 @@ -671,6 +674,10 @@ static int raid5_end_write_request (stru
16 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
18 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
19 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
20 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
22 clear_bit(R5_LOCKED, &sh->dev[i].flags);
23 set_bit(STRIPE_HANDLE, &sh->state);
24 __release_stripe(conf, sh);
25 @@ -911,7 +918,27 @@ static sector_t compute_blocknr(struct s
29 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
31 + sector_t bi_sector = bio->bi_sector;
32 + struct page *page = NULL;
33 + struct bio_vec *bvl;
36 + bio_for_each_segment(bvl, bio, i) {
37 + if (sector == bi_sector)
38 + page = bio_iovec_idx(bio, i)->bv_page;
39 + bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
40 + if (bi_sector >= sector + STRIPE_SECTORS) {
41 + /* check if the stripe is covered by one page */
42 + if (page == bio_iovec_idx(bio, i)->bv_page &&
52 * Copy data between a page in the stripe cache, and one or more bion
53 @@ -1003,8 +1030,9 @@ static void compute_parity5(struct strip
55 raid5_conf_t *conf = sh->raid_conf;
56 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
57 - void *ptr[MAX_XOR_BLOCKS];
58 + void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
62 PRINTK("compute_parity5, stripe %llu, method %d\n",
63 (unsigned long long)sh->sector, method);
64 @@ -1054,34 +1082,92 @@ static void compute_parity5(struct strip
68 - for (i = disks; i--;)
69 - if (sh->dev[i].written) {
70 - sector_t sector = sh->dev[i].sector;
71 - struct bio *wbi = sh->dev[i].written;
72 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
73 - copy_data(1, wbi, sh->dev[i].page, sector);
74 - wbi = r5_next_bio(wbi, sector);
75 + for (i = disks; i--;) {
76 + struct r5dev *dev = &sh->dev[i];
77 + struct bio *wbi = dev->written;
83 + sector = dev->sector;
84 + set_bit(R5_LOCKED, &sh->dev[i].flags);
85 + BUG_ON(test_bit(R5_Direct, &dev->flags));
87 + /* Check whether the write is covered by a single page
88 + * and the whole stripe is written at once;
89 + * in that case we can avoid memcpy() */
90 + if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
91 + test_bit(R5_Insync, &dev->flags)) {
92 + page = zero_copy_data(wbi, sector);
94 + atomic_inc(&conf->writes_zcopy);
95 + /* The pointer must be restored whenever the LOCKED
97 + dev->req.bi_io_vec[0].bv_page = page;
98 + set_bit(R5_Direct, &dev->flags);
99 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
100 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
105 - set_bit(R5_LOCKED, &sh->dev[i].flags);
106 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
107 + /* do copy write */
108 + atomic_inc(&conf->writes_copied);
109 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
110 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
111 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
112 + copy_data(1, wbi, sh->dev[i].page, sector);
113 + wbi = r5_next_bio(wbi, sector);
119 case RECONSTRUCT_WRITE:
121 - for (i=disks; i--;)
123 - ptr[count++] = page_address(sh->dev[i].page);
125 + for (i=disks; i--;) {
128 + if (test_bit(R5_Direct, &sh->dev[i].flags))
129 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
131 + page = sh->dev[i].page;
133 + /* have to compute the parity immediately for
134 + * a highmem page. it would happen for zerocopy. -jay
136 + if (PageHighMem(page)) {
137 + h_ptr[1] = kmap_atomic(page, KM_USER0);
138 + xor_block(2, STRIPE_SIZE, h_ptr);
139 + kunmap_atomic(page, KM_USER0);
141 + ptr[count++] = page_address(page);
146 case READ_MODIFY_WRITE:
147 - for (i = disks; i--;)
148 - if (sh->dev[i].written) {
149 - ptr[count++] = page_address(sh->dev[i].page);
151 + for (i = disks; i--;) {
152 + if (!sh->dev[i].written)
154 + if (test_bit(R5_Direct, &sh->dev[i].flags))
155 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
157 + page = sh->dev[i].page;
159 + /* have to compute the parity immediately for
160 + * a highmem page. it would happen for zerocopy. -jay
162 + if (PageHighMem(page)) {
163 + h_ptr[1] = kmap_atomic(page, KM_USER0);
164 + xor_block(2, STRIPE_SIZE, h_ptr);
165 + kunmap_atomic(page, KM_USER0);
167 + ptr[count++] = page_address(page);
173 xor_block(count, STRIPE_SIZE, ptr);
174 @@ -1098,6 +1184,7 @@ static void compute_parity6(struct strip
175 raid6_conf_t *conf = sh->raid_conf;
176 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
179 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
182 @@ -1127,18 +1214,49 @@ static void compute_parity6(struct strip
183 BUG(); /* Not implemented yet */
186 - for (i = disks; i--;)
187 - if (sh->dev[i].written) {
188 - sector_t sector = sh->dev[i].sector;
189 - struct bio *wbi = sh->dev[i].written;
190 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
191 - copy_data(1, wbi, sh->dev[i].page, sector);
192 - wbi = r5_next_bio(wbi, sector);
193 + for (i = disks; i--;) {
194 + struct r5dev *dev = &sh->dev[i];
195 + struct bio *wbi = dev->written;
201 + sector = sh->dev[i].sector;
202 + set_bit(R5_LOCKED, &sh->dev[i].flags);
203 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
205 + /* check if it's covered by a single page
206 + * and whole stripe is written at once.
207 + * in this case we can avoid memcpy() */
208 + if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
209 + test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
210 + page = zero_copy_data(wbi, sector);
211 + /* We don't do zero-copy on a HighMem page. RAID6 tends
212 + * to prepare all of the pages' contents to be accessible
213 + * before computing the PQ parity. To support HighMem
214 + * pages as well, we would have to modify the gen_syndrome()
215 + * algorithm. -jay */
216 + if (page && !PageHighMem(page)) {
217 + atomic_inc(&conf->writes_zcopy);
218 + /* The pointer must be restored whenever the LOCKED
220 + sh->dev[i].req.bi_io_vec[0].bv_page = page;
221 + set_bit(R5_Direct, &sh->dev[i].flags);
222 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
223 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
228 - set_bit(R5_LOCKED, &sh->dev[i].flags);
229 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
230 + atomic_inc(&conf->writes_copied);
231 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
232 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
233 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
234 + copy_data(1, wbi, sh->dev[i].page, sector);
235 + wbi = r5_next_bio(wbi, sector);
240 // case RECONSTRUCT_WRITE:
241 @@ -1149,8 +1267,12 @@ static void compute_parity6(struct strip
245 - ptrs[count++] = page_address(sh->dev[i].page);
246 - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
247 + if (test_bit(R5_Direct, &sh->dev[i].flags))
248 + ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
250 + ptrs[count++] = page_address(sh->dev[i].page);
251 + if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
252 + !test_bit(R5_Direct, &sh->dev[i].flags))
253 printk("block %d/%d not uptodate on parity calc\n", i,count);
254 i = raid6_next_disk(i, disks);
255 } while ( i != d0_idx );
256 @@ -1599,7 +1721,8 @@ static void handle_stripe5(struct stripe
257 if (sh->dev[i].written) {
259 if (!test_bit(R5_LOCKED, &dev->flags) &&
260 - test_bit(R5_UPTODATE, &dev->flags) ) {
261 + (test_bit(R5_UPTODATE, &dev->flags) ||
262 + test_bit(R5_Direct, &dev->flags)) ) {
263 /* We can return any write requests */
264 struct bio *wbi, *wbi2;
266 @@ -1607,6 +1730,7 @@ static void handle_stripe5(struct stripe
267 spin_lock_irq(&conf->device_lock);
270 + clear_bit(R5_Direct, &dev->flags);
271 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
272 wbi2 = r5_next_bio(wbi, dev->sector);
273 if (--wbi->bi_phys_segments == 0) {
274 @@ -1970,6 +2094,15 @@ static void handle_stripe5(struct stripe
275 set_bit(STRIPE_DEGRADED, &sh->state);
276 PRINTK("skip op %ld on disc %d for sector %llu\n",
277 bi->bi_rw, i, (unsigned long long)sh->sector);
279 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
280 + /* Restore the page pointer of req; otherwise
281 + * no read would be permitted on this stripe,
282 + * which is not what we want. -jay */
283 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
284 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
287 clear_bit(R5_LOCKED, &sh->dev[i].flags);
288 set_bit(STRIPE_HANDLE, &sh->state);
290 @@ -2175,7 +2308,8 @@ static void handle_stripe6(struct stripe
291 if (sh->dev[i].written) {
293 if (!test_bit(R5_LOCKED, &dev->flags) &&
294 - test_bit(R5_UPTODATE, &dev->flags) ) {
295 + (test_bit(R5_UPTODATE, &dev->flags) ||
296 + test_bit(R5_Direct, &dev->flags)) ) {
297 /* We can return any write requests */
299 struct bio *wbi, *wbi2;
300 @@ -2184,6 +2318,7 @@ static void handle_stripe6(struct stripe
301 spin_lock_irq(&conf->device_lock);
304 + clear_bit(R5_Direct, &dev->flags);
305 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
306 wbi2 = r5_next_bio(wbi, dev->sector);
307 if (--wbi->bi_phys_segments == 0) {
308 @@ -2535,6 +2670,15 @@ static void handle_stripe6(struct stripe
309 set_bit(STRIPE_DEGRADED, &sh->state);
310 PRINTK("skip op %ld on disc %d for sector %llu\n",
311 bi->bi_rw, i, (unsigned long long)sh->sector);
313 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
314 + /* Restore the page pointer of req; otherwise
315 + * no read would be permitted on this stripe,
316 + * which is not what we want. -jay */
317 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
318 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
321 clear_bit(R5_LOCKED, &sh->dev[i].flags);
322 set_bit(STRIPE_HANDLE, &sh->state);
324 @@ -3456,6 +3600,9 @@ static int run(mddev_t *mddev)
325 mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
326 mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
328 + /* raid5 device is able to do zcopy right now. */
329 + mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
334 @@ -3542,9 +3689,11 @@ static void status (struct seq_file *seq
335 atomic_read(&conf->handled_in_raid5d),
336 atomic_read(&conf->out_of_stripes),
337 atomic_read(&conf->handle_called));
338 - seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
339 + seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
340 atomic_read(&conf->reads_for_rmw),
341 - atomic_read(&conf->reads_for_rcw));
342 + atomic_read(&conf->reads_for_rcw),
343 + atomic_read(&conf->writes_zcopy),
344 + atomic_read(&conf->writes_copied));
345 seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
346 atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
347 atomic_read(&conf->active_stripes),
348 diff -pur linux-2.6.18-92.1.22.orig/include/linux/backing-dev.h linux-2.6.18-92.1.22/include/linux/backing-dev.h
349 --- linux-2.6.18-92.1.22.orig/include/linux/backing-dev.h 2009-02-10 13:47:54.000000000 +0800
350 +++ linux-2.6.18-92.1.22/include/linux/backing-dev.h 2009-02-10 14:44:14.000000000 +0800
351 @@ -48,6 +48,7 @@ struct backing_dev_info {
352 #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
353 #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
354 #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
355 +#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */
356 #define BDI_CAP_VMFLAGS \
357 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
359 @@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
360 #define bdi_cap_account_dirty(bdi) \
361 (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
363 +#define bdi_cap_page_constant_write(bdi) \
364 + ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
366 #define mapping_cap_writeback_dirty(mapping) \
367 bdi_cap_writeback_dirty((mapping)->backing_dev_info)
369 #define mapping_cap_account_dirty(mapping) \
370 bdi_cap_account_dirty((mapping)->backing_dev_info)
372 +#define mapping_cap_page_constant_write(mapping) \
373 + bdi_cap_page_constant_write((mapping)->backing_dev_info)
377 #endif /* _LINUX_BACKING_DEV_H */
378 diff -pur linux-2.6.18-92.1.22.orig/include/linux/page-flags.h linux-2.6.18-92.1.22/include/linux/page-flags.h
379 --- linux-2.6.18-92.1.22.orig/include/linux/page-flags.h 2009-02-10 13:47:54.000000000 +0800
380 +++ linux-2.6.18-92.1.22/include/linux/page-flags.h 2009-02-10 14:44:14.000000000 +0800
382 #define PG_reclaim 17 /* To be reclaimed asap */
383 #define PG_nosave_free 18 /* Free, should not be written */
384 #define PG_buddy 19 /* Page is free, on buddy lists */
385 +#define PG_constant 20 /* To mark if the page is constant */
386 #define PG_xpmem 27 /* Testing for xpmem. */
388 /* PG_owner_priv_1 users should have descriptive aliases */
391 struct page; /* forward declaration */
393 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
394 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
395 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page->flags))
396 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
398 +extern int set_page_constant(struct page *page);
399 +extern void clear_page_constant(struct page *);
401 int test_clear_page_dirty(struct page *page);
402 int test_clear_page_writeback(struct page *page);
403 int test_set_page_writeback(struct page *page);
404 diff -pur linux-2.6.18-92.1.22.orig/include/linux/raid/raid5.h linux-2.6.18-92.1.22/include/linux/raid/raid5.h
405 --- linux-2.6.18-92.1.22.orig/include/linux/raid/raid5.h 2009-02-10 13:47:54.000000000 +0800
406 +++ linux-2.6.18-92.1.22/include/linux/raid/raid5.h 2009-02-10 14:44:14.000000000 +0800
407 @@ -156,8 +156,9 @@ struct stripe_head {
408 #define R5_Overlap 7 /* There is a pending overlapping request on this block */
409 #define R5_ReadError 8 /* seen a read error here recently */
410 #define R5_ReWrite 9 /* have tried to over-write the readerror */
412 #define R5_Expanded 10 /* This block now has post-expand data */
413 +#define R5_Direct 11 /* Use the pages in bio to do the write directly. */
418 diff -pur linux-2.6.18-92.1.22.orig/mm/filemap.c linux-2.6.18-92.1.22/mm/filemap.c
419 --- linux-2.6.18-92.1.22.orig/mm/filemap.c 2009-02-10 13:47:54.000000000 +0800
420 +++ linux-2.6.18-92.1.22/mm/filemap.c 2009-02-10 14:44:14.000000000 +0800
422 #include <linux/security.h>
423 #include <linux/syscalls.h>
424 #include <linux/cpuset.h>
425 +#include <linux/rmap.h>
427 #include "internal.h"
429 @@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
430 if (!test_clear_page_writeback(page))
433 + clear_page_constant(page);
434 smp_mb__after_clear_bit();
435 wake_up_page(page, PG_writeback);
437 EXPORT_SYMBOL(end_page_writeback);
439 +/* Mark a page as constant: `constant' means any write to this page will
440 + * be blocked until clear_page_constant() is called.
441 + * The page lock must be held by the caller.
443 +int set_page_constant(struct page *page)
445 + BUG_ON(!PageLocked(page));
447 + /* If it's an anonymous page and haven't been added to swap cache,
448 + * return directly because we have no way to swap this page.
450 + if (page_mapping(page) == NULL)
453 + BUG_ON(!PageUptodate(page));
455 + /* I have to clear page uptodate before trying to remove
456 + * it from user's page table because otherwise, the page may be
457 + * reinstalled by a page access which happens between try_to_unmap()
458 + * and ClearPageUptodate(). -jay
460 + ClearPageUptodate(page);
461 + if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
462 + SetPageUptodate(page);
465 + SetPageConstant(page);
466 + return SWAP_SUCCESS;
469 +void clear_page_constant(struct page *page)
471 + if (PageConstant(page)) {
472 + BUG_ON(!PageLocked(page));
473 + BUG_ON(PageUptodate(page));
474 + ClearPageConstant(page);
475 + SetPageUptodate(page);
479 +EXPORT_SYMBOL(set_page_constant);
480 +EXPORT_SYMBOL(clear_page_constant);
483 * __lock_page - get a lock on the page, assuming we need to sleep to get it
484 * @page: the page to lock