1 diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c
2 --- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 19:09:20.000000000 +0800
3 +++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:09:32.000000000 +0800
4 @@ -633,6 +633,7 @@ static int raid5_end_read_request(struct
5 clear_buffer_uptodate(bh);
8 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
9 clear_bit(R5_LOCKED, &sh->dev[i].flags);
10 set_bit(STRIPE_HANDLE, &sh->state);
12 @@ -671,6 +672,10 @@ static int raid5_end_write_request (stru
14 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
16 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
17 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
18 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
20 clear_bit(R5_LOCKED, &sh->dev[i].flags);
21 set_bit(STRIPE_HANDLE, &sh->state);
22 __release_stripe(conf, sh);
23 @@ -911,7 +916,27 @@ static sector_t compute_blocknr(struct s
27 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
29 + sector_t bi_sector = bio->bi_sector;
30 + struct page *page = NULL;
31 + struct bio_vec *bvl;
34 + bio_for_each_segment(bvl, bio, i) {
35 + if (sector == bi_sector)
36 + page = bio_iovec_idx(bio, i)->bv_page;
37 + bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
38 + if (bi_sector >= sector + STRIPE_SECTORS) {
39 + /* check if the stripe is covered by one page */
40 + if (page == bio_iovec_idx(bio, i)->bv_page &&
50 * Copy data between a page in the stripe cache, and one or more bion
51 @@ -1003,8 +1028,9 @@ static void compute_parity5(struct strip
53 raid5_conf_t *conf = sh->raid_conf;
54 int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
55 - void *ptr[MAX_XOR_BLOCKS];
56 + void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
60 PRINTK("compute_parity5, stripe %llu, method %d\n",
61 (unsigned long long)sh->sector, method);
62 @@ -1054,34 +1080,90 @@ static void compute_parity5(struct strip
66 - for (i = disks; i--;)
67 - if (sh->dev[i].written) {
68 - sector_t sector = sh->dev[i].sector;
69 - struct bio *wbi = sh->dev[i].written;
70 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
71 - copy_data(1, wbi, sh->dev[i].page, sector);
72 - wbi = r5_next_bio(wbi, sector);
73 + for (i = disks; i--;) {
74 + struct r5dev *dev = &sh->dev[i];
75 + struct bio *wbi = dev->written;
81 + sector = dev->sector;
82 + set_bit(R5_LOCKED, &sh->dev[i].flags);
83 + BUG_ON(test_bit(R5_Direct, &dev->flags));
85 + /* check if it's covered by a single page
86 + * and whole stripe is written at once.
87 + * in this case we can avoid memcpy() */
88 + if (!wbi->bi_next && test_bit(R5_OVERWRITE, &dev->flags) &&
89 + test_bit(R5_Insync, &dev->flags)) {
90 + page = zero_copy_data(wbi, sector);
92 + atomic_inc(&conf->writes_zcopy);
93 + dev->req.bi_io_vec[0].bv_page = page;
94 + set_bit(R5_Direct, &dev->flags);
95 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
96 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
101 - set_bit(R5_LOCKED, &sh->dev[i].flags);
102 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
103 + /* do copy write */
104 + atomic_inc(&conf->writes_copied);
105 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
106 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
107 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
108 + copy_data(1, wbi, sh->dev[i].page, sector);
109 + wbi = r5_next_bio(wbi, sector);
115 case RECONSTRUCT_WRITE:
117 - for (i=disks; i--;)
119 - ptr[count++] = page_address(sh->dev[i].page);
121 + for (i=disks; i--;) {
124 + if (test_bit(R5_Direct, &sh->dev[i].flags))
125 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
127 + page = sh->dev[i].page;
129 + /* have to compute the parity immediately for
130 + * a highmem page. it would happen for zerocopy. -jay
132 + if (PageHighMem(page)) {
133 + h_ptr[1] = kmap_atomic(page, KM_USER0);
134 + xor_block(2, STRIPE_SIZE, h_ptr);
135 + kunmap_atomic(page, KM_USER0);
137 + ptr[count++] = page_address(page);
142 case READ_MODIFY_WRITE:
143 - for (i = disks; i--;)
144 - if (sh->dev[i].written) {
145 - ptr[count++] = page_address(sh->dev[i].page);
147 + for (i = disks; i--;) {
148 + if (!sh->dev[i].written)
150 + if (test_bit(R5_Direct, &sh->dev[i].flags))
151 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
153 + page = sh->dev[i].page;
155 + /* have to compute the parity immediately for
156 + * a highmem page. it would happen for zerocopy. -jay
158 + if (PageHighMem(page)) {
159 + h_ptr[1] = kmap_atomic(page, KM_USER0);
160 + xor_block(2, STRIPE_SIZE, h_ptr);
161 + kunmap_atomic(page, KM_USER0);
163 + ptr[count++] = page_address(page);
169 xor_block(count, STRIPE_SIZE, ptr);
170 @@ -1098,6 +1180,7 @@ static void compute_parity6(struct strip
171 raid6_conf_t *conf = sh->raid_conf;
172 int i, pd_idx = sh->pd_idx, qd_idx, d0_idx, disks = conf->raid_disks, count;
175 /**** FIX THIS: This could be very bad if disks is close to 256 ****/
178 @@ -1127,18 +1210,47 @@ static void compute_parity6(struct strip
179 BUG(); /* Not implemented yet */
182 - for (i = disks; i--;)
183 - if (sh->dev[i].written) {
184 - sector_t sector = sh->dev[i].sector;
185 - struct bio *wbi = sh->dev[i].written;
186 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
187 - copy_data(1, wbi, sh->dev[i].page, sector);
188 - wbi = r5_next_bio(wbi, sector);
189 + for (i = disks; i--;) {
190 + struct r5dev *dev = &sh->dev[i];
191 + struct bio *wbi = dev->written;
197 + sector = sh->dev[i].sector;
198 + set_bit(R5_LOCKED, &sh->dev[i].flags);
199 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
201 + /* check if it's covered by a single page
202 + * and whole stripe is written at once.
203 + * in this case we can avoid memcpy() */
204 + if (!wbi->bi_next && test_bit(R5_Insync, &sh->dev[i].flags) &&
205 + test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
206 + page = zero_copy_data(wbi, sector);
207 + /* we don't do zerocopy on a HighMem page. Raid6 tends
208 + * to prepare all of the pages' content to be accessed
209 + * before computing PQ parity. If we need to support HighMem
210 + * page also, we have to modify the gen_syndrome()
211 + * algorithm. -jay */
212 + if (page && !PageHighMem(page)) {
213 + atomic_inc(&conf->writes_zcopy);
214 + sh->dev[i].req.bi_io_vec[0].bv_page = page;
215 + set_bit(R5_Direct, &sh->dev[i].flags);
216 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
217 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
222 - set_bit(R5_LOCKED, &sh->dev[i].flags);
223 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
224 + atomic_inc(&conf->writes_copied);
225 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
226 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
227 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
228 + copy_data(1, wbi, sh->dev[i].page, sector);
229 + wbi = r5_next_bio(wbi, sector);
234 // case RECONSTRUCT_WRITE:
235 @@ -1149,8 +1261,12 @@ static void compute_parity6(struct strip
239 - ptrs[count++] = page_address(sh->dev[i].page);
240 - if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags))
241 + if (test_bit(R5_Direct, &sh->dev[i].flags))
242 + ptrs[count++] = page_address(sh->dev[i].req.bi_io_vec[0].bv_page);
244 + ptrs[count++] = page_address(sh->dev[i].page);
245 + if (count <= disks-2 && !test_bit(R5_UPTODATE, &sh->dev[i].flags) &&
246 + !test_bit(R5_Direct, &sh->dev[i].flags))
247 printk("block %d/%d not uptodate on parity calc\n", i,count);
248 i = raid6_next_disk(i, disks);
249 } while ( i != d0_idx );
250 @@ -1597,7 +1713,8 @@ static void handle_stripe5(struct stripe
251 if (sh->dev[i].written) {
253 if (!test_bit(R5_LOCKED, &dev->flags) &&
254 - test_bit(R5_UPTODATE, &dev->flags) ) {
255 + (test_bit(R5_UPTODATE, &dev->flags) ||
256 + test_bit(R5_Direct, &dev->flags)) ) {
257 /* We can return any write requests */
258 struct bio *wbi, *wbi2;
260 @@ -1605,6 +1722,7 @@ static void handle_stripe5(struct stripe
261 spin_lock_irq(&conf->device_lock);
264 + clear_bit(R5_Direct, &dev->flags);
265 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
266 wbi2 = r5_next_bio(wbi, dev->sector);
267 if (--wbi->bi_phys_segments == 0) {
268 @@ -2173,7 +2291,8 @@ static void handle_stripe6(struct stripe
269 if (sh->dev[i].written) {
271 if (!test_bit(R5_LOCKED, &dev->flags) &&
272 - test_bit(R5_UPTODATE, &dev->flags) ) {
273 + (test_bit(R5_UPTODATE, &dev->flags) ||
274 + test_bit(R5_Direct, &dev->flags)) ) {
275 /* We can return any write requests */
277 struct bio *wbi, *wbi2;
278 @@ -2182,6 +2301,7 @@ static void handle_stripe6(struct stripe
279 spin_lock_irq(&conf->device_lock);
282 + clear_bit(R5_Direct, &dev->flags);
283 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
284 wbi2 = r5_next_bio(wbi, dev->sector);
285 if (--wbi->bi_phys_segments == 0) {
286 @@ -3450,6 +3570,9 @@ static int run(mddev_t *mddev)
287 mddev->queue->max_phys_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;
288 mddev->queue->max_hw_segments = conf->chunk_size * conf->previous_raid_disks >> PAGE_SHIFT;;
290 + /* raid5 device is able to do zcopy right now. */
291 + mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONSTANT_WRITE;
296 @@ -3536,9 +3659,11 @@ static void status (struct seq_file *seq
297 atomic_read(&conf->handled_in_raid5d),
298 atomic_read(&conf->out_of_stripes),
299 atomic_read(&conf->handle_called));
300 - seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
301 + seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
302 atomic_read(&conf->reads_for_rmw),
303 - atomic_read(&conf->reads_for_rcw));
304 + atomic_read(&conf->reads_for_rcw),
305 + atomic_read(&conf->writes_zcopy),
306 + atomic_read(&conf->writes_copied));
307 seq_printf (seq, "\n\t\t%u delayed, %u bit delayed, %u active, queues: %u in, %u out\n",
308 atomic_read(&conf->delayed), atomic_read(&conf->bit_delayed),
309 atomic_read(&conf->active_stripes),
310 diff -pur linux-2.6.18-53.orig/include/linux/backing-dev.h linux-2.6.18-53/include/linux/backing-dev.h
311 --- linux-2.6.18-53.orig/include/linux/backing-dev.h 2007-12-28 14:49:26.000000000 +0800
312 +++ linux-2.6.18-53/include/linux/backing-dev.h 2007-12-28 19:09:32.000000000 +0800
313 @@ -48,6 +48,7 @@ struct backing_dev_info {
314 #define BDI_CAP_READ_MAP 0x00000010 /* Can be mapped for reading */
315 #define BDI_CAP_WRITE_MAP 0x00000020 /* Can be mapped for writing */
316 #define BDI_CAP_EXEC_MAP 0x00000040 /* Can be mapped for execution */
317 +#define BDI_CAP_PAGE_CONSTANT_WRITE 0x00000080 /* Zcopy write - for raid5 */
318 #define BDI_CAP_VMFLAGS \
319 (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP)
321 @@ -94,11 +95,18 @@ static inline int bdi_rw_congested(struc
322 #define bdi_cap_account_dirty(bdi) \
323 (!((bdi)->capabilities & BDI_CAP_NO_ACCT_DIRTY))
325 +#define bdi_cap_page_constant_write(bdi) \
326 + ((bdi)->capabilities & BDI_CAP_PAGE_CONSTANT_WRITE)
328 #define mapping_cap_writeback_dirty(mapping) \
329 bdi_cap_writeback_dirty((mapping)->backing_dev_info)
331 #define mapping_cap_account_dirty(mapping) \
332 bdi_cap_account_dirty((mapping)->backing_dev_info)
334 +#define mapping_cap_page_constant_write(mapping) \
335 + bdi_cap_page_constant_write((mapping)->backing_dev_info)
339 #endif /* _LINUX_BACKING_DEV_H */
340 diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/include/linux/page-flags.h
341 --- linux-2.6.18-53.orig/include/linux/page-flags.h 2007-12-28 14:49:26.000000000 +0800
342 +++ linux-2.6.18-53/include/linux/page-flags.h 2007-12-28 19:09:32.000000000 +0800
344 #define PG_reclaim 17 /* To be reclaimed asap */
345 #define PG_nosave_free 18 /* Free, should not be written */
346 #define PG_buddy 19 /* Page is free, on buddy lists */
347 +#define PG_constant 20 /* To mark if the page is constant */
349 /* PG_owner_priv_1 users should have descriptive aliases */
350 #define PG_checked PG_owner_priv_1 /* Used by some filesystems */
353 struct page; /* forward declaration */
355 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
356 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
357 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
358 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
360 +extern int set_page_constant(struct page *page);
361 +extern void clear_page_constant(struct page *);
363 int test_clear_page_dirty(struct page *page);
364 int test_clear_page_writeback(struct page *page);
365 int test_set_page_writeback(struct page *page);
366 diff -pur linux-2.6.18-53.orig/include/linux/raid/raid5.h linux-2.6.18-53/include/linux/raid/raid5.h
367 --- linux-2.6.18-53.orig/include/linux/raid/raid5.h 2007-12-28 18:55:24.000000000 +0800
368 +++ linux-2.6.18-53/include/linux/raid/raid5.h 2007-12-28 19:09:32.000000000 +0800
369 @@ -156,8 +156,9 @@ struct stripe_head {
370 #define R5_Overlap 7 /* There is a pending overlapping request on this block */
371 #define R5_ReadError 8 /* seen a read error here recently */
372 #define R5_ReWrite 9 /* have tried to over-write the readerror */
374 #define R5_Expanded 10 /* This block now has post-expand data */
375 +#define R5_Direct 11 /* Use the pages in bio to do the write directly. */
380 diff -pur linux-2.6.18-53.orig/mm/filemap.c linux-2.6.18-53/mm/filemap.c
381 --- linux-2.6.18-53.orig/mm/filemap.c 2007-12-28 14:49:26.000000000 +0800
382 +++ linux-2.6.18-53/mm/filemap.c 2007-12-28 19:09:32.000000000 +0800
384 #include <linux/security.h>
385 #include <linux/syscalls.h>
386 #include <linux/cpuset.h>
387 +#include <linux/rmap.h>
389 #include "internal.h"
391 @@ -566,11 +567,55 @@ void end_page_writeback(struct page *pag
392 if (!test_clear_page_writeback(page))
395 + clear_page_constant(page);
396 smp_mb__after_clear_bit();
397 wake_up_page(page, PG_writeback);
399 EXPORT_SYMBOL(end_page_writeback);
401 +/* Make a page to be constant, `constant' means any write to this page will
402 + * be blocked until clear_page_constant is called.
403 + * The page lock must be held.
405 +int set_page_constant(struct page *page)
407 + BUG_ON(!PageLocked(page));
409 + /* If it's an anonymous page and haven't been added to swap cache,
410 + * return directly because we have no way to swap this page.
412 + if (page_mapping(page) == NULL)
415 + BUG_ON(!PageUptodate(page));
417 + /* I have to clear page uptodate before trying to remove
418 + * it from user's page table because otherwise, the page may be
419 + * reinstalled by a page access which happens between try_to_unmap()
420 + * and ClearPageUptodate(). -jay
422 + ClearPageUptodate(page);
423 + if (page_mapped(page) && try_to_unmap(page, 0) != SWAP_SUCCESS) {
424 + SetPageUptodate(page);
427 + SetPageConstant(page);
428 + return SWAP_SUCCESS;
431 +void clear_page_constant(struct page *page)
433 + if (PageConstant(page)) {
434 + BUG_ON(!PageLocked(page));
435 + BUG_ON(PageUptodate(page));
436 + ClearPageConstant(page);
437 + SetPageUptodate(page);
441 +EXPORT_SYMBOL(set_page_constant);
442 +EXPORT_SYMBOL(clear_page_constant);
445 * __lock_page - get a lock on the page, assuming we need to sleep to get it
446 * @page: the page to lock