1 diff -pur linux-2.6.9-67.orig/drivers/md/raid5.c linux-2.6.9-67/drivers/md/raid5.c
2 --- linux-2.6.9-67.orig/drivers/md/raid5.c 2009-02-15 10:11:54.000000000 +0800
3 +++ linux-2.6.9-67/drivers/md/raid5.c 2009-02-15 10:22:51.000000000 +0800
4 @@ -412,6 +412,9 @@ static int raid5_end_read_request (struc
5 clear_buffer_uptodate(bh);
8 + /* A read racing with a Direct (zero-copy) write is allowable */
9 + /* BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)) */
10 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page != sh->dev[i].page);
11 clear_bit(R5_LOCKED, &sh->dev[i].flags);
12 set_bit(STRIPE_HANDLE, &sh->state);
14 @@ -450,6 +453,10 @@ static int raid5_end_write_request (stru
16 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
18 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
19 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
20 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
22 clear_bit(R5_LOCKED, &sh->dev[i].flags);
23 set_bit(STRIPE_HANDLE, &sh->state);
24 __release_stripe(conf, sh);
25 @@ -620,7 +627,27 @@ static sector_t compute_blocknr(struct s
29 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
31 + sector_t bi_sector = bio->bi_sector;
32 + struct page *page = NULL;
33 + struct bio_vec *bvl;
36 + bio_for_each_segment(bvl, bio, i) {
37 + if (sector == bi_sector)
38 + page = bio_iovec_idx(bio, i)->bv_page;
39 + bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
40 + if (bi_sector >= sector + STRIPE_SECTORS) {
41 + /* check if the stripe is covered by one page */
42 + if (page == bio_iovec_idx(bio, i)->bv_page &&
52 * Copy data between a page in the stripe cache, and one or more bion
53 @@ -716,8 +743,9 @@ static void compute_parity(struct stripe
55 raid5_conf_t *conf = sh->raid_conf;
56 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
57 - void *ptr[MAX_XOR_BLOCKS];
58 + void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
62 PRINTK("compute_parity, stripe %llu, method %d\n",
63 (unsigned long long)sh->sector, method);
64 @@ -744,13 +772,14 @@ static void compute_parity(struct stripe
66 case RECONSTRUCT_WRITE:
67 memset(ptr[0], 0, STRIPE_SIZE);
68 - for (i= disks; i-- ;)
69 + for (i= disks; i-- ;) {
70 if (i!=pd_idx && sh->dev[i].towrite) {
71 chosen = sh->dev[i].towrite;
72 sh->dev[i].towrite = NULL;
73 if (sh->dev[i].written) BUG();
74 sh->dev[i].written = chosen;
80 @@ -760,34 +789,90 @@ static void compute_parity(struct stripe
84 - for (i = disks; i--;)
85 - if (sh->dev[i].written) {
86 - sector_t sector = sh->dev[i].sector;
87 - struct bio *wbi = sh->dev[i].written;
88 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
89 - copy_data(1, wbi, sh->dev[i].page, sector);
90 - wbi = r5_next_bio(wbi, sector);
91 + for (i = disks; i--;) {
92 + struct bio *wbi = sh->dev[i].written;
98 + sector = sh->dev[i].sector;
99 + set_bit(R5_LOCKED, &sh->dev[i].flags);
100 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
102 + /* check if it's covered by a single page
103 + * and the whole stripe is written at once;
104 + * in this case we can avoid memcpy() */
105 + if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) &&
106 + test_bit(R5_Insync, &sh->dev[i].flags)) {
107 + page = zero_copy_data(wbi, sector);
109 + atomic_inc(&conf->writes_zcopy);
110 + /* The pointer must be restored whenever the LOCKED
112 + sh->dev[i].req.bi_io_vec[0].bv_page = page;
113 + set_bit(R5_Direct, &sh->dev[i].flags);
114 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
115 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
120 - set_bit(R5_LOCKED, &sh->dev[i].flags);
121 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
122 + atomic_inc(&conf->writes_copied);
123 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
124 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
125 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
126 + copy_data(1, wbi, sh->dev[i].page, sector);
127 + wbi = r5_next_bio(wbi, sector);
133 case RECONSTRUCT_WRITE:
135 - for (i=disks; i--;)
137 - ptr[count++] = page_address(sh->dev[i].page);
139 + for (i=disks; i--;) {
142 + if (test_bit(R5_Direct, &sh->dev[i].flags))
143 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
145 + page = sh->dev[i].page;
147 + /* have to compute the parity immediately for
148 + * a highmem page; this can happen with zerocopy. -jay
150 + if (PageHighMem(page)) {
151 + h_ptr[1] = kmap_atomic(page, KM_USER0);
152 + xor_block(2, STRIPE_SIZE, h_ptr);
153 + kunmap_atomic(page, KM_USER0);
155 + ptr[count++] = page_address(page);
160 case READ_MODIFY_WRITE:
161 - for (i = disks; i--;)
162 - if (sh->dev[i].written) {
163 - ptr[count++] = page_address(sh->dev[i].page);
165 + for (i = disks; i--;) {
166 + if (!sh->dev[i].written)
168 + if (test_bit(R5_Direct, &sh->dev[i].flags))
169 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
171 + page = sh->dev[i].page;
173 + /* have to compute the parity immediately for
174 + * a highmem page; this can happen with zerocopy. -jay
176 + if (PageHighMem(page)) {
177 + h_ptr[1] = kmap_atomic(page, KM_USER0);
178 + xor_block(2, STRIPE_SIZE, h_ptr);
179 + kunmap_atomic(page, KM_USER0);
181 + ptr[count++] = page_address(page);
187 xor_block(count, STRIPE_SIZE, ptr);
188 @@ -1061,13 +1146,15 @@ static void handle_stripe(struct stripe_
189 if (sh->dev[i].written) {
191 if (!test_bit(R5_LOCKED, &dev->flags) &&
192 - test_bit(R5_UPTODATE, &dev->flags) ) {
193 + (test_bit(R5_UPTODATE, &dev->flags) ||
194 + test_bit(R5_Direct, &dev->flags)) ) {
195 /* We can return any write requests */
196 struct bio *wbi, *wbi2;
197 PRINTK("Return write for disc %d\n", i);
198 spin_lock_irq(&conf->device_lock);
201 + clear_bit(R5_Direct, &dev->flags);
202 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
203 wbi2 = r5_next_bio(wbi, dev->sector);
204 if (--wbi->bi_phys_segments == 0) {
205 @@ -1337,6 +1424,15 @@ static void handle_stripe(struct stripe_
207 PRINTK("skip op %ld on disc %d for sector %llu\n",
208 bi->bi_rw, i, (unsigned long long)sh->sector);
210 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
211 + /* restore the page pointer of req; otherwise
212 + * no reads would be permitted on this stripe, which is
213 + * not what we want. -jay */
214 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
215 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
218 clear_bit(R5_LOCKED, &sh->dev[i].flags);
219 set_bit(STRIPE_HANDLE, &sh->state);
221 @@ -1835,6 +1931,7 @@ memory = conf->max_nr_stripes * (sizeof(
222 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
223 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
225 + mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
227 /* Ok, everything is just fine now */
228 mddev->array_size = mddev->size * (mddev->raid_disks - 1);
229 @@ -1922,9 +2019,11 @@ static void status (struct seq_file *seq
230 atomic_read(&conf->handled_in_raid5d),
231 atomic_read(&conf->out_of_stripes),
232 atomic_read(&conf->handle_called));
233 - seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
234 + seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
235 atomic_read(&conf->reads_for_rmw),
236 - atomic_read(&conf->reads_for_rcw));
237 + atomic_read(&conf->reads_for_rcw),
238 + atomic_read(&conf->writes_zcopy),
239 + atomic_read(&conf->writes_copied));
240 seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
241 atomic_read(&conf->delayed),
242 atomic_read(&conf->active_stripes),
243 diff -pur linux-2.6.9-67.orig/include/linux/backing-dev.h linux-2.6.9-67/include/linux/backing-dev.h
244 --- linux-2.6.9-67.orig/include/linux/backing-dev.h 2009-02-15 10:11:54.000000000 +0800
245 +++ linux-2.6.9-67/include/linux/backing-dev.h 2009-02-15 10:22:40.000000000 +0800
246 @@ -30,8 +30,11 @@ struct backing_dev_info {
247 void *congested_data; /* Pointer to aux data for congested func */
248 void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
249 void *unplug_io_data;
250 + unsigned int capabilities;
253 +#define BDI_CAP_PAGE_CONST_WRITE 0x00000001
255 extern struct backing_dev_info default_backing_dev_info;
256 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
258 @@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc
259 (1 << BDI_write_congested));
262 +#define mapping_cap_page_constant_write(mapping) \
263 + ((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE)
265 #endif /* _LINUX_BACKING_DEV_H */
266 diff -pur linux-2.6.9-67.orig/include/linux/page-flags.h linux-2.6.9-67/include/linux/page-flags.h
267 --- linux-2.6.9-67.orig/include/linux/page-flags.h 2009-02-15 10:11:54.000000000 +0800
268 +++ linux-2.6.9-67/include/linux/page-flags.h 2009-02-15 10:22:40.000000000 +0800
270 #define PG_swapcache 16 /* Swap page: swp_entry_t in private */
271 #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
272 #define PG_reclaim 18 /* To be reclaimed asap */
273 +#define PG_constant 19 /* To mark the page is constant */
277 @@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
278 #define PageSwapCache(page) 0
281 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
282 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
283 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
284 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
286 struct page; /* forward declaration */
288 int test_clear_page_dirty(struct page *page);
289 diff -pur linux-2.6.9-67.orig/include/linux/pagemap.h linux-2.6.9-67/include/linux/pagemap.h
290 --- linux-2.6.9-67.orig/include/linux/pagemap.h 2009-02-15 10:11:54.000000000 +0800
291 +++ linux-2.6.9-67/include/linux/pagemap.h 2009-02-15 10:22:40.000000000 +0800
292 @@ -191,6 +191,19 @@ static inline void wait_on_page_writebac
294 extern void end_page_writeback(struct page *page);
296 +extern int set_page_constant(struct page *page);
297 +extern void clear_page_constant(struct page *);
298 +static inline int set_page_constant_lock(struct page *page)
300 + BUG_ON(PageLocked(page));
302 + if (set_page_constant(page)) {
310 * Fault a userspace page into pagetables. Return non-zero on a fault.
312 diff -pur linux-2.6.9-67.orig/include/linux/raid/raid5.h linux-2.6.9-67/include/linux/raid/raid5.h
313 --- linux-2.6.9-67.orig/include/linux/raid/raid5.h 2009-02-15 10:11:54.000000000 +0800
314 +++ linux-2.6.9-67/include/linux/raid/raid5.h 2009-02-15 10:22:40.000000000 +0800
315 @@ -153,6 +153,7 @@ struct stripe_head {
316 #define R5_Wantread 4 /* want to schedule a read */
317 #define R5_Wantwrite 5
318 #define R5_Syncio 6 /* this io need to be accounted as resync io */
319 +#define R5_Direct 7 /* use page from passed bio to avoid memcpy */
323 @@ -234,6 +235,8 @@ struct raid5_private_data {
324 atomic_t out_of_stripes;
325 atomic_t reads_for_rmw;
326 atomic_t reads_for_rcw;
327 + atomic_t writes_zcopy;
328 + atomic_t writes_copied;
329 atomic_t handle_called;
331 atomic_t in_reqs_in_queue;
332 diff -pur linux-2.6.9-67.orig/mm/filemap.c linux-2.6.9-67/mm/filemap.c
333 --- linux-2.6.9-67.orig/mm/filemap.c 2009-02-15 10:11:55.000000000 +0800
334 +++ linux-2.6.9-67/mm/filemap.c 2009-02-15 10:22:40.000000000 +0800
336 #include <linux/pagevec.h>
337 #include <linux/blkdev.h>
338 #include <linux/security.h>
339 +#include <linux/rmap.h>
342 * This is needed for the following functions:
343 * - try_to_release_page
344 @@ -485,11 +487,52 @@ void end_page_writeback(struct page *pag
346 smp_mb__after_clear_bit();
348 + clear_page_constant(page);
352 EXPORT_SYMBOL(end_page_writeback);
354 +/* Mark a page in bio to be constant, page must be locked */
355 +int set_page_constant(struct page *page)
357 + BUG_ON(!PageLocked(page));
359 + /* If it's an anonymous page and hasn't been added to swap cache,
362 + if (PageAnon(page) && !PageSwapCache(page))
365 + BUG_ON(!PageUptodate(page));
367 + /* I have to clear page uptodate before trying to remove
368 + * it from user's page table because otherwise, the page may be
369 + * reinstalled by a page access which happens between try_to_unmap()
370 + * and ClearPageUptodate(). -jay
372 + ClearPageUptodate(page);
373 + if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) {
374 + SetPageUptodate(page);
377 + SetPageConstant(page);
381 +void clear_page_constant(struct page *page)
383 + if (PageConstant(page)) {
384 + BUG_ON(!PageLocked(page));
385 + BUG_ON(PageUptodate(page));
386 + ClearPageConstant(page);
387 + SetPageUptodate(page);
391 +EXPORT_SYMBOL(set_page_constant);
392 +EXPORT_SYMBOL(clear_page_constant);
395 * Get a lock on the page, assuming we need to sleep to get it.