1 diff -pru linux-2.6.9.orig/drivers/md/raid5.c linux-2.6.9/drivers/md/raid5.c
2 --- linux-2.6.9.orig/drivers/md/raid5.c 2007-07-09 02:43:33.000000000 -0600
3 +++ linux-2.6.9/drivers/md/raid5.c 2007-07-13 00:39:15.000000000 -0600
4 @@ -412,6 +412,7 @@ static int raid5_end_read_request (struc
5 clear_buffer_uptodate(bh);
8 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
9 clear_bit(R5_LOCKED, &sh->dev[i].flags);
10 set_bit(STRIPE_HANDLE, &sh->state);
12 @@ -450,6 +451,10 @@ static int raid5_end_write_request (stru
14 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
16 + if (test_bit(R5_Direct, &sh->dev[i].flags)) {
17 + BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
18 + sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
20 clear_bit(R5_LOCKED, &sh->dev[i].flags);
21 set_bit(STRIPE_HANDLE, &sh->state);
22 __release_stripe(conf, sh);
23 @@ -621,6 +626,25 @@ static sector_t compute_blocknr(struct s
27 +static struct page *zero_copy_data(struct bio *bio, sector_t sector)
29 + sector_t bi_sector = bio->bi_sector;
31 + struct bio_vec *bvl;
34 + bio_for_each_segment(bvl, bio, i) {
35 + if (sector > bi_sector) {
36 + bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
39 + BUG_ON(sector != bi_sector);
40 + page = bio_iovec_idx(bio, i)->bv_page;
41 + return PageConstant(page) ? page : NULL;
48 * Copy data between a page in the stripe cache, and one or more bion
49 @@ -716,8 +740,9 @@ static void compute_parity(struct stripe
51 raid5_conf_t *conf = sh->raid_conf;
52 int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
53 - void *ptr[MAX_XOR_BLOCKS];
54 + void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
58 PRINTK("compute_parity, stripe %llu, method %d\n",
59 (unsigned long long)sh->sector, method);
60 @@ -744,13 +769,14 @@ static void compute_parity(struct stripe
62 case RECONSTRUCT_WRITE:
63 memset(ptr[0], 0, STRIPE_SIZE);
64 - for (i= disks; i-- ;)
65 + for (i= disks; i-- ;) {
66 if (i!=pd_idx && sh->dev[i].towrite) {
67 chosen = sh->dev[i].towrite;
68 sh->dev[i].towrite = NULL;
69 if (sh->dev[i].written) BUG();
70 sh->dev[i].written = chosen;
76 @@ -760,34 +786,88 @@ static void compute_parity(struct stripe
80 - for (i = disks; i--;)
81 - if (sh->dev[i].written) {
82 - sector_t sector = sh->dev[i].sector;
83 - struct bio *wbi = sh->dev[i].written;
84 - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
85 - copy_data(1, wbi, sh->dev[i].page, sector);
86 - wbi = r5_next_bio(wbi, sector);
87 + for (i = disks; i--;) {
88 + struct bio *wbi = sh->dev[i].written;
94 + sector = sh->dev[i].sector;
95 + set_bit(R5_LOCKED, &sh->dev[i].flags);
96 + BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
98 + /* check if it's covered by a single page
99 + and whole stripe is written at once.
100 + * in this case we can avoid memcpy() */
101 + if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) &&
102 + test_bit(R5_Insync, &sh->dev[i].flags)) {
103 + page = zero_copy_data(wbi, sector);
105 + atomic_inc(&conf->writes_zcopy);
106 + sh->dev[i].req.bi_io_vec[0].bv_page = page;
107 + set_bit(R5_Direct, &sh->dev[i].flags);
108 + clear_bit(R5_UPTODATE, &sh->dev[i].flags);
109 + clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
114 - set_bit(R5_LOCKED, &sh->dev[i].flags);
115 - set_bit(R5_UPTODATE, &sh->dev[i].flags);
116 + atomic_inc(&conf->writes_copied);
117 + test_and_clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
118 + set_bit(R5_UPTODATE, &sh->dev[i].flags);
119 + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
120 + copy_data(1, wbi, sh->dev[i].page, sector);
121 + wbi = r5_next_bio(wbi, sector);
127 case RECONSTRUCT_WRITE:
129 - for (i=disks; i--;)
131 - ptr[count++] = page_address(sh->dev[i].page);
133 + for (i=disks; i--;) {
136 + if (test_bit(R5_Direct, &sh->dev[i].flags))
137 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
139 + page = sh->dev[i].page;
141 + /* have to compute the parity immediately for
142 + * a highmem page. it would happen for zerocopy. -jay
144 + if (PageHighMem(page)) {
145 + h_ptr[1] = kmap_atomic(page, KM_USER0);
146 + xor_block(2, STRIPE_SIZE, h_ptr);
147 + kunmap_atomic(h_ptr[1], KM_USER0);
149 + ptr[count++] = page_address(page);
154 case READ_MODIFY_WRITE:
155 - for (i = disks; i--;)
156 - if (sh->dev[i].written) {
157 - ptr[count++] = page_address(sh->dev[i].page);
159 + for (i = disks; i--;) {
160 + if (!sh->dev[i].written)
162 + if (test_bit(R5_Direct, &sh->dev[i].flags))
163 + page = sh->dev[i].req.bi_io_vec[0].bv_page;
165 + page = sh->dev[i].page;
167 + /* have to compute the parity immediately for
168 + * a highmem page. it would happen for zerocopy. -jay
170 + if (PageHighMem(page)) {
171 + h_ptr[1] = kmap_atomic(page, KM_USER0);
172 + xor_block(2, STRIPE_SIZE, h_ptr);
173 + kunmap_atomic(h_ptr[1], KM_USER0);
175 + ptr[count++] = page_address(page);
181 xor_block(count, STRIPE_SIZE, ptr);
182 @@ -1059,13 +1139,15 @@ static void handle_stripe(struct stripe_
183 if (sh->dev[i].written) {
185 if (!test_bit(R5_LOCKED, &dev->flags) &&
186 - test_bit(R5_UPTODATE, &dev->flags) ) {
187 + (test_bit(R5_UPTODATE, &dev->flags) ||
188 + test_bit(R5_Direct, &dev->flags)) ) {
189 /* We can return any write requests */
190 struct bio *wbi, *wbi2;
191 PRINTK("Return write for disc %d\n", i);
192 spin_lock_irq(&conf->device_lock);
195 + test_and_clear_bit(R5_Direct, &dev->flags);
196 while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
197 wbi2 = r5_next_bio(wbi, dev->sector);
198 if (--wbi->bi_phys_segments == 0) {
199 @@ -1831,6 +1913,7 @@ memory = conf->max_nr_stripes * (sizeof(
200 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
201 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
203 + mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
205 /* Ok, everything is just fine now */
206 mddev->array_size = mddev->size * (mddev->raid_disks - 1);
207 @@ -1918,9 +2001,11 @@ static void status (struct seq_file *seq
208 atomic_read(&conf->handled_in_raid5d),
209 atomic_read(&conf->out_of_stripes),
210 atomic_read(&conf->handle_called));
211 - seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
212 + seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
213 atomic_read(&conf->reads_for_rmw),
214 - atomic_read(&conf->reads_for_rcw));
215 + atomic_read(&conf->reads_for_rcw),
216 + atomic_read(&conf->writes_zcopy),
217 + atomic_read(&conf->writes_copied));
218 seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
219 atomic_read(&conf->delayed),
220 atomic_read(&conf->active_stripes),
221 diff -pru linux-2.6.9.orig/include/linux/backing-dev.h linux-2.6.9/include/linux/backing-dev.h
222 --- linux-2.6.9.orig/include/linux/backing-dev.h 2004-10-18 15:53:46.000000000 -0600
223 +++ linux-2.6.9/include/linux/backing-dev.h 2007-07-13 00:12:46.000000000 -0600
224 @@ -30,8 +30,11 @@ struct backing_dev_info {
225 void *congested_data; /* Pointer to aux data for congested func */
226 void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
227 void *unplug_io_data;
228 + unsigned int capabilities;
231 +#define BDI_CAP_PAGE_CONST_WRITE 0x00000001
233 extern struct backing_dev_info default_backing_dev_info;
234 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
236 @@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc
237 (1 << BDI_write_congested));
240 +#define mapping_cap_page_constant_write(mapping) \
241 + ((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE)
243 #endif /* _LINUX_BACKING_DEV_H */
244 diff -pru linux-2.6.9.orig/include/linux/page-flags.h linux-2.6.9/include/linux/page-flags.h
245 --- linux-2.6.9.orig/include/linux/page-flags.h 2004-10-18 15:54:39.000000000 -0600
246 +++ linux-2.6.9/include/linux/page-flags.h 2007-07-13 00:12:46.000000000 -0600
248 #define PG_swapcache 16 /* Swap page: swp_entry_t in private */
249 #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
250 #define PG_reclaim 18 /* To be reclaimed asap */
251 +#define PG_constant 19 /* To mark the page is constant */
255 @@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
256 #define PageSwapCache(page) 0
259 +#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
260 +#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
261 +#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
262 +#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
264 struct page; /* forward declaration */
266 int test_clear_page_dirty(struct page *page);
267 diff -pru linux-2.6.9.orig/include/linux/pagemap.h linux-2.6.9/include/linux/pagemap.h
268 --- linux-2.6.9.orig/include/linux/pagemap.h 2004-10-18 15:53:06.000000000 -0600
269 +++ linux-2.6.9/include/linux/pagemap.h 2007-07-13 00:12:46.000000000 -0600
270 @@ -191,6 +191,19 @@ static inline void wait_on_page_writebac
272 extern void end_page_writeback(struct page *page);
274 +extern int set_page_constant(struct page *page);
275 +extern void clear_page_constant(struct page *);
276 +static inline int set_page_constant_lock(struct page *page)
278 + BUG_ON(PageLocked(page));
280 + if (set_page_constant(page)) {
288 * Fault a userspace page into pagetables. Return non-zero on a fault.
290 diff -pru linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
291 --- linux-2.6.9.orig/include/linux/raid/raid5.h 2007-07-09 02:43:33.000000000 -0600
292 +++ linux-2.6.9/include/linux/raid/raid5.h 2007-07-13 00:39:15.000000000 -0600
293 @@ -153,6 +153,7 @@ struct stripe_head {
294 #define R5_Wantread 4 /* want to schedule a read */
295 #define R5_Wantwrite 5
296 #define R5_Syncio 6 /* this io need to be accounted as resync io */
297 +#define R5_Direct 7 /* use page from passed bio to avoid memcpy */
301 @@ -234,6 +235,8 @@ struct raid5_private_data {
302 atomic_t out_of_stripes;
303 atomic_t reads_for_rmw;
304 atomic_t reads_for_rcw;
305 + atomic_t writes_zcopy;
306 + atomic_t writes_copied;
307 atomic_t handle_called;
309 atomic_t in_reqs_in_queue;
310 diff -pru linux-2.6.9.orig/mm/filemap.c linux-2.6.9/mm/filemap.c
311 --- linux-2.6.9.orig/mm/filemap.c 2007-07-09 02:43:33.000000000 -0600
312 +++ linux-2.6.9/mm/filemap.c 2007-07-13 00:12:46.000000000 -0600
314 #include <linux/pagevec.h>
315 #include <linux/blkdev.h>
316 #include <linux/security.h>
317 +#include <linux/rmap.h>
320 * This is needed for the following functions:
321 * - try_to_release_page
322 @@ -486,11 +488,52 @@ void end_page_writeback(struct page *pag
324 smp_mb__after_clear_bit();
326 + clear_page_constant(page);
330 EXPORT_SYMBOL(end_page_writeback);
332 +/* Mark a page in bio to be constant, page must be locked */
333 +int set_page_constant(struct page *page)
335 + BUG_ON(!PageLocked(page));
337 + /* If it's an anonymous page and haven't been added to swap cache,
340 + if (PageAnon(page) && !PageSwapCache(page))
343 + BUG_ON(!PageUptodate(page));
345 + /* I have to clear page uptodate before trying to remove
346 + * it from user's page table because otherwise, the page may be
347 + * reinstalled by a page access which happens between try_to_unmap()
348 + * and ClearPageUptodate(). -jay
350 + ClearPageUptodate(page);
351 + if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) {
352 + SetPageUptodate(page);
355 + SetPageConstant(page);
359 +void clear_page_constant(struct page *page)
361 + if (PageConstant(page)) {
362 + BUG_ON(!PageLocked(page));
363 + BUG_ON(PageUptodate(page));
364 + ClearPageConstant(page);
365 + SetPageUptodate(page);
369 +EXPORT_SYMBOL(set_page_constant);
370 +EXPORT_SYMBOL(clear_page_constant);
373 * Get a lock on the page, assuming we need to sleep to get it.