lustre/kernel_patches/patches/raid5-optimize-memcpy.patch
In case of a full-stripe write, don't copy the data into the internal
stripe cache. This optimization reduces CPU load by roughly 30%.

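The fast path applies to a data device only when the whole stripe is
overwritten (the patch's zerocopy flag stays 1 across RECONSTRUCT_WRITE),
the device's pending write is a single bio, and R5_OVERWRITE is set, so
one bio_vec page can be handed to the disk request in place of the cache
page. A minimal sketch of that test follows; r5_can_zero_copy() is a
hypothetical helper for illustration only, the patch itself open-codes
the check in compute_parity():

	/* Hypothetical helper, not part of the patch: true iff device i
	 * of this stripe may skip the cache copy.  "zerocopy" is 1 only
	 * when every data device in the stripe has a pending write. */
	static int r5_can_zero_copy(struct stripe_head *sh, int i, int zerocopy)
	{
		struct bio *wbi = sh->dev[i].written;

		return zerocopy && wbi != NULL && wbi->bi_next == NULL &&
		       test_bit(R5_OVERWRITE, &sh->dev[i].flags);
	}

On completion, raid5_end_write_request() restores dev->page into the
request's bio_vec, and handle_stripe() clears R5_Direct when the write
is returned, so the cache page is reused for the next request.
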
Index: linux-2.6.9/include/linux/raid/raid5.h
===================================================================
--- linux-2.6.9.orig/include/linux/raid/raid5.h 2006-05-21 17:57:25.000000000 +0400
+++ linux-2.6.9/include/linux/raid/raid5.h      2006-05-22 00:10:04.000000000 +0400
@@ -152,6 +152,7 @@ struct stripe_head {
 #define        R5_Wantread     4       /* want to schedule a read */
 #define        R5_Wantwrite    5
 #define        R5_Syncio       6       /* this io need to be accounted as resync io */
+#define        R5_Direct       7       /* use page from passed bio to avoid memcpy */
 
 /*
  * Write method
Index: linux-2.6.9/drivers/md/raid5.c
===================================================================
--- linux-2.6.9.orig/drivers/md/raid5.c 2006-05-22 00:10:01.000000000 +0400
+++ linux-2.6.9/drivers/md/raid5.c      2006-05-22 00:10:04.000000000 +0400
@@ -411,6 +411,8 @@ static int raid5_end_read_request (struc
                clear_buffer_uptodate(bh);
        }
 #endif
+       if (test_bit(R5_Direct, &sh->dev[i].flags))
+               printk("R5_Direct for READ ?!\n");
        clear_bit(R5_LOCKED, &sh->dev[i].flags);
        set_bit(STRIPE_HANDLE, &sh->state);
        release_stripe(sh);
@@ -449,6 +451,10 @@ static int raid5_end_write_request (stru
 
        rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
 
+       if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+               BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+               sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+       }
       clear_bit(R5_LOCKED, &sh->dev[i].flags);
       set_bit(STRIPE_HANDLE, &sh->state);
       __release_stripe(conf, sh);
@@ -673,6 +679,49 @@ static void copy_data(int frombio, struc
        }
 }
 
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+       struct bio_vec *bvl;
+       int i;
+
+       for (;bio && bio->bi_sector < sector+STRIPE_SECTORS;
+             bio = r5_next_bio(bio, sector) ) {
+               int page_offset;
+               if (bio->bi_sector >= sector)
+                       page_offset = (signed)(bio->bi_sector - sector) * 512;
+               else
+                       page_offset = (signed)(sector - bio->bi_sector) * -512;
+               bio_for_each_segment(bvl, bio, i) {
+                       int len = bio_iovec_idx(bio,i)->bv_len;
+                       int clen;
+                       int b_offset = 0;
+
+                       if (page_offset < 0) {
+                               b_offset = -page_offset;
+                               page_offset += b_offset;
+                               len -= b_offset;
+                       }
+
+                       if (len > 0 && page_offset + len > STRIPE_SIZE)
+                               clen = STRIPE_SIZE - page_offset;
+                       else clen = len;
+
+                       if (clen > 0) {
+                               BUG_ON(clen < STRIPE_SIZE);
+                               /*printk("  sector %lu: page %p from index %u\n",
+                                       (unsigned long) sector,
+                                       bio_iovec_idx(bio, i)->bv_page, i);*/
+                               return bio_iovec_idx(bio, i)->bv_page;
+                       }
+                       if (clen < len) /* hit end of page */
+                               break;
+                       page_offset += len;
+               }
+       }
+       BUG();
+       return NULL;
+}
+
 #define check_xor()    do {                                            \
                           if (count == MAX_XOR_BLOCKS) {               \
                                xor_block(count, STRIPE_SIZE, ptr);     \
@@ -717,6 +766,8 @@ static void compute_parity(struct stripe
        int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
        void *ptr[MAX_XOR_BLOCKS];
        struct bio *chosen;
+       struct page *page;
+       int zerocopy = 0;
 
        PRINTK("compute_parity, stripe %llu, method %d\n",
                (unsigned long long)sh->sector, method);
@@ -743,13 +794,17 @@ static void compute_parity(struct stripe
                break;
        case RECONSTRUCT_WRITE:
                memset(ptr[0], 0, STRIPE_SIZE);
-               for (i= disks; i-- ;)
+               zerocopy = 1;
+               for (i= disks; i-- ;) {
+                       if (i != pd_idx && !sh->dev[i].towrite)
+                               zerocopy = 0;
                        if (i!=pd_idx && sh->dev[i].towrite) {
                                chosen = sh->dev[i].towrite;
                                sh->dev[i].towrite = NULL;
                                if (sh->dev[i].written) BUG();
                                sh->dev[i].written = chosen;
                        }
+               }
                break;
        case CHECK_PARITY:
                break;
@@ -759,34 +814,62 @@ static void compute_parity(struct stripe
                count = 1;
        }
 
-       for (i = disks; i--;)
-               if (sh->dev[i].written) {
-                       sector_t sector = sh->dev[i].sector;
-                       struct bio *wbi = sh->dev[i].written;
-                       while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-                               copy_data(1, wbi, sh->dev[i].page, sector);
-                               wbi = r5_next_bio(wbi, sector);
-                       }
+       for (i = disks; i--;) {
+               struct bio *wbi = sh->dev[i].written;
+               sector_t sector;
+
+               if (!wbi)
+                       continue;
+
+               sector = sh->dev[i].sector;
+               set_bit(R5_LOCKED, &sh->dev[i].flags);
+               BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+               /* Check whether the write is covered by a single page
+                * and the whole stripe is written at once; in that case
+                * we can avoid the memcpy(). */
+               if (zerocopy && wbi && wbi->bi_next == NULL &&
+                               test_bit(R5_OVERWRITE, &sh->dev[i].flags)) {
+                       page = zero_copy_data(wbi, sector);
+                       BUG_ON(PageHighMem(page));
+                       sh->dev[i].req.bi_io_vec[0].bv_page = page;
+                       set_bit(R5_Direct, &sh->dev[i].flags);
+                       clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+                       continue;
+               }
 
-                       set_bit(R5_LOCKED, &sh->dev[i].flags);
-                       set_bit(R5_UPTODATE, &sh->dev[i].flags);
+               set_bit(R5_UPTODATE, &sh->dev[i].flags);
+               while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+                       copy_data(1, wbi, sh->dev[i].page, sector);
+                       wbi = r5_next_bio(wbi, sector);
                }
+       }
 
        switch(method) {
        case RECONSTRUCT_WRITE:
        case CHECK_PARITY:
-               for (i=disks; i--;)
-                       if (i != pd_idx) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               check_xor();
-                       }
+               for (i=disks; i--;) {
+                       if (i == pd_idx)
+                               continue;
+                       if (test_bit(R5_Direct, &sh->dev[i].flags))
+                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
+                       else
+                               page = sh->dev[i].page;
+                       ptr[count++] = page_address(page);
+                       check_xor();
+               }
                break;
        case READ_MODIFY_WRITE:
-               for (i = disks; i--;)
-                       if (sh->dev[i].written) {
-                               ptr[count++] = page_address(sh->dev[i].page);
-                               check_xor();
-                       }
+               for (i = disks; i--;) {
+                       if (!sh->dev[i].written)
+                               continue;
+                       if (test_bit(R5_Direct, &sh->dev[i].flags))
+                               page = sh->dev[i].req.bi_io_vec[0].bv_page;
+                       else
+                               page = sh->dev[i].page;
+                       ptr[count++] = page_address(page);
+                       check_xor();
+               }
        }
        if (count != 1)
                xor_block(count, STRIPE_SIZE, ptr);
@@ -1012,7 +1094,7 @@ static void handle_stripe(struct stripe_
        dev = &sh->dev[sh->pd_idx];
        if ( written &&
             ( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
-               test_bit(R5_UPTODATE, &dev->flags))
+               (test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Direct, &dev->flags)))
               || (failed == 1 && failed_num == sh->pd_idx))
            ) {
            /* any written block on an uptodate or failed drive can be returned.
@@ -1023,13 +1105,16 @@ static void handle_stripe(struct stripe_
                if (sh->dev[i].written) {
                    dev = &sh->dev[i];
                    if (!test_bit(R5_LOCKED, &dev->flags) &&
-                        test_bit(R5_UPTODATE, &dev->flags) ) {
+                        (test_bit(R5_UPTODATE, &dev->flags) ||
+                               test_bit(R5_Direct, &dev->flags)) ) {
                        /* We can return any write requests */
                            struct bio *wbi, *wbi2;
                            PRINTK("Return write for disc %d\n", i);
                            spin_lock_irq(&conf->device_lock);
                            wbi = dev->written;
                            dev->written = NULL;
+                           if (test_bit(R5_Direct, &dev->flags))
+                                   clear_bit(R5_Direct, &dev->flags);
                            while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
                                    wbi2 = r5_next_bio(wbi, dev->sector);
                                    if (--wbi->bi_phys_segments == 0) {