Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / new-tcp-zero-copy-2.6.5-sles9.patch
1 diff -ur linux-2.6.5-7.252/include/linux/skbuff.h linux-2.6.5-7.252-tcp/include/linux/skbuff.h
2 --- linux-2.6.5-7.252/include/linux/skbuff.h    2006-10-11 21:46:38.000000000 +0300
3 +++ linux-2.6.5-7.252-tcp/include/linux/skbuff.h        2006-10-11 21:52:56.000000000 +0300
4 @@ -135,6 +135,36 @@
5         __u16 size;
6  };
7  
8 +/* Zero Copy Callback Descriptor
9 + * This struct supports receiving notification when zero-copy network I/O has
10 + * completed.  The ZCCD can be embedded in a struct containing the state of a
11 + * zero-copy network send.  Every skbuff that references that send's pages also
12 + * keeps a reference on the ZCCD.  When they have all been disposed of, the
13 + * reference count on the ZCCD drops to zero and the callback is made, telling
14 + * the original caller that the pages may now be overwritten. */
15 +struct zccd 
16 +{
17 +       atomic_t         zccd_refcount;        /* outstanding references; set to 1 by zccd_init() */
18 +       void           (*zccd_callback)(struct zccd *); /* run by zccd_decref() when the count reaches zero */
19 +};
20 +
21 +static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *))
22 +{
23 +       atomic_set (&d->zccd_refcount, 1);      /* descriptor starts with a single reference */
24 +       d->zccd_callback = callback;            /* called by zccd_decref() on the final release */
25 +}
26 +
27 +static inline void zccd_incref (struct zccd *d)        /* take a reference; balance with zccd_decref() */
28 +{
29 +       atomic_inc (&d->zccd_refcount);
30 +}
31 +
32 +static inline void zccd_decref (struct zccd *d)        /* release a reference; the last release runs the callback */
33 +{
34 +       if (atomic_dec_and_test (&d->zccd_refcount))    /* true only when the count reaches zero */
35 +               (d->zccd_callback)(d);                  /* NOTE(review): callback is not NULL-checked here */
36 +}
37 +
38  /* This data is invariant across clones and lives at
39   * the end of the header data, ie. at skb->end.
40   */
41 @@ -144,6 +174,11 @@
42         unsigned short  tso_size;
43         unsigned short  tso_segs;
44         struct sk_buff  *frag_list;
45 +       struct zccd     *zccd1;
46 +       struct zccd     *zccd2;
47 +       /* NB zero-copy data is normally whole pages.  We have 2 zccds in an
48 +        * skbuff so we don't unnecessarily split the packet where pages fall
49 +        * into the same packet. */
50         skb_frag_t      frags[MAX_SKB_FRAGS];
51  };
52  
53 @@ -1152,6 +1187,23 @@
54  #endif
55  }
56  
57 +/* This skbuff has dropped its pages: drop refs on any zero-copy callback
58 + * descriptors it has. */
59 +static inline void skb_complete_zccd (struct sk_buff *skb)
60 +{
61 +       struct skb_shared_info *info = skb_shinfo(skb);
62 +       
63 +       if (info->zccd1 != NULL) {
64 +               zccd_decref(info->zccd1);
65 +               info->zccd1 = NULL;             /* slot cleared, so a repeat call does not drop the ref twice */
66 +       }
67 +
68 +       if (info->zccd2 != NULL) {
69 +               zccd_decref(info->zccd2);
70 +               info->zccd2 = NULL;             /* same handling for the second descriptor slot */
71 +       }
72 +}
73 +
74  #define skb_queue_walk(queue, skb) \
75                 for (skb = (queue)->next, prefetch(skb->next);  \
76                      (skb != (struct sk_buff *)(queue));        \
77 diff -ur linux-2.6.5-7.252/include/net/tcp.h linux-2.6.5-7.252-tcp/include/net/tcp.h
78 --- linux-2.6.5-7.252/include/net/tcp.h 2006-10-11 21:46:38.000000000 +0300
79 +++ linux-2.6.5-7.252-tcp/include/net/tcp.h     2006-10-11 21:52:56.000000000 +0300
80 @@ -764,6 +764,8 @@
81  extern int                     tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
82                                             struct msghdr *msg, size_t size);
83  extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
84 +extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
85 +                                                 int flags, struct zccd *zccd);
86  
87  extern int                     tcp_ioctl(struct sock *sk, 
88                                           int cmd, 
89 diff -ur linux-2.6.5-7.252/net/core/dev.c linux-2.6.5-7.252-tcp/net/core/dev.c
90 --- linux-2.6.5-7.252/net/core/dev.c    2006-10-11 21:46:38.000000000 +0300
91 +++ linux-2.6.5-7.252-tcp/net/core/dev.c        2006-10-11 21:52:56.000000000 +0300
92 @@ -1322,6 +1322,8 @@
93         ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
94         ninfo->nr_frags = 0;
95         ninfo->frag_list = NULL;
96 +       ninfo->zccd1 = NULL;                    /* zero copy completion callback */
97 +       ninfo->zccd2 = NULL;                    /* not required */
98  
99         /* Offset between the two in bytes */
100         offset = data - skb->head;
101 diff -ur linux-2.6.5-7.252/net/core/skbuff.c linux-2.6.5-7.252-tcp/net/core/skbuff.c
102 --- linux-2.6.5-7.252/net/core/skbuff.c 2006-10-11 21:46:38.000000000 +0300
103 +++ linux-2.6.5-7.252-tcp/net/core/skbuff.c     2006-10-11 22:06:31.000000000 +0300
104 @@ -152,6 +152,8 @@
105         skb_shinfo(skb)->tso_size = 0;
106         skb_shinfo(skb)->tso_segs = 0;
107         skb_shinfo(skb)->frag_list = NULL;
108 +       skb_shinfo(skb)->zccd1 = NULL;          /* zero-copy completion callback */
109 +       skb_shinfo(skb)->zccd2 = NULL;          /* not required (yet) */
110  out:
111         return skb;
112  nodata:
113 @@ -186,6 +188,9 @@
114  {
115         if (!skb->cloned ||
116             atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
117 +               /* complete zero-copy callbacks (if any) */
118 +               skb_complete_zccd(skb);
119 +
120                 if (skb_shinfo(skb)->nr_frags) {
121                         int i;
122                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
123 @@ -456,7 +461,19 @@
124                         skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
125                         get_page(skb_shinfo(n)->frags[i].page);
126                 }
137                 skb_shinfo(n)->nr_frags = i;
138 +               /* n now shares skb's pages: give n its own zccd refs, skb keeps its own */
139 +               if (skb_shinfo(skb)->zccd1 != NULL) {
140 +                       BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
141 +                       skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
142 +                       zccd_incref(skb_shinfo(n)->zccd1);
143 +               }
144 +
145 +               if (skb_shinfo(skb)->zccd2 != NULL) {
146 +                       BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
147 +                       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
148 +                       zccd_incref(skb_shinfo(n)->zccd2);
149 +               }
150         }
151         skb_shinfo(n)->tso_size = skb_shinfo(skb)->tso_size;
152         skb_shinfo(n)->tso_segs = skb_shinfo(skb)->tso_segs;
153 @@ -508,6 +535,13 @@
154         memcpy(data + nhead, skb->head, skb->tail - skb->head);
155         memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
156  
157 +       /* zero-copy descriptors have been copied into the new shinfo - 
158 +        * account the new references */
159 +       if (skb_shinfo(skb)->zccd1 != NULL)
160 +          zccd_incref(skb_shinfo(skb)->zccd1);
161 +       if (skb_shinfo(skb)->zccd2 != NULL)
162 +          zccd_incref(skb_shinfo(skb)->zccd2);
163 +       
164         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
165                 get_page(skb_shinfo(skb)->frags[i].page);
166  
167 @@ -671,6 +705,9 @@
168                 offset = end;
169         }
170  
171 +       if (skb_shinfo(skb)->nr_frags == 0)     /* dropped all the pages */
172 +               skb_complete_zccd(skb);         /* drop zccd refs */
173 +       
174         if (offset < len) {
175                 skb->data_len -= skb->len - len;
176                 skb->len       = len;
177 @@ -823,6 +860,9 @@
178         }
179         skb_shinfo(skb)->nr_frags = k;
180  
181 +       if (k == 0)                             /* dropped all the pages */
182 +               skb_complete_zccd(skb);         /* drop zccd refs */
183 +               
184         skb->tail     += delta;
185         skb->data_len -= delta;
186  
187 diff -ur linux-2.6.5-7.252/net/ipv4/tcp.c linux-2.6.5-7.252-tcp/net/ipv4/tcp.c
188 --- linux-2.6.5-7.252/net/ipv4/tcp.c    2006-10-11 21:46:38.000000000 +0300
189 +++ linux-2.6.5-7.252-tcp/net/ipv4/tcp.c        2006-10-11 23:15:24.000000000 +0300
190 @@ -799,7 +799,7 @@
191  }
192  
193  ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
194 -                        size_t psize, int flags);
195 +                        size_t psize, int flags, struct zccd *zccd);
196  
197  static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
198                                int off)
199 @@ -881,8 +881,9 @@
200         return err;
201  }
202  
203 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
204  ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
205 -                        size_t psize, int flags)
206 +                        size_t psize, int flags, struct zccd *zccd)
207  {
208         struct tcp_opt *tp = tcp_sk(sk);
209         int mss_now;
210 @@ -929,6 +930,16 @@
211                         copy = size;
212  
213                 i = skb_shinfo(skb)->nr_frags;
214 +
215 +               if (zccd != NULL &&                   /* completion callback wanted */
216 +                       skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */
217 +                       skb_shinfo(skb)->zccd2 != NULL && 
218 +                       skb_shinfo(skb)->zccd1 != zccd && /* room needed */
219 +                       skb_shinfo(skb)->zccd2 != zccd) {
220 +                       tcp_mark_push (tp, skb);
221 +                       goto new_segment;
222 +               }
223 +
224                 if (can_coalesce(skb, i, page, offset)) {
225                         skb_shinfo(skb)->frags[i - 1].size += copy;
226                 } else if (i < MAX_SKB_FRAGS) {
227 @@ -939,6 +950,18 @@
228                         goto new_segment;
229                 }
230  
231 +               if (zccd != NULL &&                   /* completion callback wanted */
232 +                   skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */
233 +                   skb_shinfo(skb)->zccd2 != zccd) {
234 +                       if (skb_shinfo(skb)->zccd1 == NULL) {
235 +                               skb_shinfo(skb)->zccd1 = zccd;
236 +                       } else {
237 +                               BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
238 +                               skb_shinfo(skb)->zccd2 = zccd;
239 +                       }
240 +                       zccd_incref(zccd);            /* new reference */
241 +               }
242 +
243                 skb->len += copy;
244                 skb->data_len += copy;
245                 skb->ip_summed = CHECKSUM_HW;
246 @@ -987,8 +1010,8 @@
247         return tcp_error(sk, flags, err);
248  }
249  
250 -ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
251 -                    size_t size, int flags)
252 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset,
253 +                         size_t size, int flags, struct zccd *zccd)
254  {
255         ssize_t res;
256         struct sock *sk = sock->sk;
257 @@ -1003,12 +1026,19 @@
258  
259         lock_sock(sk);
260         TCP_CHECK_TIMER(sk);
261 -       res = do_tcp_sendpages(sk, &page, offset, size, flags);
262 +       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
263         TCP_CHECK_TIMER(sk);
264         release_sock(sk);
265         return res;
266  }
267  
268 +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
269 +                    size_t size, int flags)
270 +{
271 +       return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL); /* NULL zccd: no completion callback */
272 +}
273 +
274 +
275  #define TCP_PAGE(sk)   (inet_sk(sk)->sndmsg_page)
276  #define TCP_OFF(sk)    (inet_sk(sk)->sndmsg_off)
277  
278 @@ -2872,6 +2902,7 @@
279  EXPORT_SYMBOL(tcp_recvmsg);
280  EXPORT_SYMBOL(tcp_sendmsg);
281  EXPORT_SYMBOL(tcp_sendpage);
282 +EXPORT_SYMBOL(tcp_sendpage_zccd);
283  EXPORT_SYMBOL(tcp_setsockopt);
284  EXPORT_SYMBOL(tcp_shutdown);
285  EXPORT_SYMBOL(tcp_sockets_allocated);
286 diff -ur linux-2.6.5-7.252/net/ipv4/tcp_output.c linux-2.6.5-7.252-tcp/net/ipv4/tcp_output.c
287 --- linux-2.6.5-7.252/net/ipv4/tcp_output.c     2006-10-11 21:46:38.000000000 +0300
288 +++ linux-2.6.5-7.252-tcp/net/ipv4/tcp_output.c 2006-10-11 22:14:04.000000000 +0300
289 @@ -411,6 +411,30 @@
290                         pos += size;
291                 }
292                 skb_shinfo(skb1)->nr_frags = k;
293 +
294 +               if (k != 0) {                           
295 +                       /* skb1 has pages. Transfer or clone the zccds */
296 +
297 +                       if (skb_shinfo(skb)->zccd1 != NULL) {
298 +                               BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
299 +                               skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
300 +
301 +                               if (skb_shinfo(skb)->nr_frags == 0)
302 +                                       skb_shinfo(skb)->zccd1 = NULL;
303 +                               else
304 +                                       zccd_incref(skb_shinfo(skb)->zccd1);
305 +                       }
306 +               
307 +                       if (skb_shinfo(skb)->zccd2 != NULL) {
308 +                               BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
309 +                               skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
310 +
311 +                               if (skb_shinfo(skb)->nr_frags == 0)
312 +                                       skb_shinfo(skb)->zccd2 = NULL;
313 +                               else
314 +                                       zccd_incref(skb_shinfo(skb)->zccd2);
315 +                       }
316 +               }
317         }
318  }
319  
320 @@ -505,6 +529,9 @@
321         }
322         skb_shinfo(skb)->nr_frags = k;
323  
324 +       if (k == 0)                             /* dropped all pages */
325 +               skb_complete_zccd(skb);
326 +       
327         skb->tail = skb->data;
328         skb->data_len -= len;
329         skb->len = skb->data_len;