land v0.9.1 on HEAD, in preparation for a 1.0.x branch
fs/lustre-release.git: lustre/kernel_patches/patches/tcp-zero-copy-2.4.22-rh.patch
 include/linux/skbuff.h |   30 +++++
 include/net/tcp.h      |    5 
 net/core/skbuff.c      |   25 ++++
 net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++++-
 net/netsyms.c          |    2 
 5 files changed, 311 insertions(+), 3 deletions(-)

--- linux-2.4.22-ac1/include/linux/skbuff.h~tcp-zero-copy-2.4.22-rh     2003-08-25 15:44:44.000000000 +0400
+++ linux-2.4.22-ac1-alexey/include/linux/skbuff.h      2003-09-26 00:38:48.000000000 +0400
@@ -116,6 +116,30 @@ struct skb_frag_struct
        __u16 size;
 };
 
+/* Support for callback when skb data has been released */
+typedef struct zccd                            /* Zero Copy Callback Descriptor */
+{                                              /* (embed as first member of custom struct) */
+       atomic_t        zccd_count;             /* reference count */
+       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+       atomic_set (&d->zccd_count, 1);
+       d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)                /* take a reference */
+{
+       atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)                /* release a reference */
+{
+       if (atomic_dec_and_test (&d->zccd_count))
+               (d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +147,12 @@ struct skb_shared_info {
        atomic_t        dataref;
        unsigned int    nr_frags;
        struct sk_buff  *frag_list;
+       zccd_t          *zccd;                  /* zero copy descriptor */
+       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
+       /* NB we expect zero-copy data to be at least 1 packet, so
+        * having 2 zccds means we don't unnecessarily split the packet
+        * where consecutive zero-copy sends abut.
+        */
        skb_frag_t      frags[MAX_SKB_FRAGS];
 };
 
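
To illustrate how the zccd_t API above is meant to be used (this sketch is
editorial, not part of the patch; my_tx_desc and my_tx_done are hypothetical
names), a caller embeds the descriptor as the first member of its own
structure, so the destructor callback can recover the enclosing container:

	/* hypothetical caller-side completion descriptor */
	struct my_tx_desc {
		zccd_t		desc_zccd;	/* MUST be the first member */
		struct page	*desc_page;	/* page being sent zero-copy */
	};

	static void my_tx_done(zccd_t *zccd)
	{
		/* zccd is the first member, so the cast recovers the container */
		struct my_tx_desc *d = (struct my_tx_desc *)zccd;

		/* no skbuff references the page any more: safe to release it */
		put_page(d->desc_page);
		kfree(d);
	}

zccd_init() starts the refcount at 1 (the caller's own reference); the stack
takes further references while skbuffs point at the pages, and my_tx_done()
runs when the last reference is dropped through zccd_put().
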
--- linux-2.4.22-ac1/include/net/tcp.h~tcp-zero-copy-2.4.22-rh  2003-08-25 15:44:44.000000000 +0400
+++ linux-2.4.22-ac1-alexey/include/net/tcp.h   2003-09-26 00:38:48.000000000 +0400
@@ -643,6 +643,8 @@ extern int                  tcp_v4_tw_remember_stam
 
 extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                                                 int flags, zccd_t *zccd);
 
 extern int                     tcp_ioctl(struct sock *sk, 
                                          int cmd, 
@@ -737,6 +739,9 @@ extern int                  tcp_recvmsg(struct sock *sk
                                            struct msghdr *msg,
                                            int len, int nonblock, 
                                            int flags, int *addr_len);
+extern int                     tcp_recvpackets(struct sock *sk,
+                                               struct sk_buff_head *packets,
+                                               int len, int nonblock);
 
 extern int                     tcp_listen_start(struct sock *sk);
 
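
A sender-side sketch of the new entry point (again editorial, not part of the
patch; my_send_page is hypothetical and builds on the my_tx_desc sketch
above). tcp_sendpage_zccd() behaves like tcp_sendpage() but attaches the
caller's descriptor to every skbuff that ends up referencing the page:

	static ssize_t my_send_page(struct socket *sock, struct page *page,
				    size_t len)
	{
		struct my_tx_desc *d = kmalloc(sizeof(*d), GFP_KERNEL);
		ssize_t rc;

		if (d == NULL)
			return -ENOMEM;

		/* ownership of the caller's page reference passes to d */
		d->desc_page = page;
		zccd_init(&d->desc_zccd, my_tx_done);	/* refcount = 1 */

		rc = tcp_sendpage_zccd(sock, page, 0, len, 0, &d->desc_zccd);

		/* drop the caller's reference; my_tx_done() fires once the
		 * stack frees the last skbuff referencing the page */
		zccd_put(&d->desc_zccd);
		return rc;
	}

Note the socket must support scatter/gather and hardware checksumming
(NETIF_F_SG plus one of the checksum flags); tcp_sendpage_zccd() BUG()s
otherwise, as the net/ipv4/tcp.c hunk below shows.
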
--- linux-2.4.22-ac1/net/core/skbuff.c~tcp-zero-copy-2.4.22-rh  2003-08-25 15:44:44.000000000 +0400
+++ linux-2.4.22-ac1-alexey/net/core/skbuff.c   2003-09-26 00:38:48.000000000 +0400
@@ -208,6 +208,8 @@ struct sk_buff *alloc_skb(unsigned int s
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
+       skb_shinfo(skb)->zccd2 = NULL;
        return skb;
 
 nodata:
@@ -277,6 +279,10 @@ static void skb_release_data(struct sk_b
 {
        if (!skb->cloned ||
            atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
+               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
                if (skb_shinfo(skb)->nr_frags) {
                        int i;
                        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -535,6 +541,8 @@ int skb_linearize(struct sk_buff *skb, i
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
+       skb_shinfo(skb)->zccd2 = NULL;
 
        /* We are no longer a clone, even if we were. */
        skb->cloned = 0;
@@ -581,6 +589,14 @@ struct sk_buff *pskb_copy(struct sk_buff
        n->data_len = skb->data_len;
        n->len = skb->len;
 
+       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
        if (skb_shinfo(skb)->nr_frags) {
                int i;
 
@@ -623,6 +639,8 @@ int pskb_expand_head(struct sk_buff *skb
        u8 *data;
        int size = nhead + (skb->end - skb->head) + ntail;
        long off;
+       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
+       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
 
        if (skb_shared(skb))
                BUG();
@@ -644,6 +662,11 @@ int pskb_expand_head(struct sk_buff *skb
        if (skb_shinfo(skb)->frag_list)
                skb_clone_fraglist(skb);
 
+       if (zccd != NULL)                       /* user zero copy descriptor? */
+               zccd_get (zccd);                /* extra ref (pages are shared) */
+       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
+               zccd_get (zccd2);               /* extra ref (pages are shared) */
+
        skb_release_data(skb);
 
        off = (data+nhead) - skb->head;
@@ -658,6 +681,8 @@ int pskb_expand_head(struct sk_buff *skb
        skb->nh.raw += off;
        skb->cloned = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
+       skb_shinfo(skb)->zccd = zccd;
+       skb_shinfo(skb)->zccd2 = zccd2;
        return 0;
 
 nodata:
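
The skbuff.c hunks above all maintain a single invariant: each skbuff data
area (shared info) that points at a zccd holds exactly one reference to it,
taken when the pointer is set and dropped in skb_release_data(). A
hypothetical helper making the pattern explicit (editorial, not part of the
patch):

	/* make dst co-own src's zero-copy pages, mirroring pskb_copy() */
	static void my_share_zccd(struct sk_buff *dst, struct sk_buff *src)
	{
		zccd_t *z = skb_shinfo(src)->zccd;

		if (z != NULL)
			zccd_get(z);	/* one ref per referencing shared info */
		skb_shinfo(dst)->zccd = z;
	}

skb_linearize() is the exception: it copies the payload into a fresh linear
buffer, so the new shared info starts with both zccd pointers NULL and the
old references are dropped along with the old data.
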
--- linux-2.4.22-ac1/net/ipv4/tcp.c~tcp-zero-copy-2.4.22-rh     2003-08-25 15:44:44.000000000 +0400
+++ linux-2.4.22-ac1-alexey/net/ipv4/tcp.c      2003-09-26 00:38:48.000000000 +0400
@@ -747,7 +747,7 @@ do_interrupted:
        goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -826,7 +826,8 @@ static int tcp_error(struct sock *sk, in
        return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int mss_now;
@@ -874,6 +875,17 @@ new_segment:
                        copy = size;
 
                i = skb_shinfo(skb)->nr_frags;
+
+               if (zccd != NULL &&             /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
+                   skb_shinfo(skb)->zccd2 != NULL &&
+                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       tcp_mark_push (tp, skb);
+                       goto new_segment;
+               }
+
                if (can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += copy;
                } else if (i < MAX_SKB_FRAGS) {
@@ -884,6 +896,20 @@ new_segment:
                        goto new_segment;
                }
 
+               if (zccd != NULL &&     /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       zccd_get (zccd);        /* bump ref count */
+
+                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
+                               skb_shinfo(skb)->zccd = zccd;
+                       else
+                               skb_shinfo(skb)->zccd2 = zccd;
+               }
+
                skb->len += copy;
                skb->data_len += copy;
                skb->ip_summed = CHECKSUM_HW;
@@ -947,7 +973,31 @@ ssize_t tcp_sendpage(struct socket *sock
 
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
-       res = do_tcp_sendpages(sk, &page, offset, size, flags);
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                         int flags, zccd_t *zccd)
+{
+       ssize_t res;
+       struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
+           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
+               BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
+
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return res;
@@ -1771,6 +1821,202 @@ recv_urg:
        goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+                    int len, int nonblock)
+{
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       int copied;
+       long timeo;
+
+       BUG_TRAP (len > 0);
+       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+       lock_sock(sk);
+
+       TCP_CHECK_TIMER(sk);
+
+       copied = -ENOTCONN;
+       if (sk->state == TCP_LISTEN)
+               goto out;
+
+       copied = 0;
+       timeo = sock_rcvtimeo(sk, nonblock);
+
+       do {
+               struct sk_buff * skb;
+               u32 offset;
+               unsigned long used;
+               int exhausted;
+               int eaten;
+
+               /* Are we at urgent data? Stop if we have read anything. */
+               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+                       break;
+
+               /* We need to check signals first, to get correct SIGURG
+                * handling. FIXME: Need to check this doesn't impact 1003.1g
+                * and move it down to the bottom of the loop.
+                */
+               if (signal_pending(current)) {
+                       if (copied)
+                               break;
+                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+                       break;
+               }
+
+               /* Next get a buffer. */
+
+               skb = skb_peek(&sk->receive_queue);
+
+               if (skb == NULL)                /* nothing ready */
+               {
+                       if (copied) {
+                               if (sk->err ||
+                                   sk->state == TCP_CLOSE ||
+                                   (sk->shutdown & RCV_SHUTDOWN) ||
+                                   !timeo ||
+                                   (0))
+                                       break;
+                       } else {
+                               if (sk->done)
+                                       break;
+
+                               if (sk->err) {
+                                       copied = sock_error(sk);
+                                       break;
+                               }
+
+                               if (sk->shutdown & RCV_SHUTDOWN)
+                                       break;
+
+                               if (sk->state == TCP_CLOSE) {
+                                       if (!sk->done) {
+                                               /* This occurs when user tries to read
+                                                * from never connected socket.
+                                                */
+                                               copied = -ENOTCONN;
+                                               break;
+                                       }
+                                       break;
+                               }
+
+                               if (!timeo) {
+                                       copied = -EAGAIN;
+                                       break;
+                               }
+                       }
+
+                       cleanup_rbuf(sk, copied);
+                       timeo = tcp_data_wait(sk, timeo);
+                       continue;
+               }
+
+               BUG_TRAP (atomic_read (&skb->users) == 1);
+
+               exhausted = eaten = 0;
+
+               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+               if (skb->h.th->syn)
+                       offset--;
+
+               used = skb->len - offset;
+
+               if (tp->urg_data) {
+                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
+                       if (urg_offset < used) {
+                               if (!urg_offset) { /* at urgent data */
+                                       if (!sk->urginline) {
+                                               tp->copied_seq++; /* discard the single byte of urgent data */
+                                               offset++;
+                                               used--;
+                                       }
+                               } else          /* truncate read */
+                                       used = urg_offset;
+                       }
+               }
+
+               BUG_TRAP (used >= 0);
+               if (len < used)
+                       used = len;
+
+               if (used == 0)
+                       exhausted = 1;
+               else
+               {
+                       if (skb_is_nonlinear (skb))
+                       {
+                               int   rc = skb_linearize (skb, GFP_KERNEL);
+
+                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+                               if (rc)
+                               {
+                                       if (!copied)
+                                               copied = rc;
+                                       break;
+                               }
+                       }
+
+                       if ((offset + used) == skb->len) /* consuming the whole packet */
+                       {
+                               __skb_unlink (skb, &sk->receive_queue);
+                               dst_release (skb->dst);
+                               skb_orphan (skb);
+                               __skb_pull (skb, offset);
+                               __skb_queue_tail (packets, skb);
+                               exhausted = eaten = 1;
+                       }
+                       else                    /* consuming only part of the packet */
+                       {
+                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+                               if (skb2 == NULL)
+                               {
+                                       if (!copied)
+                                               copied = -ENOMEM;
+                                       break;
+                               }
+
+                               dst_release (skb2->dst);
+                               __skb_pull (skb2, offset);
+                               __skb_trim (skb2, used);
+                               __skb_queue_tail (packets, skb2);
+                       }
+
+                       tp->copied_seq += used;
+                       copied += used;
+                       len -= used;
+               }
+
+               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+                       tp->urg_data = 0;
+                       tcp_fast_path_check(sk, tp);
+               }
+
+               if (!exhausted)
+                       continue;
+
+               if (skb->h.th->fin)
+               {
+                       tp->copied_seq++;
+                       if (!eaten)
+                               tcp_eat_skb (sk, skb);
+                       break;
+               }
+
+               if (!eaten)
+                       tcp_eat_skb (sk, skb);
+
+       } while (len > 0);
+
+ out:
+       /* Clean up data we have read: This will do ACK frames. */
+       cleanup_rbuf(sk, copied);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return copied;
+}
+
 /*
  *     State processing on a close. This implements the state shift for
  *     sending our FIN frame. Note that we only send a FIN for some
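
A receiver-side sketch of tcp_recvpackets() (editorial, not part of the
patch; my_recv_packets is a hypothetical caller). Instead of copying payload
into a user buffer, the caller is handed the sk_buffs themselves, each
already pulled and trimmed so that skb->data/skb->len cover exactly the
payload consumed:

	static int my_recv_packets(struct sock *sk, int len)
	{
		struct sk_buff_head packets;
		struct sk_buff *skb;
		int rc;

		skb_queue_head_init(&packets);

		/* blocking receive of up to 'len' bytes of in-order data */
		rc = tcp_recvpackets(sk, &packets, len, 0);

		while ((skb = __skb_dequeue(&packets)) != NULL) {
			/* ... consume skb->len bytes at skb->data ... */
			kfree_skb(skb);
		}
		return rc;	/* bytes queued, or a negative errno */
	}
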
--- linux-2.4.22-ac1/net/netsyms.c~tcp-zero-copy-2.4.22-rh      2003-09-25 14:16:26.000000000 +0400
+++ linux-2.4.22-ac1-alexey/net/netsyms.c       2003-09-26 00:39:16.000000000 +0400
@@ -396,6 +396,8 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(tcp_cwnd_application_limited);
 EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 
 EXPORT_SYMBOL(tcp_write_xmit);

_