b=4336
lustre/kernel_patches/patches/tcp_zero_copy_2.4.20_chaos.patch
 include/linux/skbuff.h |   30 +++++
 include/net/tcp.h      |    5 
 net/core/skbuff.c      |   25 ++++
 net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++++-
 net/netsyms.c          |    2 
 5 files changed, 311 insertions(+), 3 deletions(-)

Index: linux-2.4.20-rh-20.9/include/linux/skbuff.h
===================================================================
--- linux-2.4.20-rh-20.9.orig/include/linux/skbuff.h    2003-09-13 19:34:24.000000000 +0400
+++ linux-2.4.20-rh-20.9/include/linux/skbuff.h 2003-12-19 14:14:55.000000000 +0300
@@ -116,6 +116,30 @@
        __u16 size;
 };
 
+/* Support for callback when skb data has been released */
+typedef struct zccd                            /* Zero Copy Callback Descriptor */
+{                                              /* (embed as first member of custom struct) */
+       atomic_t        zccd_count;             /* reference count */
+       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+       atomic_set (&d->zccd_count, 1);
+       d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)                /* take a reference */
+{
+       atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)                /* release a reference */
+{
+       if (atomic_dec_and_test (&d->zccd_count))
+               (d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +147,12 @@
        atomic_t        dataref;
        unsigned int    nr_frags;
        struct sk_buff  *frag_list;
+       zccd_t          *zccd;                  /* zero copy descriptor */
+       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
+       /* NB we expect zero-copy data to be at least 1 packet, so
+        * having 2 zccds means we don't unnecessarily split the packet
+        * where consecutive zero-copy sends abut.
+        */
        skb_frag_t      frags[MAX_SKB_FRAGS];
 };
 
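The zccd_t above is a small refcounted completion object: the sender holds the initial reference, every skb data area that still references the sender's pages holds one more, and zccd_destructor fires when the last reference is dropped. As a minimal caller-side sketch (illustrative only, not part of this patch; ktx_desc, ktx_done and the use of struct completion are assumed names and choices), a sender would embed the descriptor as the first member of its own transmit state so the callback can cast back:

	#include <linux/skbuff.h>
	#include <linux/completion.h>

	struct ktx_desc {
		zccd_t            td_zccd;	/* must be first: the callback casts back */
		struct completion td_done;	/* signalled when the pages may be reused */
	};

	static void ktx_done(zccd_t *zccd)
	{
		struct ktx_desc *td = (struct ktx_desc *)zccd;

		complete(&td->td_done);		/* TCP no longer references the pages */
	}

A full send-side usage sketch follows the net/tcp.h section below.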
Index: linux-2.4.20-rh-20.9/include/net/tcp.h
===================================================================
--- linux-2.4.20-rh-20.9.orig/include/net/tcp.h 2003-09-13 19:34:25.000000000 +0400
+++ linux-2.4.20-rh-20.9/include/net/tcp.h      2003-12-19 14:14:55.000000000 +0300
@@ -643,6 +643,8 @@
 
 extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                                                 int flags, zccd_t *zccd);
 
 extern int                     tcp_ioctl(struct sock *sk, 
                                          int cmd, 
@@ -737,6 +739,9 @@
                                            struct msghdr *msg,
                                            int len, int nonblock, 
                                            int flags, int *addr_len);
+extern int                     tcp_recvpackets(struct sock *sk,
+                                               struct sk_buff_head *packets,
+                                               int len, int nonblock);
 
 extern int                     tcp_listen_start(struct sock *sk);
 
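tcp_sendpage_zccd() behaves like tcp_sendpage() but attaches the caller's zccd to every skb that ends up referencing the page, so the page contents must stay stable until the callback runs, i.e. until the data has been ACKed and all retransmit clones freed. Continuing the hedged sketch from above (all names and the error convention are assumptions, not part of this patch):

	static int ktx_send_page(struct socket *sock, struct page *page, size_t len)
	{
		struct ktx_desc td;
		ssize_t rc;

		zccd_init(&td.td_zccd, ktx_done);	/* refcount = 1 (ours) */
		init_completion(&td.td_done);

		rc = tcp_sendpage_zccd(sock, page, 0, len, 0, &td.td_zccd);

		zccd_put(&td.td_zccd);			/* drop our initial reference */
		wait_for_completion(&td.td_done);	/* fires once TCP drops its refs */

		return (rc == (ssize_t)len) ? 0 : -EIO;	/* assumed: short send == failure */
	}

Note that the caller is expected to have checked the route capabilities beforehand: as the net/ipv4/tcp.c hunks below show, tcp_sendpage_zccd() BUG()s unless the route supports scatter/gather and hardware checksumming, because zero copy buys nothing if the stack must copy the data to checksum it.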
Index: linux-2.4.20-rh-20.9/net/netsyms.c
===================================================================
--- linux-2.4.20-rh-20.9.orig/net/netsyms.c     2003-09-13 19:34:24.000000000 +0400
+++ linux-2.4.20-rh-20.9/net/netsyms.c  2003-12-19 14:15:24.000000000 +0300
@@ -396,7 +396,7 @@
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(tcp_cwnd_application_limited);
-EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 
 EXPORT_SYMBOL(tcp_write_xmit);
@@ -417,6 +417,8 @@
 
 #endif
 
+EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
 EXPORT_SYMBOL(tcp_read_sock);
 
 EXPORT_SYMBOL(netlink_set_err);
Index: linux-2.4.20-rh-20.9/net/core/skbuff.c
===================================================================
--- linux-2.4.20-rh-20.9.orig/net/core/skbuff.c 2003-09-13 19:34:19.000000000 +0400
+++ linux-2.4.20-rh-20.9/net/core/skbuff.c      2003-12-19 14:14:56.000000000 +0300
@@ -208,6 +208,8 @@
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
+       skb_shinfo(skb)->zccd2 = NULL;
        return skb;
 
 nodata:
@@ -276,6 +278,10 @@
 {
        if (!skb->cloned ||
            atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
+               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
                if (skb_shinfo(skb)->nr_frags) {
                        int i;
                        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -532,6 +538,8 @@
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
+       skb_shinfo(skb)->zccd2 = NULL;
 
        /* We are no longer a clone, even if we were. */
        skb->cloned = 0;
@@ -578,6 +586,14 @@
        n->data_len = skb->data_len;
        n->len = skb->len;
 
+       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
        if (skb_shinfo(skb)->nr_frags) {
                int i;
 
@@ -620,6 +636,8 @@
        u8 *data;
        int size = nhead + (skb->end - skb->head) + ntail;
        long off;
+       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
+       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
 
        if (skb_shared(skb))
                BUG();
@@ -641,6 +659,11 @@
        if (skb_shinfo(skb)->frag_list)
                skb_clone_fraglist(skb);
 
+       if (zccd != NULL)                       /* user zero copy descriptor? */
+               zccd_get (zccd);                /* extra ref (pages are shared) */
+       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
+               zccd_get (zccd2);               /* extra ref (pages are shared) */
+
        skb_release_data(skb);
 
        off = (data+nhead) - skb->head;
@@ -655,6 +678,8 @@
        skb->nh.raw += off;
        skb->cloned = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
+       skb_shinfo(skb)->zccd = zccd;
+       skb_shinfo(skb)->zccd2 = zccd2;
        return 0;
 
 nodata:
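Taken together, these net/core/skbuff.c hooks maintain one zccd reference per skb data area that can still reach the sender's pages: freshly allocated or freshly copied data starts with both descriptors NULL, skb_release_data() drops any holds, and the page-sharing copy and pskb_expand_head() paths take an extra hold before the old data area gives up its own. Informally (an illustrative pseudo-assertion, not code in the patch):

	/* At any instant:
	 *   zccd_count == (1 if the sender has not yet called zccd_put())
	 *               + number of skb shared-info blocks whose zccd or
	 *                 zccd2 pointer equals this descriptor,
	 * so the destructor runs only after the sender has dropped its
	 * reference AND the stack has freed every such data area.
	 */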
Index: linux-2.4.20-rh-20.9/net/ipv4/tcp.c
===================================================================
--- linux-2.4.20-rh-20.9.orig/net/ipv4/tcp.c    2003-09-13 19:34:25.000000000 +0400
+++ linux-2.4.20-rh-20.9/net/ipv4/tcp.c 2003-12-19 14:14:56.000000000 +0300
@@ -747,7 +747,7 @@
        goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -826,7 +826,8 @@
        return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int mss_now;
@@ -874,6 +875,17 @@
                        copy = size;
 
                i = skb_shinfo(skb)->nr_frags;
+
+               if (zccd != NULL &&             /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
+                   skb_shinfo(skb)->zccd2 != NULL &&
+                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       tcp_mark_push (tp, skb);
+                       goto new_segment;
+               }
+
                if (can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += copy;
                } else if (i < MAX_SKB_FRAGS) {
@@ -884,6 +896,20 @@
                        goto new_segment;
                }
 
+               if (zccd != NULL &&     /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       zccd_get (zccd);        /* bump ref count */
+
+                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
+                               skb_shinfo(skb)->zccd = zccd;
+                       else
+                               skb_shinfo(skb)->zccd2 = zccd;
+               }
+
                skb->len += copy;
                skb->data_len += copy;
                skb->ip_summed = CHECKSUM_HW;
@@ -947,7 +973,31 @@
 
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
-       res = do_tcp_sendpages(sk, &page, offset, size, flags);
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                         int flags, zccd_t *zccd)
+{
+       ssize_t res;
+       struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
+           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
+               BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
+
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return res;
@@ -1771,6 +1821,202 @@
        goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+                    int len, int nonblock)
+{
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       int copied;
+       long timeo;
+
+       BUG_TRAP (len > 0);
+       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+       lock_sock(sk);
+
+       TCP_CHECK_TIMER(sk);
+
+       copied = -ENOTCONN;
+       if (sk->state == TCP_LISTEN)
+               goto out;
+
+       copied = 0;
+       timeo = sock_rcvtimeo(sk, nonblock);
+
+       do {
+               struct sk_buff * skb;
+               u32 offset;
+               unsigned long used;
+               int exhausted;
+               int eaten;
+
+               /* Are we at urgent data? Stop if we have read anything. */
+               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+                       break;
+
+               /* We need to check signals first, to get correct SIGURG
+                * handling. FIXME: Need to check this doesn't impact 1003.1g
+                * and move it down to the bottom of the loop
+                */
+               if (signal_pending(current)) {
+                       if (copied)
+                               break;
+                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+                       break;
+               }
+
+               /* Next get a buffer. */
+
+               skb = skb_peek(&sk->receive_queue);
+
+               if (skb == NULL)                /* nothing ready */
+               {
+                       if (copied) {
+                               if (sk->err ||
+                                   sk->state == TCP_CLOSE ||
+                                   (sk->shutdown & RCV_SHUTDOWN) ||
+                                   !timeo ||
+                                   (0))
+                                       break;
+                       } else {
+                               if (sk->done)
+                                       break;
+
+                               if (sk->err) {
+                                       copied = sock_error(sk);
+                                       break;
+                               }
+
+                               if (sk->shutdown & RCV_SHUTDOWN)
+                                       break;
+
+                               if (sk->state == TCP_CLOSE) {
+                                       if (!sk->done) {
+                                               /* This occurs when user tries to read
+                                                * from a never-connected socket.
+                                                */
+                                               copied = -ENOTCONN;
+                                               break;
+                                       }
+                                       break;
+                               }
+
+                               if (!timeo) {
+                                       copied = -EAGAIN;
+                                       break;
+                               }
+                       }
+
+                       cleanup_rbuf(sk, copied);
+                       timeo = tcp_data_wait(sk, timeo);
+                       continue;
+               }
+
+               BUG_TRAP (atomic_read (&skb->users) == 1);
+
+               exhausted = eaten = 0;
+
+               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+               if (skb->h.th->syn)
+                       offset--;
+
+               used = skb->len - offset;
+
+               if (tp->urg_data) {
+                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
+                       if (urg_offset < used) {
+                               if (!urg_offset) { /* at urgent data */
+                                       if (!sk->urginline) {
+                                               tp->copied_seq++; /* discard the single byte of urgent data */
+                                               offset++;
+                                               used--;
+                                       }
+                               } else          /* truncate read */
+                                       used = urg_offset;
+                       }
+               }
+
+               BUG_TRAP (used >= 0);
+               if (len < used)
+                       used = len;
+
+               if (used == 0)
+                       exhausted = 1;
+               else
+               {
+                       if (skb_is_nonlinear (skb))
+                       {
+                               int   rc = skb_linearize (skb, GFP_KERNEL);
+
+                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+                               if (rc)
+                               {
+                                       if (!copied)
+                                               copied = rc;
+                                       break;
+                               }
+                       }
+
+                       if ((offset + used) == skb->len) /* consuming the whole packet */
+                       {
+                               __skb_unlink (skb, &sk->receive_queue);
+                               dst_release (skb->dst);
+                               skb_orphan (skb);
+                               __skb_pull (skb, offset);
+                               __skb_queue_tail (packets, skb);
+                               exhausted = eaten = 1;
+                       }
+                       else                    /* consuming only part of the packet */
+                       {
+                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+                               if (skb2 == NULL)
+                               {
+                                       if (!copied)
+                                               copied = -ENOMEM;
+                                       break;
+                               }
+
+                               dst_release (skb2->dst);
+                               __skb_pull (skb2, offset);
+                               __skb_trim (skb2, used);
+                               __skb_queue_tail (packets, skb2);
+                       }
+
+                       tp->copied_seq += used;
+                       copied += used;
+                       len -= used;
+               }
+
+               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+                       tp->urg_data = 0;
+                       tcp_fast_path_check(sk, tp);
+               }
+
+               if (!exhausted)
+                       continue;
+
+               if (skb->h.th->fin)
+               {
+                       tp->copied_seq++;
+                       if (!eaten)
+                               tcp_eat_skb (sk, skb);
+                       break;
+               }
+
+               if (!eaten)
+                       tcp_eat_skb (sk, skb);
+
+       } while (len > 0);
+
+ out:
+       /* Clean up data we have read: This will do ACK frames. */
+       cleanup_rbuf(sk, copied);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return copied;
+}
+
 /*
  *     State processing on a close. This implements the state shift for
  *     sending our FIN frame. Note that we only send a FIN for some
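On the receive side, tcp_recvpackets() queues whole, linearised skbs onto the caller-supplied list instead of copying payload into an iovec; each queued skb has been unlinked (or cloned), orphaned, pulled to the first unread byte, and trimmed to the consumed region, so the caller owns it outright. A hedged consumption sketch (krx_recv and consume_payload are assumed names, not part of this patch):

	static int krx_recv(struct socket *sock, int wanted)
	{
		struct sk_buff_head packets;
		struct sk_buff *skb;
		int rc;

		skb_queue_head_init(&packets);

		rc = tcp_recvpackets(sock->sk, &packets, wanted, 0 /* block */);

		while ((skb = __skb_dequeue(&packets)) != NULL) {
			/* skb->data .. skb->data + skb->len is in-order payload */
			consume_payload(skb->data, skb->len);	/* assumed helper */
			kfree_skb(skb);
		}

		return rc;	/* bytes queued, or a negative errno (list empty) */
	}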