merge b_devel into HEAD, which will become 0.7.3
lustre/kernel_patches/patches/tcp_zero_copy_2.4.20_chaos.patch
 include/linux/skbuff.h |   30 +++++
 include/net/tcp.h      |    5 
 net/core/skbuff.c      |   25 ++++
 net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++++-
 net/netsyms.c          |    2 
 5 files changed, 311 insertions(+), 3 deletions(-)

--- kernel-2.4.20-6chaos_18_7/include/linux/skbuff.h~tcp_zero_copy_2.4.20_chaos 2003-06-24 11:31:17.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/include/linux/skbuff.h      2003-07-12 15:38:07.000000000 -0600
@@ -116,6 +116,30 @@ struct skb_frag_struct
        __u16 size;
 };
 
+/* Support for callback when skb data has been released */
+typedef struct zccd                            /* Zero Copy Callback Descriptor */
+{                                              /* (embed as first member of custom struct) */
+       atomic_t        zccd_count;             /* reference count */
+       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+       atomic_set (&d->zccd_count, 1);
+       d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)                /* take a reference */
+{
+       atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)                /* release a reference */
+{
+       if (atomic_dec_and_test (&d->zccd_count))
+               (d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +147,12 @@ struct skb_shared_info {
        atomic_t        dataref;
        unsigned int    nr_frags;
        struct sk_buff  *frag_list;
+       zccd_t          *zccd;                  /* zero copy descriptor */
+       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
+       /* NB we expect zero-copy data to be at least 1 packet, so
+        * having 2 zccds means we don't unnecessarily split the packet
+        * where consecutive zero-copy sends abut.
+        */
        skb_frag_t      frags[MAX_SKB_FRAGS];
 };
 
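The zccd_t above is a reference-counted completion cookie: the caller embeds it as the first member of a private struct, and the destructor fires when the last reference is dropped, i.e. once no skb still points at the caller's pages. Below is a minimal sketch of that protocol, not part of the patch; my_tx_t and my_tx_done are hypothetical names, only zccd_init/zccd_get/zccd_put come from the hunk above.

/* Illustrative sketch, not part of the patch */
typedef struct my_tx {
	zccd_t		tx_zccd;	/* MUST be first: callback casts back */
	struct page    *tx_page;	/* payload stays pinned until callback */
} my_tx_t;

static void my_tx_done (zccd_t *zccd)
{
	my_tx_t *tx = (my_tx_t *)zccd;	/* first-member cast */

	__free_page (tx->tx_page);	/* no skb references the page now */
	kfree (tx);
}
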
--- kernel-2.4.20-6chaos_18_7/include/net/tcp.h~tcp_zero_copy_2.4.20_chaos      2003-06-24 11:31:17.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/include/net/tcp.h   2003-07-12 15:38:07.000000000 -0600
@@ -643,6 +643,8 @@ extern int                  tcp_v4_tw_remember_stam
 
 extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                                                 int flags, zccd_t *zccd);
 
 extern int                     tcp_ioctl(struct sock *sk, 
                                          int cmd, 
@@ -737,6 +739,9 @@ extern int                  tcp_recvmsg(struct sock *sk
                                            struct msghdr *msg,
                                            int len, int nonblock, 
                                            int flags, int *addr_len);
+extern int                     tcp_recvpackets(struct sock *sk,
+                                               struct sk_buff_head *packets,
+                                               int len, int nonblock);
 
 extern int                     tcp_listen_start(struct sock *sk);
 
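Note that tcp_sendpage_zccd() (see net/ipv4/tcp.c below) BUG()s unless the route supports scatter/gather and some form of hardware checksumming, so a caller would probe those capability bits first and fall back to an ordinary copying send otherwise. A sketch of such a probe follows; sock_supports_zc is a hypothetical name, the flag test mirrors the check in tcp_sendpage_zccd().

/* Illustrative sketch, not part of the patch */
static inline int sock_supports_zc (struct sock *sk)
{
	return (sk->route_caps & NETIF_F_SG) &&
	       (sk->route_caps & (NETIF_F_IP_CSUM |
				  NETIF_F_NO_CSUM |
				  NETIF_F_HW_CSUM));
}
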
--- kernel-2.4.20-6chaos_18_7/net/netsyms.c~tcp_zero_copy_2.4.20_chaos  2003-05-15 21:15:18.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/net/netsyms.c       2003-07-12 15:38:54.000000000 -0600
@@ -397,6 +397,8 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(tcp_cwnd_application_limited);
 EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 
 EXPORT_SYMBOL(tcp_write_xmit);
--- kernel-2.4.20-6chaos_18_7/net/core/skbuff.c~tcp_zero_copy_2.4.20_chaos      2003-05-15 21:15:21.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/net/core/skbuff.c   2003-07-12 15:38:07.000000000 -0600
@@ -208,6 +208,8 @@ struct sk_buff *alloc_skb(unsigned int s
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
+       skb_shinfo(skb)->zccd2 = NULL;
        return skb;
 
 nodata:
@@ -276,6 +278,10 @@ static void skb_release_data(struct sk_b
 {
        if (!skb->cloned ||
            atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
+               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
                if (skb_shinfo(skb)->nr_frags) {
                        int i;
                        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -532,6 +538,8 @@ int skb_linearize(struct sk_buff *skb, i
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
+       skb_shinfo(skb)->zccd2 = NULL;
 
        /* We are no longer a clone, even if we were. */
        skb->cloned = 0;
@@ -578,6 +586,14 @@ struct sk_buff *pskb_copy(struct sk_buff
        n->data_len = skb->data_len;
        n->len = skb->len;
 
+       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
        if (skb_shinfo(skb)->nr_frags) {
                int i;
 
@@ -620,6 +636,8 @@ int pskb_expand_head(struct sk_buff *skb
        u8 *data;
        int size = nhead + (skb->end - skb->head) + ntail;
        long off;
+       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
+       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
 
        if (skb_shared(skb))
                BUG();
@@ -641,6 +659,11 @@ int pskb_expand_head(struct sk_buff *skb
        if (skb_shinfo(skb)->frag_list)
                skb_clone_fraglist(skb);
 
+       if (zccd != NULL)                       /* user zero copy descriptor? */
+               zccd_get (zccd);                /* extra ref (pages are shared) */
+       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
+               zccd_get (zccd2);               /* extra ref (pages are shared) */
+
        skb_release_data(skb);
 
        off = (data+nhead) - skb->head;
@@ -655,6 +678,8 @@ int pskb_expand_head(struct sk_buff *skb
        skb->nh.raw += off;
        skb->cloned = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
+       skb_shinfo(skb)->zccd = zccd;
+       skb_shinfo(skb)->zccd2 = zccd2;
        return 0;
 
 nodata:
--- kernel-2.4.20-6chaos_18_7/net/ipv4/tcp.c~tcp_zero_copy_2.4.20_chaos 2003-05-15 21:15:21.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/net/ipv4/tcp.c      2003-07-12 15:38:07.000000000 -0600
@@ -747,7 +747,7 @@ do_interrupted:
        goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -826,7 +826,8 @@ static int tcp_error(struct sock *sk, in
        return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int mss_now;
@@ -874,6 +875,17 @@ new_segment:
                        copy = size;
 
                i = skb_shinfo(skb)->nr_frags;
+
+               if (zccd != NULL &&             /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
+                   skb_shinfo(skb)->zccd2 != NULL &&
+                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       tcp_mark_push (tp, skb);
+                       goto new_segment;
+               }
+
                if (can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += copy;
                } else if (i < MAX_SKB_FRAGS) {
@@ -884,6 +896,20 @@ new_segment:
                        goto new_segment;
                }
 
+               if (zccd != NULL &&     /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       zccd_get (zccd);        /* bump ref count */
+
+                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
+                               skb_shinfo(skb)->zccd = zccd;
+                       else
+                               skb_shinfo(skb)->zccd2 = zccd;
+               }
+
                skb->len += copy;
                skb->data_len += copy;
                skb->ip_summed = CHECKSUM_HW;
@@ -947,7 +973,31 @@ ssize_t tcp_sendpage(struct socket *sock
 
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
-       res = do_tcp_sendpages(sk, &page, offset, size, flags);
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                         int flags, zccd_t *zccd)
+{
+       ssize_t res;
+       struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
+           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
+               BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
+
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
       TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return res;
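Putting the send side together: the caller holds the initial reference from zccd_init(), do_tcp_sendpages() takes one reference per skb the data lands in (at most two distinct descriptors per skb, hence zccd and zccd2), and the caller drops its own reference once the send call returns. The destructor then runs when TCP frees the last skb holding the pages, typically once the data has been ACKed. A sketch follows, not part of the patch, reusing the hypothetical my_tx_t/my_tx_done from the skbuff.h note above.

/* Illustrative sketch, not part of the patch */
	zccd_init (&tx->tx_zccd, my_tx_done);	/* refcount = 1 (ours) */

	rc = tcp_sendpage_zccd (sock, tx->tx_page, 0, len, 0, &tx->tx_zccd);

	zccd_put (&tx->tx_zccd);		/* drop our ref; my_tx_done()
						 * runs once every skb holding
						 * the page has been freed */
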
@@ -1771,6 +1821,202 @@ recv_urg:
        goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+                    int len, int nonblock)
+{
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       int copied;
+       long timeo;
+
+       BUG_TRAP (len > 0);
+       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+       lock_sock(sk);
+
+       TCP_CHECK_TIMER(sk);
+
+       copied = -ENOTCONN;
+       if (sk->state == TCP_LISTEN)
+               goto out;
+
+       copied = 0;
+       timeo = sock_rcvtimeo(sk, nonblock);
+
+       do {
+               struct sk_buff * skb;
+               u32 offset;
+               unsigned long used;
+               int exhausted;
+               int eaten;
+
+               /* Are we at urgent data? Stop if we have read anything. */
+               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+                       break;
+
+               /* We need to check signals first, to get correct SIGURG
+                * handling. FIXME: Need to check this doesn't impact 1003.1g
+                * and move it down to the bottom of the loop.
+                */
+               if (signal_pending(current)) {
+                       if (copied)
+                               break;
+                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+                       break;
+               }
+
+               /* Next get a buffer. */
+
+               skb = skb_peek(&sk->receive_queue);
+
+               if (skb == NULL)                /* nothing ready */
+               {
+                       if (copied) {
+                               if (sk->err ||
+                                   sk->state == TCP_CLOSE ||
+                                   (sk->shutdown & RCV_SHUTDOWN) ||
+                                   !timeo ||
+                                   (0))
+                                       break;
+                       } else {
+                               if (sk->done)
+                                       break;
+
+                               if (sk->err) {
+                                       copied = sock_error(sk);
+                                       break;
+                               }
+
+                               if (sk->shutdown & RCV_SHUTDOWN)
+                                       break;
+
+                               if (sk->state == TCP_CLOSE) {
+                                       if (!sk->done) {
+                                               /* This occurs when user tries to read
+                                                * from never connected socket.
+                                                */
+                                               copied = -ENOTCONN;
+                                               break;
+                                       }
+                                       break;
+                               }
+
+                               if (!timeo) {
+                                       copied = -EAGAIN;
+                                       break;
+                               }
+                       }
+
+                       cleanup_rbuf(sk, copied);
+                       timeo = tcp_data_wait(sk, timeo);
+                       continue;
+               }
+
+               BUG_TRAP (atomic_read (&skb->users) == 1);
+
+               exhausted = eaten = 0;
+
+               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+               if (skb->h.th->syn)
+                       offset--;
+
+               used = skb->len - offset;
+
+               if (tp->urg_data) {
+                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
+                       if (urg_offset < used) {
+                               if (!urg_offset) { /* at urgent data */
+                                       if (!sk->urginline) {
+                                               tp->copied_seq++; /* discard the single byte of urgent data */
+                                               offset++;
+                                               used--;
+                                       }
+                               } else          /* truncate read */
+                                       used = urg_offset;
+                       }
+               }
+
+               BUG_TRAP (used >= 0);
+               if (len < used)
+                       used = len;
+
+               if (used == 0)
+                       exhausted = 1;
+               else
+               {
+                       if (skb_is_nonlinear (skb))
+                       {
+                               int   rc = skb_linearize (skb, GFP_KERNEL);
+
+                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+                               if (rc)
+                               {
+                                       if (!copied)
+                                               copied = rc;
+                                       break;
+                               }
+                       }
+
+                       if ((offset + used) == skb->len) /* consuming the whole packet */
+                       {
+                               __skb_unlink (skb, &sk->receive_queue);
+                               dst_release (skb->dst);
+                               skb_orphan (skb);
+                               __skb_pull (skb, offset);
+                               __skb_queue_tail (packets, skb);
+                               exhausted = eaten = 1;
+                       }
+                       else                    /* consuming only part of the packet */
+                       {
+                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+                               if (skb2 == NULL)
+                               {
+                                       if (!copied)
+                                               copied = -ENOMEM;
+                                       break;
+                               }
+
+                               dst_release (skb2->dst);
+                               __skb_pull (skb2, offset);
+                               __skb_trim (skb2, used);
+                               __skb_queue_tail (packets, skb2);
+                       }
+
+                       tp->copied_seq += used;
+                       copied += used;
+                       len -= used;
+               }
+
+               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+                       tp->urg_data = 0;
+                       tcp_fast_path_check(sk, tp);
+               }
+
+               if (!exhausted)
+                       continue;
+
+               if (skb->h.th->fin)
+               {
+                       tp->copied_seq++;
+                       if (!eaten)
+                               tcp_eat_skb (sk, skb);
+                       break;
+               }
+
+               if (!eaten)
+                       tcp_eat_skb (sk, skb);
+
+       } while (len > 0);
+
+ out:
+       /* Clean up data we have read: This will do ACK frames. */
+       cleanup_rbuf(sk, copied);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return copied;
+}
+
 /*
  *     State processing on a close. This implements the state shift for
  *     sending our FIN frame. Note that we only send a FIN for some

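On the receive side, tcp_recvpackets() unlinks whole packets from the socket's receive queue onto the caller's list instead of copying them into a user buffer; the caller then owns the skbs and must free them. A sketch of draining a socket this way, not part of the patch; my_recv and process_payload are hypothetical names.

/* Illustrative sketch, not part of the patch */
static int my_recv (struct socket *sock, int want)
{
	struct sk_buff_head packets;
	struct sk_buff *skb;
	int rc;

	skb_queue_head_init (&packets);

	rc = tcp_recvpackets (sock->sk, &packets, want, 0 /* block */);
	if (rc <= 0)			/* error, or 0 on orderly shutdown */
		return rc;

	while ((skb = skb_dequeue (&packets)) != NULL) {
		/* skb->data/skb->len cover payload only: tcp_recvpackets()
		 * already pulled the consumed offset and trimmed the skb to
		 * the bytes accounted for in its return value */
		process_payload (skb->data, skb->len);
		kfree_skb (skb);
	}
	return rc;			/* bytes moved onto "packets" */
}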