Whamcloud - gitweb
- merge 0.7rc1 from b_devel to HEAD (20030612 merge point)
[fs/lustre-release.git] / lustre / kernel_patches / patches / tcp-zero-copy.patch
1 diff -u -r1.1.1.1 linux/include/linux/skbuff.h
2 --- linux/include/linux/skbuff.h        2 Aug 2002 10:59:25 -0000       1.1.1.1
3 +++ linux/include/linux/skbuff.h        2 Aug 2002 14:20:00 -0000
4 @@ -116,6 +116,30 @@
5         __u16 size;
6  };
7
8 +/* Support for callback when skb data has been released */
9 +typedef struct zccd                            /* Zero Copy Callback Descriptor */
10 +{                                              /* (embed as first member of custom struct) */
11 +       atomic_t        zccd_count;             /* reference count */
12 +       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
13 +} zccd_t;
14 +
15 +static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
16 +{
17 +       atomic_set (&d->zccd_count, 1);
18 +       d->zccd_destructor = callback;
19 +}
20 +
21 +static inline void zccd_get (zccd_t *d)                /* take a reference */
22 +{
23 +       atomic_inc (&d->zccd_count);
24 +}
25 +
26 +static inline void zccd_put (zccd_t *d)                /* release a reference */
27 +{
28 +       if (atomic_dec_and_test (&d->zccd_count))
29 +               (d->zccd_destructor)(d);
30 +}
31 +
32  /* This data is invariant across clones and lives at
33   * the end of the header data, ie. at skb->end.
34   */
35 @@ -123,6 +147,12 @@
36         atomic_t        dataref;
37         unsigned int    nr_frags;
38         struct sk_buff  *frag_list;
39 +       zccd_t          *zccd;                  /* zero copy descriptor */
40 +       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
41 +       /* NB we expect zero-copy data to be at least 1 packet, so
42 +        * having 2 zccds means we don't unnecessarily split the packet
43 +        * where consecutive zero-copy sends abut.
44 +        */
45         skb_frag_t      frags[MAX_SKB_FRAGS];
46  };
47
48 diff -u -r1.1.1.1 linux/include/net/tcp.h
49 --- linux/include/net/tcp.h     2 Aug 2002 10:59:29 -0000       1.1.1.1
50 +++ linux/include/net/tcp.h     2 Aug 2002 14:03:49 -0000
51 @@ -639,6 +639,8 @@
52
53  extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
54  extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
55 +extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
56 +                                                 int flags, zccd_t *zccd);
57
58  extern int                     tcp_ioctl(struct sock *sk,
59                                           int cmd,
60 @@ -732,6 +734,9 @@
61                                             struct msghdr *msg,
62                                             int len, int nonblock,
63                                             int flags, int *addr_len);
64 +extern int                     tcp_recvpackets(struct sock *sk,
65 +                                               struct sk_buff_head *packets,
66 +                                               int len, int nonblock);
67
68  extern int                     tcp_listen_start(struct sock *sk);
69
70 diff -u -r1.1.1.1 linux/net/netsyms.c
71 --- linux/net/netsyms.c 2 Aug 2002 10:59:31 -0000       1.1.1.1
72 +++ linux/net/netsyms.c 2 Aug 2002 14:21:31 -0000
73 @@ -395,6 +395,8 @@
74  EXPORT_SYMBOL(sysctl_tcp_ecn);
75  EXPORT_SYMBOL(tcp_cwnd_application_limited);
76  EXPORT_SYMBOL(tcp_sendpage);
77 +EXPORT_SYMBOL(tcp_sendpage_zccd);
78 +EXPORT_SYMBOL(tcp_recvpackets);
79
80  EXPORT_SYMBOL(tcp_write_xmit);
81
82 diff -u -r1.1.1.1 linux/net/core/skbuff.c
83 --- linux/net/core/skbuff.c     2 Aug 2002 10:59:32 -0000       1.1.1.1
84 +++ linux/net/core/skbuff.c     2 Aug 2002 14:07:13 -0000
85 @@ -208,6 +208,8 @@
86         atomic_set(&(skb_shinfo(skb)->dataref), 1);
87         skb_shinfo(skb)->nr_frags = 0;
88         skb_shinfo(skb)->frag_list = NULL;
89 +       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
90 +       skb_shinfo(skb)->zccd2 = NULL;
91         return skb;
92
93  nodata:
94 @@ -276,6 +278,10 @@
95  {
96         if (!skb->cloned ||
97             atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
98 +               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
99 +                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
100 +               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
101 +                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
102                 if (skb_shinfo(skb)->nr_frags) {
103                         int i;
104                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
105 @@ -532,6 +538,8 @@
106         atomic_set(&(skb_shinfo(skb)->dataref), 1);
107         skb_shinfo(skb)->nr_frags = 0;
108         skb_shinfo(skb)->frag_list = NULL;
109 +       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
110 +       skb_shinfo(skb)->zccd2 = NULL;
111
112         /* We are no longer a clone, even if we were. */
113         skb->cloned = 0;
114 @@ -577,6 +585,14 @@
115
116         n->data_len = skb->data_len;
117         n->len = skb->len;
118 +
119 +       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
120 +               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
121 +       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
122 +
123 +       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
124 +               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
125 +       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
126
127         if (skb_shinfo(skb)->nr_frags) {
128                 int i;
129 @@ -620,6 +636,8 @@
130         u8 *data;
131         int size = nhead + (skb->end - skb->head) + ntail;
132         long off;
133 +       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
134 +       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
135
136         if (skb_shared(skb))
137                 BUG();
138 @@ -641,6 +659,11 @@
139         if (skb_shinfo(skb)->frag_list)
140                 skb_clone_fraglist(skb);
141
142 +       if (zccd != NULL)                       /* user zero copy descriptor? */
143 +               zccd_get (zccd);                /* extra ref (pages are shared) */
144 +       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
145 +               zccd_get (zccd2);               /* extra ref (pages are shared) */
146 +
147         skb_release_data(skb);
148
149         off = (data+nhead) - skb->head;
150 @@ -655,6 +678,8 @@
151         skb->nh.raw += off;
152         skb->cloned = 0;
153         atomic_set(&skb_shinfo(skb)->dataref, 1);
154 +       skb_shinfo(skb)->zccd = zccd;
155 +       skb_shinfo(skb)->zccd2 = zccd2;
156         return 0;
157
158  nodata:
159 diff -u -r1.1.1.1 linux/net/ipv4/tcp.c
160 --- linux/net/ipv4/tcp.c        2 Aug 2002 10:59:34 -0000       1.1.1.1
161 +++ linux/net/ipv4/tcp.c        2 Aug 2002 14:36:30 -0000
162 @@ -745,7 +745,7 @@
163         goto out;
164  }
165
166 -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
167 +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
168
169  static inline int
170  can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
171 @@ -824,7 +824,8 @@
172         return err;
173  }
174
175 -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
176 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
177 +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
178  {
179         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
180         int mss_now;
181 @@ -872,6 +873,17 @@
182                         copy = size;
183
184                 i = skb_shinfo(skb)->nr_frags;
185 +
186 +               if (zccd != NULL &&             /* this is a zcc I/O */
187 +                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
188 +                   skb_shinfo(skb)->zccd2 != NULL &&
189 +                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
190 +                   skb_shinfo(skb)->zccd2 != zccd)
191 +               {
192 +                       tcp_mark_push (tp, skb);
193 +                       goto new_segment;
194 +               }
195 +
196                 if (can_coalesce(skb, i, page, offset)) {
197                         skb_shinfo(skb)->frags[i-1].size += copy;
198                 } else if (i < MAX_SKB_FRAGS) {
199 @@ -881,6 +893,20 @@
200                         tcp_mark_push(tp, skb);
201                         goto new_segment;
202                 }
203 +
204 +               if (zccd != NULL &&     /* this is a zcc I/O */
205 +                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
206 +                   skb_shinfo(skb)->zccd2 != zccd)
207 +               {
208 +                       zccd_get (zccd);        /* bump ref count */
209 +
210 +                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
211 +
212 +                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
213 +                               skb_shinfo(skb)->zccd = zccd;
214 +                       else
215 +                               skb_shinfo(skb)->zccd2 = zccd;
216 +               }
217
218                 skb->len += copy;
219                 skb->data_len += copy;
220 @@ -945,7 +971,31 @@
221
222         lock_sock(sk);
223         TCP_CHECK_TIMER(sk);
224 -       res = do_tcp_sendpages(sk, &page, offset, size, flags);
225 +       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
226 +       TCP_CHECK_TIMER(sk);
227 +       release_sock(sk);
228 +       return res;
229 +}
230 +
231 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
232 +                         int flags, zccd_t *zccd)
233 +{
234 +       ssize_t res;
235 +       struct sock *sk = sock->sk;
236 +
237 +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
238 +
239 +       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
240 +           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
241 +               BUG ();
242 +
243 +#undef TCP_ZC_CSUM_FLAGS
244 +
245 +       lock_sock(sk);
246 +       TCP_CHECK_TIMER(sk);
247 +
248 +       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
249 +
250         TCP_CHECK_TIMER(sk);
251         release_sock(sk);
252         return res;
253 @@ -1767,6 +1817,202 @@
254  recv_urg:
255         err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
256         goto out;
257 +}
258 +
259 +int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
260 +                    int len, int nonblock)
261 +{
262 +       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
263 +       int copied;
264 +       long timeo;
265 +
266 +       BUG_TRAP (len > 0);
267 +       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
268 +
269 +       lock_sock(sk);
270 +
271 +       TCP_CHECK_TIMER(sk);
272 +
273 +       copied = -ENOTCONN;
274 +       if (sk->state == TCP_LISTEN)
275 +               goto out;
276 +
277 +       copied = 0;
278 +       timeo = sock_rcvtimeo(sk, nonblock);
279 +
280 +       do {
281 +               struct sk_buff * skb;
282 +               u32 offset;
283 +               unsigned long used;
284 +               int exhausted;
285 +               int eaten;
286 +
287 +               /* Are we at urgent data? Stop if we have read anything. */
288 +               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
289 +                       break;
290 +
291 +               /* We need to check signals first, to get correct SIGURG
292 +                * handling. FIXME: Need to check this doesnt impact 1003.1g
293 +                * and move it down to the bottom of the loop
294 +                */
295 +               if (signal_pending(current)) {
296 +                       if (copied)
297 +                               break;
298 +                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
299 +                       break;
300 +               }
301 +
302 +               /* Next get a buffer. */
303 +
304 +               skb = skb_peek(&sk->receive_queue);
305 +
306 +               if (skb == NULL)                /* nothing ready */
307 +               {
308 +                       if (copied) {
309 +                               if (sk->err ||
310 +                                   sk->state == TCP_CLOSE ||
311 +                                   (sk->shutdown & RCV_SHUTDOWN) ||
312 +                                   !timeo ||
313 +                                   (0))
314 +                                       break;
315 +                       } else {
316 +                               if (sk->done)
317 +                                       break;
318 +
319 +                               if (sk->err) {
320 +                                       copied = sock_error(sk);
321 +                                       break;
322 +                               }
323 +
324 +                               if (sk->shutdown & RCV_SHUTDOWN)
325 +                                       break;
326 +
327 +                               if (sk->state == TCP_CLOSE) {
328 +                                       if (!sk->done) {
329 +                                               /* This occurs when user tries to read
330 +                                                * from never connected socket.
331 +                                                */
332 +                                               copied = -ENOTCONN;
333 +                                               break;
334 +                                       }
335 +                                       break;
336 +                               }
337 +
338 +                               if (!timeo) {
339 +                                       copied = -EAGAIN;
340 +                                       break;
341 +                               }
342 +                       }
343 +
344 +                       cleanup_rbuf(sk, copied);
345 +                       timeo = tcp_data_wait(sk, timeo);
346 +                       continue;
347 +               }
348 +
349 +               BUG_TRAP (atomic_read (&skb->users) == 1);
350 +
351 +               exhausted = eaten = 0;
352 +
353 +               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
354 +               if (skb->h.th->syn)
355 +                       offset--;
356 +
357 +               used = skb->len - offset;
358 +
359 +               if (tp->urg_data) {
360 +                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
361 +                       if (urg_offset < used) {
362 +                               if (!urg_offset) { /* at urgent date */
363 +                                       if (!sk->urginline) {
364 +                                               tp->copied_seq++; /* discard the single byte of urgent data */
365 +                                               offset++;
366 +                                               used--;
367 +                                       }
368 +                               } else          /* truncate read */
369 +                                       used = urg_offset;
370 +                       }
371 +               }
372 +
373 +               BUG_TRAP (used >= 0);
374 +               if (len < used)
375 +                       used = len;
376 +
377 +               if (used == 0)
378 +                       exhausted = 1;
379 +               else
380 +               {
381 +                       if (skb_is_nonlinear (skb))
382 +                       {
383 +                               int   rc = skb_linearize (skb, GFP_KERNEL);
384 +
385 +                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
386 +
387 +                               if (rc)
388 +                               {
389 +                                       if (!copied)
390 +                                               copied = rc;
391 +                                       break;
392 +                               }
393 +                       }
394 +
395 +                       if ((offset + used) == skb->len) /* consuming the whole packet */
396 +                       {
397 +                               __skb_unlink (skb, &sk->receive_queue);
398 +                               dst_release (skb->dst);
399 +                               skb_orphan (skb);
400 +                               __skb_pull (skb, offset);
401 +                               __skb_queue_tail (packets, skb);
402 +                               exhausted = eaten = 1;
403 +                       }
404 +                       else                    /* consuming only part of the packet */
405 +                       {
406 +                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
407 +
408 +                               if (skb2 == NULL)
409 +                               {
410 +                                       if (!copied)
411 +                                               copied = -ENOMEM;
412 +                                       break;
413 +                               }
414 +
415 +                               dst_release (skb2->dst);
416 +                               __skb_pull (skb2, offset);
417 +                               __skb_trim (skb2, used);
418 +                               __skb_queue_tail (packets, skb2);
419 +                       }
420 +
421 +                       tp->copied_seq += used;
422 +                       copied += used;
423 +                       len -= used;
424 +               }
425 +
426 +               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
427 +                       tp->urg_data = 0;
428 +                       tcp_fast_path_check(sk, tp);
429 +               }
430 +
431 +               if (!exhausted)
432 +                       continue;
433 +
434 +               if (skb->h.th->fin)
435 +               {
436 +                       tp->copied_seq++;
437 +                       if (!eaten)
438 +                               tcp_eat_skb (sk, skb);
439 +                       break;
440 +               }
441 +
442 +               if (!eaten)
443 +                       tcp_eat_skb (sk, skb);
444 +
445 +       } while (len > 0);
446 +
447 + out:
448 +       /* Clean up data we have read: This will do ACK frames. */
449 +       cleanup_rbuf(sk, copied);
450 +       TCP_CHECK_TIMER(sk);
451 +       release_sock(sk);
452 +       return copied;
453  }
454
455  /*