b=4336
 include/linux/skbuff.h |   30 +++++
 include/net/tcp.h      |    5 
 net/core/skbuff.c      |   25 ++++
 net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++++-
 net/netsyms.c          |    2 
 5 files changed, 311 insertions(+), 3 deletions(-)

Index: linux-2.4.22-vanilla/include/linux/skbuff.h
===================================================================
--- linux-2.4.22-vanilla.orig/include/linux/skbuff.h    2003-11-03 23:22:13.000000000 +0300
+++ linux-2.4.22-vanilla/include/linux/skbuff.h 2003-12-02 23:56:35.000000000 +0300
@@ -116,6 +116,30 @@
        __u16 size;
 };
 
+/* Support for callback when skb data has been released */
+typedef struct zccd                            /* Zero Copy Callback Descriptor */
+{                                              /* (embed as first member of custom struct) */
+       atomic_t        zccd_count;             /* reference count */
+       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+       atomic_set (&d->zccd_count, 1);
+       d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)                /* take a reference */
+{
+       atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)                /* release a reference */
+{
+       if (atomic_dec_and_test (&d->zccd_count))
+               (d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +147,12 @@
        atomic_t        dataref;
        unsigned int    nr_frags;
        struct sk_buff  *frag_list;
+       zccd_t          *zccd;                  /* zero copy descriptor */
+       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
+       /* NB we expect zero-copy data to be at least 1 packet, so
+        * having 2 zccds means we don't unnecessarily split the packet
+        * where consecutive zero-copy sends abut.
+        */
        skb_frag_t      frags[MAX_SKB_FRAGS];
 };
 
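The descriptor above is meant to be embedded as the first member of a caller-defined structure, so that the destructor can cast back to the container once the last reference drops. A minimal sketch of such a caller, under the assumption of a 2.4 kernel module context (my_page_desc, my_page_done and my_page_desc_alloc are illustrative names, not part of this patch):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/skbuff.h>

struct my_page_desc {                   /* hypothetical caller structure */
	zccd_t          pd_zccd;        /* MUST be first: the callback casts back */
	struct page    *pd_page;        /* page lent to the TCP stack */
};

static void my_page_done (zccd_t *zccd)
{
	/* last reference gone: no skb fragment points at the page any more */
	struct my_page_desc *pd = (struct my_page_desc *)zccd;

	put_page (pd->pd_page);         /* release our hold on the page */
	kfree (pd);
}

static struct my_page_desc *my_page_desc_alloc (struct page *page)
{
	struct my_page_desc *pd = kmalloc (sizeof (*pd), GFP_KERNEL);

	if (pd == NULL)
		return NULL;

	zccd_init (&pd->pd_zccd, my_page_done); /* refcount starts at 1: ours */
	get_page (page);                        /* pin the page while TCP may use it */
	pd->pd_page = page;
	return pd;
}
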
Index: linux-2.4.22-vanilla/include/net/tcp.h
===================================================================
--- linux-2.4.22-vanilla.orig/include/net/tcp.h 2003-11-03 23:22:13.000000000 +0300
+++ linux-2.4.22-vanilla/include/net/tcp.h      2003-12-02 23:58:10.000000000 +0300
@@ -643,6 +643,8 @@
 
 extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                                                 int flags, zccd_t *zccd);
 
 extern int                     tcp_ioctl(struct sock *sk, 
                                          int cmd, 
@@ -737,6 +739,9 @@
                                            struct msghdr *msg,
                                            int len, int nonblock, 
                                            int flags, int *addr_len);
+extern int                     tcp_recvpackets(struct sock *sk,
+                                               struct sk_buff_head *packets,
+                                               int len, int nonblock);
 
 extern int                     tcp_listen_start(struct sock *sk);
 
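Note that tcp_sendpage_zccd() (see the tcp.c change below) BUG()s unless the route supports scatter/gather plus some form of hardware checksumming, so a caller is expected to test sk->route_caps first and fall back to the ordinary copying path. A hedged sketch of such a guard (my_send_page is a hypothetical wrapper):

#include <linux/netdevice.h>
#include <net/tcp.h>

/* Use the zero-copy path only when the route supports scatter/gather plus
 * some hardware checksum; otherwise fall back to the copying tcp_sendpage(). */
static ssize_t my_send_page (struct socket *sock, struct page *page,
                             int offset, size_t size, zccd_t *zccd)
{
	struct sock *sk = sock->sk;

	if ((sk->route_caps & NETIF_F_SG) &&
	    (sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)))
		return tcp_sendpage_zccd (sock, page, offset, size, 0, zccd);

	return tcp_sendpage (sock, page, offset, size, 0);
}
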
Index: linux-2.4.22-vanilla/net/core/skbuff.c
===================================================================
--- linux-2.4.22-vanilla.orig/net/core/skbuff.c 2003-11-03 23:22:13.000000000 +0300
+++ linux-2.4.22-vanilla/net/core/skbuff.c      2003-12-02 23:56:15.000000000 +0300
@@ -208,6 +208,8 @@
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
+       skb_shinfo(skb)->zccd2 = NULL;
        return skb;
 
 nodata:
@@ -277,6 +279,10 @@
 {
        if (!skb->cloned ||
            atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
+               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
                if (skb_shinfo(skb)->nr_frags) {
                        int i;
                        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -535,6 +541,8 @@
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
+       skb_shinfo(skb)->zccd2 = NULL;
 
        /* We are no longer a clone, even if we were. */
        skb->cloned = 0;
@@ -581,6 +589,14 @@
        n->data_len = skb->data_len;
        n->len = skb->len;
 
+       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
        if (skb_shinfo(skb)->nr_frags) {
                int i;
 
@@ -623,6 +639,8 @@
        u8 *data;
        int size = nhead + (skb->end - skb->head) + ntail;
        long off;
+       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
+       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
 
        if (skb_shared(skb))
                BUG();
@@ -644,6 +662,11 @@
        if (skb_shinfo(skb)->frag_list)
                skb_clone_fraglist(skb);
 
+       if (zccd != NULL)                       /* user zero copy descriptor? */
+               zccd_get (zccd);                /* extra ref (pages are shared) */
+       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
+               zccd_get (zccd2);               /* extra ref (pages are shared) */
+
        skb_release_data(skb);
 
        off = (data+nhead) - skb->head;
@@ -658,6 +681,8 @@
        skb->nh.raw += off;
        skb->cloned = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
+       skb_shinfo(skb)->zccd = zccd;
+       skb_shinfo(skb)->zccd2 = zccd2;
        return 0;
 
 nodata:
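Taken together, these hooks maintain one invariant: every skb whose shared info references a zccd holds exactly one reference per slot, taken when the descriptor is attached (or when the data is cloned, copied, or reallocated) and dropped in skb_release_data(). A sketch of the resulting lifetime, assuming the hypothetical my_page_desc from the earlier sketch:

static void my_zccd_lifetime (struct socket *sock, struct my_page_desc *pd)
{
	/* refcount == 1 here: the caller's reference from zccd_init() */
	tcp_sendpage_zccd (sock, pd->pd_page, 0, PAGE_SIZE, 0, &pd->pd_zccd);

	/* do_tcp_sendpages() did zccd_get() for each skb the payload landed
	 * in; skb_copy() and pskb_expand_head() (above) take further
	 * references whenever the data is shared or reallocated */

	zccd_put (&pd->pd_zccd);        /* drop the caller's reference */

	/* my_page_done() fires on the final zccd_put() in
	 * skb_release_data(), i.e. once TCP has freed the last skb that
	 * referenced the page (normally after the peer ACKs the data) */
}
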
Index: linux-2.4.22-vanilla/net/ipv4/tcp.c
===================================================================
--- linux-2.4.22-vanilla.orig/net/ipv4/tcp.c    2003-11-03 23:22:13.000000000 +0300
+++ linux-2.4.22-vanilla/net/ipv4/tcp.c 2003-12-02 23:56:15.000000000 +0300
@@ -747,7 +747,7 @@
        goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -826,7 +826,8 @@
        return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int mss_now;
@@ -874,6 +875,17 @@
                        copy = size;
 
                i = skb_shinfo(skb)->nr_frags;
+
+               if (zccd != NULL &&             /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
+                   skb_shinfo(skb)->zccd2 != NULL &&
+                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       tcp_mark_push (tp, skb);
+                       goto new_segment;
+               }
+
                if (can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += copy;
                } else if (i < MAX_SKB_FRAGS) {
@@ -884,6 +896,20 @@
                        goto new_segment;
                }
 
+               if (zccd != NULL &&     /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       zccd_get (zccd);        /* bump ref count */
+
+                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
+                               skb_shinfo(skb)->zccd = zccd;
+                       else
+                               skb_shinfo(skb)->zccd2 = zccd;
+               }
+
                skb->len += copy;
                skb->data_len += copy;
                skb->ip_summed = CHECKSUM_HW;
@@ -947,7 +973,31 @@
 
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
-       res = do_tcp_sendpages(sk, &page, offset, size, flags);
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                         int flags, zccd_t *zccd)
+{
+       ssize_t res;
+       struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
+           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
+               BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
+
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return res;
@@ -1771,6 +1821,202 @@
        goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+                    int len, int nonblock)
+{
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       int copied;
+       long timeo;
+
+       BUG_TRAP (len > 0);
+       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+       lock_sock(sk);
+
+       TCP_CHECK_TIMER(sk);
+
+       copied = -ENOTCONN;
+       if (sk->state == TCP_LISTEN)
+               goto out;
+
+       copied = 0;
+       timeo = sock_rcvtimeo(sk, nonblock);
+
+       do {
+               struct sk_buff * skb;
+               u32 offset;
+               unsigned long used;
+               int exhausted;
+               int eaten;
+
+               /* Are we at urgent data? Stop if we have read anything. */
+               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+                       break;
+
+               /* We need to check signals first, to get correct SIGURG
+                * handling. FIXME: Need to check this doesn't impact 1003.1g
+                * and move it down to the bottom of the loop
+                */
+               if (signal_pending(current)) {
+                       if (copied)
+                               break;
+                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+                       break;
+               }
+
+               /* Next get a buffer. */
+
+               skb = skb_peek(&sk->receive_queue);
+
+               if (skb == NULL)                /* nothing ready */
+               {
+                       if (copied) {
+                               if (sk->err ||
+                                   sk->state == TCP_CLOSE ||
+                                   (sk->shutdown & RCV_SHUTDOWN) ||
+                                   !timeo ||
+                                   (0))
+                                       break;
+                       } else {
+                               if (sk->done)
+                                       break;
+
+                               if (sk->err) {
+                                       copied = sock_error(sk);
+                                       break;
+                               }
+
+                               if (sk->shutdown & RCV_SHUTDOWN)
+                                       break;
+
+                               if (sk->state == TCP_CLOSE) {
+                                       if (!sk->done) {
+                                               /* This occurs when user tries to read
+                                                * from never connected socket.
+                                                */
+                                               copied = -ENOTCONN;
+                                               break;
+                                       }
+                                       break;
+                               }
+
+                               if (!timeo) {
+                                       copied = -EAGAIN;
+                                       break;
+                               }
+                       }
+
+                       cleanup_rbuf(sk, copied);
+                       timeo = tcp_data_wait(sk, timeo);
+                       continue;
+               }
+
+               BUG_TRAP (atomic_read (&skb->users) == 1);
+
+               exhausted = eaten = 0;
+
+               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+               if (skb->h.th->syn)
+                       offset--;
+
+               used = skb->len - offset;
+
+               if (tp->urg_data) {
+                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
+                       if (urg_offset < used) {
+                               if (!urg_offset) { /* at urgent data */
+                                       if (!sk->urginline) {
+                                               tp->copied_seq++; /* discard the single byte of urgent data */
+                                               offset++;
+                                               used--;
+                                       }
+                               } else          /* truncate read */
+                                       used = urg_offset;
+                       }
+               }
+
+               BUG_TRAP (used >= 0);
+               if (len < used)
+                       used = len;
+
+               if (used == 0)
+                       exhausted = 1;
+               else
+               {
+                       if (skb_is_nonlinear (skb))
+                       {
+                               int   rc = skb_linearize (skb, GFP_KERNEL);
+
+                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+                               if (rc)
+                               {
+                                       if (!copied)
+                                               copied = rc;
+                                       break;
+                               }
+                       }
+
+                       if ((offset + used) == skb->len) /* consuming the whole packet */
+                       {
+                               __skb_unlink (skb, &sk->receive_queue);
+                               dst_release (skb->dst);
+                               skb_orphan (skb);
+                               __skb_pull (skb, offset);
+                               __skb_queue_tail (packets, skb);
+                               exhausted = eaten = 1;
+                       }
+                       else                    /* consuming only part of the packet */
+                       {
+                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+                               if (skb2 == NULL)
+                               {
+                                       if (!copied)
+                                               copied = -ENOMEM;
+                                       break;
+                               }
+
+                               dst_release (skb2->dst);
+                               __skb_pull (skb2, offset);
+                               __skb_trim (skb2, used);
+                               __skb_queue_tail (packets, skb2);
+                       }
+
+                       tp->copied_seq += used;
+                       copied += used;
+                       len -= used;
+               }
+
+               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+                       tp->urg_data = 0;
+                       tcp_fast_path_check(sk, tp);
+               }
+
+               if (!exhausted)
+                       continue;
+
+               if (skb->h.th->fin)
+               {
+                       tp->copied_seq++;
+                       if (!eaten)
+                               tcp_eat_skb (sk, skb);
+                       break;
+               }
+
+               if (!eaten)
+                       tcp_eat_skb (sk, skb);
+
+       } while (len > 0);
+
+ out:
+       /* Clean up data we have read: This will do ACK frames. */
+       cleanup_rbuf(sk, copied);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return copied;
+}
+
 /*
  *     State processing on a close. This implements the state shift for
  *     sending our FIN frame. Note that we only send a FIN for some
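For completeness, a hedged sketch of how a kernel-side consumer might drain a socket with tcp_recvpackets(), receiving payload as whole (or cloned-and-trimmed) skbs instead of copying into an iovec; consume_payload() and my_recv_packets() are illustrative only:

#include <linux/skbuff.h>
#include <net/tcp.h>

extern void consume_payload (void *data, unsigned int len);    /* hypothetical */

static int my_recv_packets (struct socket *sock, int len)
{
	struct sk_buff_head packets;
	struct sk_buff *skb;
	int rc;

	skb_queue_head_init (&packets);

	/* blocking read of up to 'len' bytes of payload, delivered as skbs */
	rc = tcp_recvpackets (sock->sk, &packets, len, 0);

	while ((skb = skb_dequeue (&packets)) != NULL) {
		/* skb->data..+skb->len is payload only: tcp_recvpackets()
		 * queues packets with the consumed offset already pulled */
		consume_payload (skb->data, skb->len);
		kfree_skb (skb);
	}

	return rc;      /* bytes queued, or 0/-ve as for tcp_recvmsg() */
}
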
Index: linux-2.4.22-vanilla/net/netsyms.c
===================================================================
--- linux-2.4.22-vanilla.orig/net/netsyms.c     2003-11-03 23:22:13.000000000 +0300
+++ linux-2.4.22-vanilla/net/netsyms.c  2003-12-04 20:42:50.000000000 +0300
@@ -417,6 +417,8 @@
 
 #endif
 
+EXPORT_SYMBOL(tcp_sendpage_zccd);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(tcp_read_sock);
 
 EXPORT_SYMBOL(netlink_set_err);
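Putting the send side together: a typical zero-copy sender initializes a zccd whose destructor signals completion, drops its own reference after the send, and sleeps until TCP has released every page. A sketch under those assumptions (all my_* names are hypothetical, not part of this patch):

#include <linux/sched.h>
#include <linux/wait.h>
#include <net/tcp.h>

struct my_tx {
	zccd_t                  tx_zccd;        /* MUST be first */
	wait_queue_head_t       tx_waitq;
	volatile int            tx_done;
};

static void my_tx_done (zccd_t *zccd)
{
	struct my_tx *tx = (struct my_tx *)zccd;

	tx->tx_done = 1;                /* may run in softirq context */
	wake_up (&tx->tx_waitq);
}

static ssize_t my_zc_send (struct socket *sock, struct page *page,
			   int offset, size_t size)
{
	struct my_tx tx;
	ssize_t rc;

	zccd_init (&tx.tx_zccd, my_tx_done);    /* refcount == 1: ours */
	init_waitqueue_head (&tx.tx_waitq);
	tx.tx_done = 0;

	rc = tcp_sendpage_zccd (sock, page, offset, size, 0, &tx.tx_zccd);

	zccd_put (&tx.tx_zccd);         /* drop our reference; if TCP took
					 * none, my_tx_done() ran just now */

	/* sleep until the last skb referencing the page has been freed */
	wait_event (tx.tx_waitq, tx.tx_done);
	return rc;
}

Because the refcount starts at 1 in zccd_init(), the callback cannot fire before the caller's own zccd_put(), regardless of how quickly the stack frees the skbs; this is why the wait-after-put pattern above is safe even when the send fails without attaching the descriptor to any skb.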