Whamcloud - gitweb
b=4336
[fs/lustre-release.git] / lustre / kernel_patches / patches / tcp-zero-copy.patch
1 Index: linux-2.4.20/include/linux/skbuff.h
2 ===================================================================
3 --- linux-2.4.20.orig/include/linux/skbuff.h    2003-05-16 05:28:45.000000000 +0400
4 +++ linux-2.4.20/include/linux/skbuff.h 2003-12-04 20:56:32.000000000 +0300
5 @@ -116,6 +116,30 @@
6         __u16 size;
7  };
8  
9 +/* Support for callback when skb data has been released */
10 +typedef struct zccd                            /* Zero Copy Callback Descriptor */
11 +{                                              /* (embed as first member of custom struct) */
12 +       atomic_t        zccd_count;             /* reference count */
13 +       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
14 +} zccd_t;
15 +
16 +static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
17 +{
18 +       atomic_set (&d->zccd_count, 1);
19 +       d->zccd_destructor = callback;
20 +}
21 +
22 +static inline void zccd_get (zccd_t *d)                /* take a reference */
23 +{
24 +       atomic_inc (&d->zccd_count);
25 +}
26 +
27 +static inline void zccd_put (zccd_t *d)                /* release a reference */
28 +{
29 +       if (atomic_dec_and_test (&d->zccd_count))
30 +               (d->zccd_destructor)(d);
31 +}
32 +
33  /* This data is invariant across clones and lives at
34   * the end of the header data, ie. at skb->end.
35   */
36 @@ -123,6 +147,12 @@
37         atomic_t        dataref;
38         unsigned int    nr_frags;
39         struct sk_buff  *frag_list;
40 +       zccd_t          *zccd;                  /* zero copy descriptor */
41 +       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
42 +       /* NB we expect zero-copy data to be at least 1 packet, so
43 +        * having 2 zccds means we don't unnecessarily split the packet
44 +        * where consecutive zero-copy sends abut.
45 +        */
46         skb_frag_t      frags[MAX_SKB_FRAGS];
47  };
48  
49 Index: linux-2.4.20/include/net/tcp.h
50 ===================================================================
51 --- linux-2.4.20.orig/include/net/tcp.h 2003-05-16 05:29:15.000000000 +0400
52 +++ linux-2.4.20/include/net/tcp.h      2003-12-04 20:56:32.000000000 +0300
53 @@ -638,6 +638,8 @@
54  
55  extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
56  extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
57 +extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
58 +                                                 int flags, zccd_t *zccd);
59  
60  extern int                     tcp_ioctl(struct sock *sk, 
61                                           int cmd, 
62 @@ -731,6 +733,9 @@
63                                             struct msghdr *msg,
64                                             int len, int nonblock, 
65                                             int flags, int *addr_len);
66 +extern int                     tcp_recvpackets(struct sock *sk,
67 +                                               struct sk_buff_head *packets,
68 +                                               int len, int nonblock);
69  
70  extern int                     tcp_listen_start(struct sock *sk);
71  
72 Index: linux-2.4.20/net/netsyms.c
73 ===================================================================
74 --- linux-2.4.20.orig/net/netsyms.c     2003-05-16 05:29:15.000000000 +0400
75 +++ linux-2.4.20/net/netsyms.c  2003-12-04 20:56:44.000000000 +0300
76 @@ -408,6 +408,8 @@
77  
78  #endif
79  
80 +EXPORT_SYMBOL(tcp_sendpage_zccd);
81 +EXPORT_SYMBOL(tcp_recvpackets);
82  EXPORT_SYMBOL(tcp_read_sock);
83  
84  EXPORT_SYMBOL(netlink_set_err);
85 Index: linux-2.4.20/net/core/skbuff.c
86 ===================================================================
87 --- linux-2.4.20.orig/net/core/skbuff.c 2003-05-16 05:28:46.000000000 +0400
88 +++ linux-2.4.20/net/core/skbuff.c      2003-12-04 20:56:32.000000000 +0300
89 @@ -208,6 +208,8 @@
90         atomic_set(&(skb_shinfo(skb)->dataref), 1);
91         skb_shinfo(skb)->nr_frags = 0;
92         skb_shinfo(skb)->frag_list = NULL;
93 +       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
94 +       skb_shinfo(skb)->zccd2 = NULL;
95         return skb;
96  
97  nodata:
98 @@ -276,6 +278,10 @@
99  {
100         if (!skb->cloned ||
101             atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
102 +               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
103 +                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
104 +               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
105 +                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
106                 if (skb_shinfo(skb)->nr_frags) {
107                         int i;
108                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
109 @@ -532,6 +538,8 @@
110         atomic_set(&(skb_shinfo(skb)->dataref), 1);
111         skb_shinfo(skb)->nr_frags = 0;
112         skb_shinfo(skb)->frag_list = NULL;
113 +       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
114 +       skb_shinfo(skb)->zccd2 = NULL;
115  
116         /* We are no longer a clone, even if we were. */
117         skb->cloned = 0;
118 @@ -578,6 +586,14 @@
119         n->data_len = skb->data_len;
120         n->len = skb->len;
121  
122 +       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
123 +               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
124 +       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
125 +
126 +       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
127 +               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
128 +       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
129 +
130         if (skb_shinfo(skb)->nr_frags) {
131                 int i;
132  
133 @@ -620,6 +636,8 @@
134         u8 *data;
135         int size = nhead + (skb->end - skb->head) + ntail;
136         long off;
137 +       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
138 +       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
139  
140         if (skb_shared(skb))
141                 BUG();
142 @@ -641,6 +659,11 @@
143         if (skb_shinfo(skb)->frag_list)
144                 skb_clone_fraglist(skb);
145  
146 +       if (zccd != NULL)                       /* user zero copy descriptor? */
147 +               zccd_get (zccd);                /* extra ref (pages are shared) */
148 +       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
149 +               zccd_get (zccd2);               /* extra ref (pages are shared) */
150 +
151         skb_release_data(skb);
152  
153         off = (data+nhead) - skb->head;
154 @@ -655,6 +678,8 @@
155         skb->nh.raw += off;
156         skb->cloned = 0;
157         atomic_set(&skb_shinfo(skb)->dataref, 1);
158 +       skb_shinfo(skb)->zccd = zccd;
159 +       skb_shinfo(skb)->zccd2 = zccd2;
160         return 0;
161  
162  nodata:
163 Index: linux-2.4.20/net/ipv4/tcp.c
164 ===================================================================
165 --- linux-2.4.20.orig/net/ipv4/tcp.c    2003-05-16 05:29:15.000000000 +0400
166 +++ linux-2.4.20/net/ipv4/tcp.c 2003-12-04 20:56:32.000000000 +0300
167 @@ -745,7 +745,7 @@
168         goto out;
169  }
170  
171 -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
172 +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
173  
174  static inline int
175  can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
176 @@ -824,7 +824,8 @@
177         return err;
178  }
179  
180 -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
181 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
182 +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
183  {
184         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
185         int mss_now;
186 @@ -872,6 +873,17 @@
187                         copy = size;
188  
189                 i = skb_shinfo(skb)->nr_frags;
190 +
191 +               if (zccd != NULL &&             /* this is a zcc I/O */
192 +                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
193 +                   skb_shinfo(skb)->zccd2 != NULL &&
194 +                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
195 +                   skb_shinfo(skb)->zccd2 != zccd)
196 +               {
197 +                       tcp_mark_push (tp, skb);
198 +                       goto new_segment;
199 +               }
200 +
201                 if (can_coalesce(skb, i, page, offset)) {
202                         skb_shinfo(skb)->frags[i-1].size += copy;
203                 } else if (i < MAX_SKB_FRAGS) {
204 @@ -882,6 +894,20 @@
205                         goto new_segment;
206                 }
207  
208 +               if (zccd != NULL &&     /* this is a zcc I/O */
209 +                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
210 +                   skb_shinfo(skb)->zccd2 != zccd)
211 +               {
212 +                       zccd_get (zccd);        /* bump ref count */
213 +
214 +                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
215 +
216 +                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
217 +                               skb_shinfo(skb)->zccd = zccd;
218 +                       else
219 +                               skb_shinfo(skb)->zccd2 = zccd;
220 +               }
221 +
222                 skb->len += copy;
223                 skb->data_len += copy;
224                 skb->ip_summed = CHECKSUM_HW;
225 @@ -945,7 +971,31 @@
226  
227         lock_sock(sk);
228         TCP_CHECK_TIMER(sk);
229 -       res = do_tcp_sendpages(sk, &page, offset, size, flags);
230 +       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
231 +       TCP_CHECK_TIMER(sk);
232 +       release_sock(sk);
233 +       return res;
234 +}
235 +
236 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
237 +                         int flags, zccd_t *zccd)
238 +{
239 +       ssize_t res;
240 +       struct sock *sk = sock->sk;
241 +
242 +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
243 +
244 +       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
245 +           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
246 +               BUG ();
247 +
248 +#undef TCP_ZC_CSUM_FLAGS
249 +
250 +       lock_sock(sk);
251 +       TCP_CHECK_TIMER(sk);
252 +
253 +       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
254 +
255         TCP_CHECK_TIMER(sk);
256         release_sock(sk);
257         return res;
258 @@ -1769,6 +1819,202 @@
259         goto out;
260  }
261  
262 +int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
263 +                    int len, int nonblock)
264 +{
265 +       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
266 +       int copied;
267 +       long timeo;
268 +
269 +       BUG_TRAP (len > 0);
270 +       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
271 +
272 +       lock_sock(sk);
273 +
274 +       TCP_CHECK_TIMER(sk);
275 +
276 +       copied = -ENOTCONN;
277 +       if (sk->state == TCP_LISTEN)
278 +               goto out;
279 +
280 +       copied = 0;
281 +       timeo = sock_rcvtimeo(sk, nonblock);
282 +
283 +       do {
284 +               struct sk_buff * skb;
285 +               u32 offset;
286 +               unsigned long used;
287 +               int exhausted;
288 +               int eaten;
289 +
290 +               /* Are we at urgent data? Stop if we have read anything. */
291 +               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
292 +                       break;
293 +
294 +               /* We need to check signals first, to get correct SIGURG
295 +                * handling. FIXME: Need to check this doesn't impact 1003.1g
296 +                * and move it down to the bottom of the loop
297 +                */
298 +               if (signal_pending(current)) {
299 +                       if (copied)
300 +                               break;
301 +                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
302 +                       break;
303 +               }
304 +
305 +               /* Next get a buffer. */
306 +
307 +               skb = skb_peek(&sk->receive_queue);
308 +
309 +               if (skb == NULL)                /* nothing ready */
310 +               {
311 +                       if (copied) {
312 +                               if (sk->err ||
313 +                                   sk->state == TCP_CLOSE ||
314 +                                   (sk->shutdown & RCV_SHUTDOWN) ||
315 +                                   !timeo ||
316 +                                   (0))
317 +                                       break;
318 +                       } else {
319 +                               if (sk->done)
320 +                                       break;
321 +
322 +                               if (sk->err) {
323 +                                       copied = sock_error(sk);
324 +                                       break;
325 +                               }
326 +
327 +                               if (sk->shutdown & RCV_SHUTDOWN)
328 +                                       break;
329 +
330 +                               if (sk->state == TCP_CLOSE) {
331 +                                       if (!sk->done) {
332 +                                               /* This occurs when user tries to read
333 +                                                * from never connected socket.
334 +                                                */
335 +                                               copied = -ENOTCONN;
336 +                                               break;
337 +                                       }
338 +                                       break;
339 +                               }
340 +
341 +                               if (!timeo) {
342 +                                       copied = -EAGAIN;
343 +                                       break;
344 +                               }
345 +                       }
346 +
347 +                       cleanup_rbuf(sk, copied);
348 +                       timeo = tcp_data_wait(sk, timeo);
349 +                       continue;
350 +               }
351 +
352 +               BUG_TRAP (atomic_read (&skb->users) == 1);
353 +
354 +               exhausted = eaten = 0;
355 +
356 +               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
357 +               if (skb->h.th->syn)
358 +                       offset--;
359 +
360 +               used = skb->len - offset;
361 +
362 +               if (tp->urg_data) {
363 +                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
364 +                       if (urg_offset < used) {
365 +                               if (!urg_offset) { /* at urgent data */
366 +                                       if (!sk->urginline) {
367 +                                               tp->copied_seq++; /* discard the single byte of urgent data */
368 +                                               offset++;
369 +                                               used--;
370 +                                       }
371 +                               } else          /* truncate read */
372 +                                       used = urg_offset;
373 +                       }
374 +               }
375 +
376 +               BUG_TRAP (used >= 0);
377 +               if (len < used)
378 +                       used = len;
379 +
380 +               if (used == 0)
381 +                       exhausted = 1;
382 +               else
383 +               {
384 +                       if (skb_is_nonlinear (skb))
385 +                       {
386 +                               int   rc = skb_linearize (skb, GFP_KERNEL);
387 +
388 +                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
389 +
390 +                               if (rc)
391 +                               {
392 +                                       if (!copied)
393 +                                               copied = rc;
394 +                                       break;
395 +                               }
396 +                       }
397 +
398 +                       if ((offset + used) == skb->len) /* consuming the whole packet */
399 +                       {
400 +                               __skb_unlink (skb, &sk->receive_queue);
401 +                               dst_release (skb->dst);
402 +                               skb_orphan (skb);
403 +                               __skb_pull (skb, offset);
404 +                               __skb_queue_tail (packets, skb);
405 +                               exhausted = eaten = 1;
406 +                       }
407 +                       else                    /* consuming only part of the packet */
408 +                       {
409 +                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
410 +
411 +                               if (skb2 == NULL)
412 +                               {
413 +                                       if (!copied)
414 +                                               copied = -ENOMEM;
415 +                                       break;
416 +                               }
417 +
418 +                               dst_release (skb2->dst);
419 +                               __skb_pull (skb2, offset);
420 +                               __skb_trim (skb2, used);
421 +                               __skb_queue_tail (packets, skb2);
422 +                       }
423 +
424 +                       tp->copied_seq += used;
425 +                       copied += used;
426 +                       len -= used;
427 +               }
428 +
429 +               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
430 +                       tp->urg_data = 0;
431 +                       tcp_fast_path_check(sk, tp);
432 +               }
433 +
434 +               if (!exhausted)
435 +                       continue;
436 +
437 +               if (skb->h.th->fin)
438 +               {
439 +                       tp->copied_seq++;
440 +                       if (!eaten)
441 +                               tcp_eat_skb (sk, skb);
442 +                       break;
443 +               }
444 +
445 +               if (!eaten)
446 +                       tcp_eat_skb (sk, skb);
447 +
448 +       } while (len > 0);
449 +
450 + out:
451 +       /* Clean up data we have read: This will do ACK frames. */
452 +       cleanup_rbuf(sk, copied);
453 +       TCP_CHECK_TIMER(sk);
454 +       release_sock(sk);
455 +       return copied;
456 +}
457 +
458  /*
459   *     State processing on a close. This implements the state shift for
460   *     sending our FIN frame. Note that we only send a FIN for some