lustre/kernel_patches/patches/tcp-zero-copy-2.4.19-pre1.patch
Index: linux-2.4.19-pre1/include/linux/skbuff.h
===================================================================
--- linux-2.4.19-pre1.orig/include/linux/skbuff.h       2001-11-22 22:46:26.000000000 +0300
+++ linux-2.4.19-pre1/include/linux/skbuff.h    2004-01-14 01:15:13.000000000 +0300
@@ -116,6 +116,30 @@
        __u16 size;
 };
 
+/* Support for callback when skb data has been released */
+typedef struct zccd                            /* Zero Copy Callback Descriptor */
+{                                              /* (embed as first member of custom struct) */
+       atomic_t        zccd_count;             /* reference count */
+       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+       atomic_set (&d->zccd_count, 1);
+       d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)                /* take a reference */
+{
+       atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)                /* release a reference */
+{
+       if (atomic_dec_and_test (&d->zccd_count))
+               (d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +147,12 @@
        atomic_t        dataref;
        unsigned int    nr_frags;
        struct sk_buff  *frag_list;
+       zccd_t          *zccd;                  /* zero copy descriptor */
+       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
+       /* NB we expect zero-copy data to be at least 1 packet, so
+        * having 2 zccds means we don't unnecessarily split the packet
+        * where consecutive zero-copy sends abut.
+        */
        skb_frag_t      frags[MAX_SKB_FRAGS];
 };
 
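Usage note (not part of the patch): the descriptor is designed to be embedded
as the first member of a caller-defined structure, so the destructor can
recover its container once the network stack drops the last reference. A
minimal sketch of that pattern, with hypothetical names (zcc_completion,
zcc_done) that this patch does not define:

	/* illustrative only -- names are not part of this patch */
	struct zcc_completion {
		zccd_t           zccd;   /* must be first: lets the callback cast back */
		struct semaphore done;   /* signalled when the stack lets go of the data */
	};

	static void zcc_done (zccd_t *d)
	{
		/* safe because zccd is the first member of the container */
		struct zcc_completion *c = (struct zcc_completion *)d;

		up (&c->done);
	}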
Index: linux-2.4.19-pre1/include/net/tcp.h
===================================================================
--- linux-2.4.19-pre1.orig/include/net/tcp.h    2001-11-22 22:47:22.000000000 +0300
+++ linux-2.4.19-pre1/include/net/tcp.h 2004-01-14 01:15:13.000000000 +0300
@@ -640,6 +640,8 @@
 
 extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                                                 int flags, zccd_t *zccd);
 
 extern int                     tcp_ioctl(struct sock *sk, 
                                          int cmd, 
@@ -733,6 +735,9 @@
                                            struct msghdr *msg,
                                            int len, int nonblock, 
                                            int flags, int *addr_len);
+extern int                     tcp_recvpackets(struct sock *sk,
+                                               struct sk_buff_head *packets,
+                                               int len, int nonblock);
 
 extern int                     tcp_listen_start(struct sock *sk);
 
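Usage note (not part of the patch): a hedged sketch of the sender side,
reusing the hypothetical zcc_completion above. The caller must keep the page
and descriptor alive until the callback reports that every skb referencing
the page has been freed:

	/* illustrative only; error handling abbreviated */
	static int send_page_zerocopy (struct socket *sock, struct page *page,
				       int offset, size_t size)
	{
		struct zcc_completion c;
		ssize_t rc;

		zccd_init (&c.zccd, zcc_done);  /* refcount starts at 1 */
		sema_init (&c.done, 0);

		rc = tcp_sendpage_zccd (sock, page, offset, size, 0, &c.zccd);

		zccd_put (&c.zccd);             /* drop our initial reference */
		down (&c.done);                 /* wait for the destructor to fire */

		return (rc < 0) ? (int)rc : 0;
	}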
Index: linux-2.4.19-pre1/net/netsyms.c
===================================================================
--- linux-2.4.19-pre1.orig/net/netsyms.c        2004-01-14 01:10:37.000000000 +0300
+++ linux-2.4.19-pre1/net/netsyms.c     2004-01-14 01:15:54.000000000 +0300
@@ -409,6 +409,9 @@
 
 #endif
 
+EXPORT_SYMBOL(tcp_sendpage_zccd);
+EXPORT_SYMBOL(tcp_recvpackets);
+
 EXPORT_SYMBOL(netlink_set_err);
 EXPORT_SYMBOL(netlink_broadcast);
 EXPORT_SYMBOL(netlink_unicast);
Index: linux-2.4.19-pre1/net/core/skbuff.c
===================================================================
--- linux-2.4.19-pre1.orig/net/core/skbuff.c    2001-12-21 20:42:05.000000000 +0300
+++ linux-2.4.19-pre1/net/core/skbuff.c 2004-01-14 01:15:13.000000000 +0300
@@ -208,6 +208,8 @@
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
+       skb_shinfo(skb)->zccd2 = NULL;
        return skb;
 
 nodata:
@@ -276,6 +278,10 @@
 {
        if (!skb->cloned ||
            atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
+               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
+                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
                if (skb_shinfo(skb)->nr_frags) {
                        int i;
                        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -532,6 +538,8 @@
        atomic_set(&(skb_shinfo(skb)->dataref), 1);
        skb_shinfo(skb)->nr_frags = 0;
        skb_shinfo(skb)->frag_list = NULL;
+       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
+       skb_shinfo(skb)->zccd2 = NULL;
 
        /* We are no longer a clone, even if we were. */
        skb->cloned = 0;
@@ -578,6 +586,14 @@
        n->data_len = skb->data_len;
        n->len = skb->len;
 
+       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
+               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
+       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
        if (skb_shinfo(skb)->nr_frags) {
                int i;
 
@@ -620,6 +636,8 @@
        u8 *data;
        int size = nhead + (skb->end - skb->head) + ntail;
        long off;
+       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
+       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
 
        if (skb_shared(skb))
                BUG();
@@ -641,6 +659,11 @@
        if (skb_shinfo(skb)->frag_list)
                skb_clone_fraglist(skb);
 
+       if (zccd != NULL)                       /* user zero copy descriptor? */
+               zccd_get (zccd);                /* extra ref (pages are shared) */
+       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
+               zccd_get (zccd2);               /* extra ref (pages are shared) */
+
        skb_release_data(skb);
 
        off = (data+nhead) - skb->head;
@@ -655,6 +678,8 @@
        skb->nh.raw += off;
        skb->cloned = 0;
        atomic_set(&skb_shinfo(skb)->dataref, 1);
+       skb_shinfo(skb)->zccd = zccd;
+       skb_shinfo(skb)->zccd2 = zccd2;
        return 0;
 
 nodata:
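Usage note (not part of the patch): the net effect of the skbuff.c hunks is
that the descriptor follows the shared page data, not the skb header, so the
callback fires exactly once, after the last sharer of the data is freed. A
hypothetical illustration of the lifecycle:

	/* skb carries a zccd; its shared data area holds one reference */
	struct sk_buff *clone = skb_clone (skb, GFP_ATOMIC); /* dataref++, same hold */
	struct sk_buff *pcopy = pskb_copy (skb, GFP_ATOMIC); /* pages shared: zccd_get() */

	kfree_skb (skb);    /* dataref 2 -> 1: descriptor untouched */
	kfree_skb (clone);  /* dataref 1 -> 0: zccd_put() on the data area's hold */
	kfree_skb (pcopy);  /* zccd_put() on the header copy's own hold */
	/* the destructor runs once the sender's initial reference is dropped too */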
Index: linux-2.4.19-pre1/net/ipv4/tcp.c
===================================================================
--- linux-2.4.19-pre1.orig/net/ipv4/tcp.c       2001-12-21 20:42:05.000000000 +0300
+++ linux-2.4.19-pre1/net/ipv4/tcp.c    2004-01-14 01:15:13.000000000 +0300
@@ -744,7 +744,7 @@
        goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -823,7 +823,8 @@
        return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 {
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        int mss_now;
@@ -871,6 +872,17 @@
                        copy = size;
 
                i = skb_shinfo(skb)->nr_frags;
+
+               if (zccd != NULL &&             /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
+                   skb_shinfo(skb)->zccd2 != NULL &&
+                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       tcp_mark_push (tp, skb);
+                       goto new_segment;
+               }
+
                if (can_coalesce(skb, i, page, offset)) {
                        skb_shinfo(skb)->frags[i-1].size += copy;
                } else if (i < MAX_SKB_FRAGS) {
@@ -881,6 +893,20 @@
                        goto new_segment;
                }
 
+               if (zccd != NULL &&     /* this is a zcc I/O */
+                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
+                   skb_shinfo(skb)->zccd2 != zccd)
+               {
+                       zccd_get (zccd);        /* bump ref count */
+
+                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
+                               skb_shinfo(skb)->zccd = zccd;
+                       else
+                               skb_shinfo(skb)->zccd2 = zccd;
+               }
+
                skb->len += copy;
                skb->data_len += copy;
                skb->ip_summed = CHECKSUM_HW;
@@ -944,7 +970,31 @@
 
        lock_sock(sk);
        TCP_CHECK_TIMER(sk);
-       res = do_tcp_sendpages(sk, &page, offset, size, flags);
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+                         int flags, zccd_t *zccd)
+{
+       ssize_t res;
+       struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
+           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
+               BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
+
+       lock_sock(sk);
+       TCP_CHECK_TIMER(sk);
+
+       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
        TCP_CHECK_TIMER(sk);
        release_sock(sk);
        return res;
@@ -1683,6 +1733,202 @@
        goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+                    int len, int nonblock)
+{
+       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+       int copied;
+       long timeo;
+
+       BUG_TRAP (len > 0);
+       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+       lock_sock(sk);
+
+       TCP_CHECK_TIMER(sk);
+
+       copied = -ENOTCONN;
+       if (sk->state == TCP_LISTEN)
+               goto out;
+
+       copied = 0;
+       timeo = sock_rcvtimeo(sk, nonblock);
+
+       do {
+               struct sk_buff * skb;
+               u32 offset;
+               unsigned long used;
+               int exhausted;
+               int eaten;
+
+               /* Are we at urgent data? Stop if we have read anything. */
+               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+                       break;
+
+               /* We need to check signals first, to get correct SIGURG
+                * handling. FIXME: Need to check this doesn't impact 1003.1g
+                * and move it down to the bottom of the loop
+                */
+               if (signal_pending(current)) {
+                       if (copied)
+                               break;
+                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+                       break;
+               }
+
+               /* Next get a buffer. */
+
+               skb = skb_peek(&sk->receive_queue);
+
+               if (skb == NULL)                /* nothing ready */
+               {
+                       if (copied) {
+                               if (sk->err ||
+                                   sk->state == TCP_CLOSE ||
+                                   (sk->shutdown & RCV_SHUTDOWN) ||
+                                   !timeo ||
+                                   (0))
+                                       break;
+                       } else {
+                               if (sk->done)
+                                       break;
+
+                               if (sk->err) {
+                                       copied = sock_error(sk);
+                                       break;
+                               }
+
+                               if (sk->shutdown & RCV_SHUTDOWN)
+                                       break;
+
+                               if (sk->state == TCP_CLOSE) {
+                                       if (!sk->done) {
+                                               /* This occurs when user tries to read
+                                                * from never connected socket.
+                                                */
+                                               copied = -ENOTCONN;
+                                               break;
+                                       }
+                                       break;
+                               }
+
+                               if (!timeo) {
+                                       copied = -EAGAIN;
+                                       break;
+                               }
+                       }
+
+                       cleanup_rbuf(sk, copied);
+                       timeo = tcp_data_wait(sk, timeo);
+                       continue;
+               }
+
+               BUG_TRAP (atomic_read (&skb->users) == 1);
+
+               exhausted = eaten = 0;
+
+               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+               if (skb->h.th->syn)
+                       offset--;
+
+               used = skb->len - offset;
+
+               if (tp->urg_data) {
+                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
+                       if (urg_offset < used) {
+                               if (!urg_offset) { /* at urgent data */
+                                       if (!sk->urginline) {
+                                               tp->copied_seq++; /* discard the single byte of urgent data */
+                                               offset++;
+                                               used--;
+                                       }
+                               } else          /* truncate read */
+                                       used = urg_offset;
+                       }
+               }
+
+               BUG_TRAP (used >= 0);
+               if (len < used)
+                       used = len;
+
+               if (used == 0)
+                       exhausted = 1;
+               else
+               {
+                       if (skb_is_nonlinear (skb))
+                       {
+                               int   rc = skb_linearize (skb, GFP_KERNEL);
+
+                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+                               if (rc)
+                               {
+                                       if (!copied)
+                                               copied = rc;
+                                       break;
+                               }
+                       }
+
+                       if ((offset + used) == skb->len) /* consuming the whole packet */
+                       {
+                               __skb_unlink (skb, &sk->receive_queue);
+                               dst_release (skb->dst);
+                               skb_orphan (skb);
+                               __skb_pull (skb, offset);
+                               __skb_queue_tail (packets, skb);
+                               exhausted = eaten = 1;
+                       }
+                       else                    /* consuming only part of the packet */
+                       {
+                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+                               if (skb2 == NULL)
+                               {
+                                       if (!copied)
+                                               copied = -ENOMEM;
+                                       break;
+                               }
+
+                               dst_release (skb2->dst);
+                               __skb_pull (skb2, offset);
+                               __skb_trim (skb2, used);
+                               __skb_queue_tail (packets, skb2);
+                       }
+
+                       tp->copied_seq += used;
+                       copied += used;
+                       len -= used;
+               }
+
+               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+                       tp->urg_data = 0;
+                       tcp_fast_path_check(sk, tp);
+               }
+
+               if (!exhausted)
+                       continue;
+
+               if (skb->h.th->fin)
+               {
+                       tp->copied_seq++;
+                       if (!eaten)
+                               tcp_eat_skb (sk, skb);
+                       break;
+               }
+
+               if (!eaten)
+                       tcp_eat_skb (sk, skb);
+
+       } while (len > 0);
+
+ out:
+       /* Clean up data we have read: This will do ACK frames. */
+       cleanup_rbuf(sk, copied);
+       TCP_CHECK_TIMER(sk);
+       release_sock(sk);
+       return copied;
+}
+
 /*
  *     State processing on a close. This implements the state shift for
  *     sending our FIN frame. Note that we only send a FIN for some
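Usage note (not part of the patch): a hedged sketch of a tcp_recvpackets
consumer. The function queues whole, or cloned-and-trimmed, skbs covering up
to len bytes of the stream onto the caller-supplied list and returns the byte
count (or a negative errno); the caller walks the list and frees each skb:

	/* illustrative only; error handling abbreviated */
	static int recv_packets_demo (struct socket *sock, int len)
	{
		struct sk_buff_head packets;
		struct sk_buff *skb;
		int rc;

		skb_queue_head_init (&packets);

		rc = tcp_recvpackets (sock->sk, &packets, len, 0 /* block */);
		if (rc < 0)
			return rc;      /* error before any data was queued */

		while ((skb = skb_dequeue (&packets)) != NULL) {
			/* skb->data .. skb->data + skb->len is payload */
			kfree_skb (skb);
		}
		return rc;              /* bytes queued on the list */
	}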