lustre/kernel_patches/patches/tcp-zero-copy-2.6-fc5.patch (fs/lustre-release.git)
Index: linux-2.6.16.i686/net/core/dev.c
===================================================================
--- linux-2.6.16.i686.orig/net/core/dev.c	2006-05-30 15:47:10.000000000 +0800
+++ linux-2.6.16.i686/net/core/dev.c	2006-05-30 21:24:07.000000000 +0800
@@ -1181,6 +1181,9 @@
 	ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
 	ninfo->nr_frags = 0;
 	ninfo->frag_list = NULL;
+	ninfo->zccd = NULL;		/* copied data => no user zero copy descriptor */
+	ninfo->zccd2 = NULL;
+
 
 	/* Offset between the two in bytes */
 	offset = data - skb->head;
Index: linux-2.6.16.i686/net/core/skbuff.c
===================================================================
--- linux-2.6.16.i686.orig/net/core/skbuff.c	2006-05-30 15:47:12.000000000 +0800
+++ linux-2.6.16.i686/net/core/skbuff.c	2006-05-30 21:26:35.000000000 +0800
@@ -170,7 +170,8 @@
 	shinfo->ufo_size = 0;
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;
-
+	shinfo->zccd = NULL;		/* skbuffs kick off with NO user zero copy descriptors */
+	shinfo->zccd2 = NULL;
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -242,7 +243,9 @@
 	shinfo->ufo_size = 0;
 	shinfo->ip6_frag_id = 0;
 	shinfo->frag_list = NULL;
-
+	shinfo->zccd = NULL;		/* skbuffs kick off with NO user zero copy descriptors */
+	shinfo->zccd2 = NULL;
+
 	if (fclone) {
 		struct sk_buff *child = skb + 1;
 		atomic_t *fclone_ref = (atomic_t *) (child + 1);
@@ -287,6 +290,10 @@
 	if (!skb->cloned ||
 	    !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 			       &skb_shinfo(skb)->dataref)) {
+		if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
+			zccd_put (skb_shinfo(skb)->zccd); /* release hold */
+		if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
+			zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
 			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -606,6 +613,14 @@
 	n->data_len  = skb->data_len;
 	n->len       = skb->len;
 
+	if (skb_shinfo(skb)->zccd != NULL)	/* user zero copy descriptor? */
+		zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
+	skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+	if (skb_shinfo(skb)->zccd2 != NULL)	/* 2nd user zero copy descriptor? */
+		zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
+	skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
 	if (skb_shinfo(skb)->nr_frags) {
 		int i;
 
@@ -649,6 +664,9 @@
 	u8 *data;
 	int size = nhead + (skb->end - skb->head) + ntail;
 	long off;
+	zccd_t *zccd = skb_shinfo(skb)->zccd;	/* stash user zero copy descriptor */
+	zccd_t *zccd2 = skb_shinfo(skb)->zccd2;	/* stash 2nd user zero copy descriptor */
+
 
 	if (skb_shared(skb))
 		BUG();
@@ -670,6 +688,11 @@
 	if (skb_shinfo(skb)->frag_list)
 		skb_clone_fraglist(skb);
 
+	if (zccd != NULL)			/* user zero copy descriptor? */
+		zccd_get (zccd);		/* extra ref (pages are shared) */
+	if (zccd2 != NULL)			/* 2nd user zero copy descriptor? */
+		zccd_get (zccd2);		/* extra ref (pages are shared) */
+
 	skb_release_data(skb);
 
 	off = (data + nhead) - skb->head;
@@ -684,6 +707,8 @@
 	skb->cloned   = 0;
 	skb->nohdr    = 0;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
+	skb_shinfo(skb)->zccd = zccd;
+	skb_shinfo(skb)->zccd2 = zccd2;
 	return 0;
 
 nodata:
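
The three paths patched above (release, copy, expand) only move the reference count; it is the descriptor's destructor that finally tells the owner the pages are free. A minimal sketch of that lifecycle, using only the zccd helpers this patch adds to skbuff.h (my_zccd_done and zccd_refcount_demo are illustrative names, not part of the patch):

#include <linux/kernel.h>
#include <linux/skbuff.h>	/* zccd_t, zccd_init/get/put added by this patch */

static void my_zccd_done(zccd_t *d)	/* hypothetical completion callback */
{
	printk(KERN_DEBUG "zero-copy pages released\n");
}

static void zccd_refcount_demo(void)
{
	zccd_t d;

	zccd_init(&d, my_zccd_done);	/* owner's initial reference: count == 1 */
	zccd_get(&d);			/* e.g. skb_copy() taking a hold of its own */
	zccd_put(&d);			/* that copy is freed: count back to 1 */
	zccd_put(&d);			/* owner drops its reference: my_zccd_done() fires */
}
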
Index: linux-2.6.16.i686/net/ipv4/tcp.c
===================================================================
--- linux-2.6.16.i686.orig/net/ipv4/tcp.c	2006-05-30 15:47:12.000000000 +0800
+++ linux-2.6.16.i686/net/ipv4/tcp.c	2006-05-30 21:24:07.000000000 +0800
@@ -498,8 +498,10 @@
 	}
 }
 
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
-			 size_t psize, int flags)
+				size_t psize, int flags, zccd_t *zccd)
+
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int mss_now, size_goal;
@@ -547,6 +549,17 @@
 			copy = size;
 
 		i = skb_shinfo(skb)->nr_frags;
+
+		if (zccd != NULL &&			/* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd != NULL &&	/* skb is part of a zcc I/O */
+		    skb_shinfo(skb)->zccd2 != NULL &&
+		    skb_shinfo(skb)->zccd != zccd &&	/* not the same one */
+		    skb_shinfo(skb)->zccd2 != zccd)
+		{
+			tcp_mark_push (tp, skb);
+			goto new_segment;
+		}
+
 		can_coalesce = skb_can_coalesce(skb, i, page, offset);
 		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
 			tcp_mark_push(tp, skb);
@@ -562,6 +575,20 @@
 			skb_fill_page_desc(skb, i, page, offset, copy);
 		}
 
+		if (zccd != NULL &&			/* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd != zccd &&	/* not already referencing this zccd */
+		    skb_shinfo(skb)->zccd2 != zccd)
+		{
+			zccd_get (zccd);		/* bump ref count */
+
+			BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+			if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
+				skb_shinfo(skb)->zccd = zccd;
+			else
+				skb_shinfo(skb)->zccd2 = zccd;
+		}
+
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->truesize += copy;
@@ -631,12 +658,37 @@
 
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
-	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+			  int flags, zccd_t *zccd)
+{
+	ssize_t res;
+	struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+	if (!(sk->sk_route_caps & NETIF_F_SG) ||	/* caller shouldn't waste her time */
+	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))	/* on double mapping */
+		BUG ();
+
+#undef TCP_ZC_CSUM_FLAGS
+
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return res;
+}
+
 
 #define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
 #define TCP_OFF(sk)	(sk->sk_sndmsg_off)
@@ -1406,6 +1458,202 @@
 	goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+		     int len, int nonblock)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied;
+	long timeo;
+
+	BUG_TRAP (len > 0);
+	/*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+	lock_sock(sk);
+
+	TCP_CHECK_TIMER(sk);
+
+	copied = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	copied = 0;
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	do {
+		struct sk_buff * skb;
+		u32 offset;
+		unsigned long used;
+		int exhausted;
+		int eaten;
+
+		/* Are we at urgent data? Stop if we have read anything. */
+		if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+			break;
+
+		/* We need to check signals first, to get correct SIGURG
+		 * handling. FIXME: Need to check this doesn't impact 1003.1g
+		 * and move it down to the bottom of the loop
+		 */
+		if (signal_pending(current)) {
+			if (copied)
+				break;
+			copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+			break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+
+		if (skb == NULL)		/* nothing ready */
+		{
+			if (copied) {
+				if (sk->sk_err ||
+				    sk->sk_state == TCP_CLOSE ||
+				    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+				    !timeo ||
+				    (0))
+					break;
+			} else {
+				if (sock_flag(sk, SOCK_DONE))
+					break;
+
+				if (sk->sk_err) {
+					copied = sock_error(sk);
+					break;
+				}
+
+				if (sk->sk_shutdown & RCV_SHUTDOWN)
+					break;
+
+				if (sk->sk_state == TCP_CLOSE) {
+					if (!(sock_flag(sk, SOCK_DONE))) {
+						/* This occurs when user tries to read
+						 * from never connected socket.
+						 */
+						copied = -ENOTCONN;
+						break;
+					}
+					break;
+				}
+
+				if (!timeo) {
+					copied = -EAGAIN;
+					break;
+				}
+			}
+
+			cleanup_rbuf(sk, copied);
+			sk_wait_data(sk, &timeo);
+			continue;
+		}
+
+		BUG_TRAP (atomic_read (&skb->users) == 1);
+
+		exhausted = eaten = 0;
+
+		offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+		if (skb->h.th->syn)
+			offset--;
+
+		used = skb->len - offset;
+
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - tp->copied_seq;
+			if (urg_offset < used) {
+				if (!urg_offset) { /* at urgent data */
+					if (!(sock_flag(sk, SOCK_URGINLINE))) {
+						tp->copied_seq++; /* discard the single byte of urgent data */
+						offset++;
+						used--;
+					}
+				} else		/* truncate read */
+					used = urg_offset;
+			}
+		}
+
+		BUG_TRAP (used >= 0);
+		if (len < used)
+			used = len;
+
+		if (used == 0)
+			exhausted = 1;
+		else
+		{
+			if (skb_is_nonlinear (skb))
+			{
+				int   rc = skb_linearize (skb, GFP_KERNEL);
+
+				printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+				if (rc)
+				{
+					if (!copied)
+						copied = rc;
+					break;
+				}
+			}
+
+			if ((offset + used) == skb->len) /* consuming the whole packet */
+			{
+				__skb_unlink (skb, &sk->sk_receive_queue);
+				dst_release (skb->dst);
+				skb_orphan (skb);
+				__skb_pull (skb, offset);
+				__skb_queue_tail (packets, skb);
+				exhausted = eaten = 1;
+			}
+			else			/* consuming only part of the packet */
+			{
+				struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+				if (skb2 == NULL)
+				{
+					if (!copied)
+						copied = -ENOMEM;
+					break;
+				}
+
+				dst_release (skb2->dst);
+				__skb_pull (skb2, offset);
+				__skb_trim (skb2, used);
+				__skb_queue_tail (packets, skb2);
+			}
+
+			tp->copied_seq += used;
+			copied += used;
+			len -= used;
+		}
+
+		if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+
+		if (!exhausted)
+			continue;
+
+		if (skb->h.th->fin)
+		{
+			tp->copied_seq++;
+			if (!eaten)
+				sk_eat_skb (sk, skb);
+			break;
+		}
+
+		if (!eaten)
+			sk_eat_skb (sk, skb);
+
+	} while (len > 0);
+
+ out:
+	/* Clean up data we have read: This will do ACK frames. */
+	cleanup_rbuf(sk, copied);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return copied;
+}
+
 /*
  *	State processing on a close. This implements the state shift for
  *	sending our FIN frame. Note that we only send a FIN for some
@@ -2139,6 +2387,8 @@
 EXPORT_SYMBOL(tcp_recvmsg);
 EXPORT_SYMBOL(tcp_sendmsg);
 EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(tcp_setsockopt);
 EXPORT_SYMBOL(tcp_shutdown);
 EXPORT_SYMBOL(tcp_statistics);
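
tcp_sendpage_zccd() returns as soon as the data is queued; the caller must keep the pages intact until its descriptor's destructor runs. A hedged sketch of a sender built on that contract (zc_send_desc, zc_send_done and zc_send_page are hypothetical, and the descriptor embeds zccd_t as its first member so the callback can cast back):

#include <linux/completion.h>
#include <linux/skbuff.h>
#include <net/tcp.h>

struct zc_send_desc {
	zccd_t			zccd;	/* must be first: the callback casts zccd_t * back */
	struct completion	done;
};

static void zc_send_done(zccd_t *d)
{
	struct zc_send_desc *desc = (struct zc_send_desc *)d;

	complete(&desc->done);		/* no skb references the pages any more */
}

static int zc_send_page(struct socket *sock, struct page *page, int len)
{
	struct zc_send_desc desc;
	ssize_t rc;

	zccd_init(&desc.zccd, zc_send_done);	/* our reference: count == 1 */
	init_completion(&desc.done);

	rc = tcp_sendpage_zccd(sock, page, 0, len, 0, &desc.zccd);

	zccd_put(&desc.zccd);			/* drop our reference ... */
	wait_for_completion(&desc.done);	/* ... and wait out the TCP stack's */

	return rc < 0 ? (int)rc : 0;
}
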
Index: linux-2.6.16.i686/include/linux/skbuff.h
===================================================================
--- linux-2.6.16.i686.orig/include/linux/skbuff.h	2006-05-30 15:47:11.000000000 +0800
+++ linux-2.6.16.i686/include/linux/skbuff.h	2006-05-30 21:24:07.000000000 +0800
@@ -128,6 +128,30 @@
 	__u16 size;
 };
 
+/* Support for callback when skb data has been released */
+typedef struct zccd				/* Zero Copy Callback Descriptor */
+{						/* (embed as first member of custom struct) */
+	atomic_t	zccd_count;		/* reference count */
+	void	       (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+	atomic_set (&d->zccd_count, 1);
+	d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)		/* take a reference */
+{
+	atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)		/* release a reference */
+{
+	if (atomic_dec_and_test (&d->zccd_count))
+		(d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -139,6 +163,13 @@
 	unsigned short	ufo_size;
 	unsigned int	ip6_frag_id;
 	struct sk_buff	*frag_list;
+	zccd_t		*zccd;			/* zero copy descriptor */
+	zccd_t		*zccd2;			/* 2nd zero copy descriptor */
+	/* NB we expect zero-copy data to be at least 1 packet, so
+	 * having 2 zccds means we don't unnecessarily split the packet
+	 * where consecutive zero-copy sends abut.
+	 */
+
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
 
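The two slots above give do_tcp_sendpages() a simple attach rule: an skb may reference at most two distinct descriptors, so two abutting zero-copy sends can share the tail skb, while a third distinct descriptor forces a new segment. Restated as a sketch (skb_attach_zccd is a hypothetical helper, not part of the patch):

#include <linux/errno.h>
#include <linux/skbuff.h>

static int skb_attach_zccd(struct sk_buff *skb, zccd_t *zccd)
{
	if (skb_shinfo(skb)->zccd == zccd || skb_shinfo(skb)->zccd2 == zccd)
		return 0;		/* this skb already holds a reference */

	if (skb_shinfo(skb)->zccd != NULL && skb_shinfo(skb)->zccd2 != NULL)
		return -EBUSY;		/* both slots taken: caller must start a new skb */

	zccd_get(zccd);			/* new hold for this skb's frags */
	if (skb_shinfo(skb)->zccd == NULL)
		skb_shinfo(skb)->zccd = zccd;
	else
		skb_shinfo(skb)->zccd2 = zccd;
	return 0;
}
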
Index: linux-2.6.16.i686/include/net/tcp.h
===================================================================
--- linux-2.6.16.i686.orig/include/net/tcp.h	2006-05-30 15:47:11.000000000 +0800
+++ linux-2.6.16.i686/include/net/tcp.h	2006-05-30 21:24:07.000000000 +0800
@@ -272,6 +272,9 @@
 extern int			tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
 					    struct msghdr *msg, size_t size);
 extern ssize_t			tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t			tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+						  int flags, zccd_t *zccd);
+
 
 extern int			tcp_ioctl(struct sock *sk, 
 					  int cmd, 
@@ -354,6 +357,9 @@
 					    struct msghdr *msg,
 					    size_t len, int nonblock, 
 					    int flags, int *addr_len);
+extern int			tcp_recvpackets(struct sock *sk,
+						struct sk_buff_head *packets,
+						int len, int nonblock);
 
 extern void			tcp_parse_options(struct sk_buff *skb,
 						  struct tcp_options_received *opt_rx,
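
On the receive side, tcp_recvpackets() hands back whole sk_buffs on a caller-supplied queue instead of copying payload into a user buffer. A hedged consumer sketch (zc_recv and consume_payload are illustrative names, not part of the patch):

#include <linux/skbuff.h>
#include <net/tcp.h>

static int zc_recv(struct sock *sk, int want)
{
	struct sk_buff_head packets;
	struct sk_buff *skb;
	int rc;

	skb_queue_head_init(&packets);

	rc = tcp_recvpackets(sk, &packets, want, 0 /* block */);
	if (rc <= 0)
		return rc;	/* error, or orderly shutdown with nothing read */

	while ((skb = __skb_dequeue(&packets)) != NULL) {
		/* headers are already pulled and the tail trimmed, so
		 * skb->data..skb->data + skb->len is pure payload */
		consume_payload(skb->data, skb->len);	/* hypothetical consumer */
		kfree_skb(skb);
	}
	return rc;		/* bytes queued to 'packets' */
}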