Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / lustre / kernel_patches / patches / tcp-zero-copy-2.6-sles10.patch
1 Index: linux-2.6.16.21-0.8/net/core/skbuff.c
2 ===================================================================
3 --- linux-2.6.16.21-0.8.orig/net/core/skbuff.c  2006-08-03 21:11:25.000000000 -0600
4 +++ linux-2.6.16.21-0.8/net/core/skbuff.c       2006-08-03 21:11:30.000000000 -0600
5 @@ -170,7 +170,8 @@
6         shinfo->ufo_size = 0;
7         shinfo->ip6_frag_id = 0;
8         shinfo->frag_list = NULL;
9 -
10 +       shinfo->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
11 +       shinfo->zccd2 = NULL;
12         if (fclone) {
13                 struct sk_buff *child = skb + 1;
14                 atomic_t *fclone_ref = (atomic_t *) (child + 1);
15 @@ -287,6 +288,10 @@
16         if (!skb->cloned ||
17             !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
18                                &skb_shinfo(skb)->dataref)) {
19 +               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
20 +                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
21 +               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
22 +                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
23                 if (skb_shinfo(skb)->nr_frags) {
24                         int i;
25                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
26 @@ -606,6 +611,14 @@
27         n->data_len  = skb->data_len;
28         n->len       = skb->len;
29  
30 +       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
31 +               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
32 +       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
33 +
34 +       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
35 +               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
36 +       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
37 +
38         if (skb_shinfo(skb)->nr_frags) {
39                 int i;
40  
41 @@ -649,6 +662,9 @@
42         u8 *data;
43         int size = nhead + (skb->end - skb->head) + ntail;
44         long off;
45 +       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
46 +       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
47 +
48  
49         if (skb_shared(skb))
50                 BUG();
51 @@ -670,6 +686,11 @@
52         if (skb_shinfo(skb)->frag_list)
53                 skb_clone_fraglist(skb);
54  
55 +       if (zccd != NULL)                       /* user zero copy descriptor? */
56 +               zccd_get (zccd);                /* extra ref (pages are shared) */
57 +       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
58 +               zccd_get (zccd2);               /* extra ref (pages are shared) */
59 +
60         skb_release_data(skb);
61  
62         off = (data + nhead) - skb->head;
63 @@ -684,6 +705,8 @@
64         skb->cloned   = 0;
65         skb->nohdr    = 0;
66         atomic_set(&skb_shinfo(skb)->dataref, 1);
67 +       skb_shinfo(skb)->zccd = zccd;
68 +       skb_shinfo(skb)->zccd2 = zccd2;
69         return 0;
70  
71  nodata:
72 Index: linux-2.6.16.21-0.8/net/ipv4/tcp.c
73 ===================================================================
74 --- linux-2.6.16.21-0.8.orig/net/ipv4/tcp.c     2006-08-03 21:11:25.000000000 -0600
75 +++ linux-2.6.16.21-0.8/net/ipv4/tcp.c  2006-08-03 21:11:30.000000000 -0600
76 @@ -498,8 +498,10 @@
77         }
78  }
79  
80 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
81  static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
82 -                        size_t psize, int flags)
83 +                               size_t psize, int flags, zccd_t *zccd)
84 +
85  {
86         struct tcp_sock *tp = tcp_sk(sk);
87         int mss_now, size_goal;
88 @@ -547,6 +549,17 @@
89                         copy = size;
90  
91                 i = skb_shinfo(skb)->nr_frags;
92 +
93 +               if (zccd != NULL &&             /* this is a zcc I/O */
94 +                               skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
95 +                               skb_shinfo(skb)->zccd2 != NULL &&
96 +                               skb_shinfo(skb)->zccd != zccd && /* not the same one */
97 +                               skb_shinfo(skb)->zccd2 != zccd)
98 +               {
99 +                       tcp_mark_push (tp, skb);
100 +                       goto new_segment;
101 +               }
102 +
103                 can_coalesce = skb_can_coalesce(skb, i, page, offset);
104                 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
105                         tcp_mark_push(tp, skb);
106 @@ -562,6 +575,20 @@
107                         skb_fill_page_desc(skb, i, page, offset, copy);
108                 }
109  
110 +               if (zccd != NULL &&     /* this is a zcc I/O */
111 +                       skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
112 +                       skb_shinfo(skb)->zccd2 != zccd)
113 +               {
114 +                       zccd_get (zccd);        /* bump ref count */
115 +
116 +                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
117 +
118 +                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
119 +                               skb_shinfo(skb)->zccd = zccd;
120 +                       else
121 +                               skb_shinfo(skb)->zccd2 = zccd;
122 +               }
123 +
124                 skb->len += copy;
125                 skb->data_len += copy;
126                 skb->truesize += copy;
127 @@ -631,12 +658,37 @@
128  
129         lock_sock(sk);
130         TCP_CHECK_TIMER(sk);
131 -       res = do_tcp_sendpages(sk, &page, offset, size, flags);
132 +       res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL);
133 +       TCP_CHECK_TIMER(sk);
134 +       release_sock(sk);
135 +       return res;
136 +}
137 +
138 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
139 +                          int flags, zccd_t *zccd)
140 +{
141 +       ssize_t res;
142 +       struct sock *sk = sock->sk;
143 +
144 +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
145 +
146 +       if (!(sk->sk_route_caps & NETIF_F_SG) ||        /* caller shouldn't waste her time */
147 +           !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
148 +               BUG ();
149 +
150 +#undef TCP_ZC_CSUM_FLAGS
151 +
152 +       lock_sock(sk);
153 +       TCP_CHECK_TIMER(sk);
154 +
155 +       res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
156 +
157         TCP_CHECK_TIMER(sk);
158         release_sock(sk);
159         return res;
160  }
161  
162 +
163  #define TCP_PAGE(sk)   (sk->sk_sndmsg_page)
164  #define TCP_OFF(sk)    (sk->sk_sndmsg_off)
165  
166 @@ -1482,6 +1534,202 @@
167         goto out;
168  }
169  
170 +int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
171 +                    int len, int nonblock)
172 +{
173 +       struct tcp_sock *tp = tcp_sk(sk);
174 +       int copied;
175 +       long timeo;
176 +
177 +       BUG_TRAP (len > 0);
178 +       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
179 +
180 +       lock_sock(sk);
181 +
182 +       TCP_CHECK_TIMER(sk);
183 +
184 +       copied = -ENOTCONN;
185 +       if (sk->sk_state == TCP_LISTEN)
186 +               goto out;
187 +
188 +       copied = 0;
189 +       timeo = sock_rcvtimeo(sk, nonblock);
190 +
191 +       do {
192 +               struct sk_buff * skb;
193 +               u32 offset;
194 +               unsigned long used;
195 +               int exhausted;
196 +               int eaten;
197 +
198 +               /* Are we at urgent data? Stop if we have read anything. */
199 +               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
200 +                       break;
201 +
202 +               /* We need to check signals first, to get correct SIGURG
203 +                * handling. FIXME: Need to check this doesn't impact 1003.1g
204 +                * and move it down to the bottom of the loop
205 +                */
206 +               if (signal_pending(current)) {
207 +                       if (copied)
208 +                               break;
209 +                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
210 +                       break;
211 +               }
212 +
213 +               /* Next get a buffer. */
214 +
215 +               skb = skb_peek(&sk->sk_receive_queue);
216 +
217 +               if (skb == NULL)                /* nothing ready */
218 +               {
219 +                       if (copied) {
220 +                               if (sk->sk_err ||
221 +                                   sk->sk_state == TCP_CLOSE ||
222 +                                   (sk->sk_shutdown & RCV_SHUTDOWN) ||
223 +                                   !timeo ||
224 +                                   (0))
225 +                                       break;
226 +                       } else {
227 +                               if (sock_flag(sk, SOCK_DONE))
228 +                                       break;
229 +
230 +                               if (sk->sk_err) {
231 +                                       copied = sock_error(sk);
232 +                                       break;
233 +                               }
234 +
235 +                               if (sk->sk_shutdown & RCV_SHUTDOWN)
236 +                                       break;
237 +
238 +                               if (sk->sk_state == TCP_CLOSE) {
239 +                                       if (!(sock_flag(sk, SOCK_DONE))) {
240 +                                               /* This occurs when user tries to read
241 +                                                * from never connected socket.
242 +                                                */
243 +                                               copied = -ENOTCONN;
244 +                                               break;
245 +                                       }
246 +                                       break;
247 +                               }
248 +
249 +                               if (!timeo) {
250 +                                       copied = -EAGAIN;
251 +                                       break;
252 +                               }
253 +                       }
254 +
255 +                       tcp_cleanup_rbuf(sk, copied);
256 +                       sk_wait_data(sk, &timeo);
257 +                       continue;
258 +               }
259 +
260 +               BUG_TRAP (atomic_read (&skb->users) == 1);
261 +
262 +               exhausted = eaten = 0;
263 +
264 +               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
265 +               if (skb->h.th->syn)
266 +                       offset--;
267 +
268 +               used = skb->len - offset;
269 +
270 +               if (tp->urg_data) {
271 +                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
272 +                       if (urg_offset < used) {
273 +                               if (!urg_offset) { /* at urgent data */
274 +                                       if (!(sock_flag(sk, SOCK_URGINLINE))) {
275 +                                               tp->copied_seq++; /* discard the single byte of urgent data */
276 +                                               offset++;
277 +                                               used--;
278 +                                       }
279 +                               } else          /* truncate read */
280 +                                       used = urg_offset;
281 +                       }
282 +               }
283 +
284 +               BUG_TRAP (used >= 0);
285 +               if (len < used)
286 +                       used = len;
287 +
288 +               if (used == 0)
289 +                       exhausted = 1;
290 +               else
291 +               {
292 +                       if (skb_is_nonlinear (skb))
293 +                       {
294 +                               int   rc = skb_linearize (skb, GFP_KERNEL);
295 +
296 +                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
297 +
298 +                               if (rc)
299 +                               {
300 +                                       if (!copied)
301 +                                               copied = rc;
302 +                                       break;
303 +                               }
304 +                       }
305 +
306 +                       if ((offset + used) == skb->len) /* consuming the whole packet */
307 +                       {
308 +                               __skb_unlink (skb, &sk->sk_receive_queue);
309 +                               dst_release (skb->dst);
310 +                               skb_orphan (skb);
311 +                               __skb_pull (skb, offset);
312 +                               __skb_queue_tail (packets, skb);
313 +                               exhausted = eaten = 1;
314 +                       }
315 +                       else                    /* consuming only part of the packet */
316 +                       {
317 +                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
318 +
319 +                               if (skb2 == NULL)
320 +                               {
321 +                                       if (!copied)
322 +                                               copied = -ENOMEM;
323 +                                       break;
324 +                               }
325 +
326 +                               dst_release (skb2->dst);
327 +                               __skb_pull (skb2, offset);
328 +                               __skb_trim (skb2, used);
329 +                               __skb_queue_tail (packets, skb2);
330 +                       }
331 +
332 +                       tp->copied_seq += used;
333 +                       copied += used;
334 +                       len -= used;
335 +               }
336 +
337 +               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
338 +                       tp->urg_data = 0;
339 +                       tcp_fast_path_check(sk, tp);
340 +               }
341 +
342 +               if (!exhausted)
343 +                       continue;
344 +
345 +               if (skb->h.th->fin)
346 +               {
347 +                       tp->copied_seq++;
348 +                       if (!eaten)
349 +                               sk_eat_skb (sk, skb, 0);
350 +                       break;
351 +               }
352 +
353 +               if (!eaten)
354 +                       sk_eat_skb (sk, skb, 0);
355 +
356 +       } while (len > 0);
357 +
358 + out:
359 +       /* Clean up data we have read: This will do ACK frames. */
360 +       tcp_cleanup_rbuf(sk, copied);
361 +       TCP_CHECK_TIMER(sk);
362 +       release_sock(sk);
363 +       return copied;
364 +}
365 +
366  /*
367   *     State processing on a close. This implements the state shift for
368   *     sending our FIN frame. Note that we only send a FIN for some
369 @@ -2218,6 +2466,8 @@
370  EXPORT_SYMBOL(tcp_recvmsg);
371  EXPORT_SYMBOL(tcp_sendmsg);
372  EXPORT_SYMBOL(tcp_sendpage);
373 +EXPORT_SYMBOL(tcp_sendpage_zccd);
374 +EXPORT_SYMBOL(tcp_recvpackets);
375  EXPORT_SYMBOL(tcp_setsockopt);
376  EXPORT_SYMBOL(tcp_shutdown);
377  EXPORT_SYMBOL(tcp_statistics);
378 Index: linux-2.6.16.21-0.8/include/linux/skbuff.h
379 ===================================================================
380 --- linux-2.6.16.21-0.8.orig/include/linux/skbuff.h     2006-08-03 21:11:25.000000000 -0600
381 +++ linux-2.6.16.21-0.8/include/linux/skbuff.h  2006-08-03 21:11:30.000000000 -0600
382 @@ -129,6 +129,30 @@
383         __u16 size;
384  };
385  
386 +/* Support for callback when skb data has been released */
387 +typedef struct zccd                            /* Zero Copy Callback Descriptor */
388 +{                                              /* (embed as first member of custom struct) */
389 +       atomic_t        zccd_count;             /* reference count */
390 +       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
391 +} zccd_t;
392 +
393 +static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
394 +{
395 +       atomic_set (&d->zccd_count, 1);
396 +       d->zccd_destructor = callback;
397 +}
398 +
399 +static inline void zccd_get (zccd_t *d)                /* take a reference */
400 +{
401 +       atomic_inc (&d->zccd_count);
402 +}
403 +
404 +static inline void zccd_put (zccd_t *d)                /* release a reference */
405 +{
406 +       if (atomic_dec_and_test (&d->zccd_count))
407 +               (d->zccd_destructor)(d);
408 +}
409 +
410  /* This data is invariant across clones and lives at
411   * the end of the header data, ie. at skb->end.
412   */
413 @@ -140,6 +164,13 @@
414         unsigned short  ufo_size;
415         unsigned int    ip6_frag_id;
416         struct sk_buff  *frag_list;
417 +       zccd_t          *zccd;                  /* zero copy descriptor */
418 +       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
419 +       /* NB we expect zero-copy data to be at least 1 packet, so
420 +       * having 2 zccds means we don't unnecessarily split the packet
421 +       * where consecutive zero-copy sends abut.
422 +       */
423 +
424         skb_frag_t      frags[MAX_SKB_FRAGS];
425  };
426  
427 Index: linux-2.6.16.21-0.8/include/net/tcp.h
428 ===================================================================
429 --- linux-2.6.16.21-0.8.orig/include/net/tcp.h  2006-08-03 21:11:25.000000000 -0600
430 +++ linux-2.6.16.21-0.8/include/net/tcp.h       2006-08-03 21:11:30.000000000 -0600
431 @@ -272,6 +272,9 @@
432  extern int                     tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
433                                             struct msghdr *msg, size_t size);
434  extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
435 +extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
436 +                                               int flags, zccd_t *zccd);
437 +
438  
439  extern int                     tcp_ioctl(struct sock *sk, 
440                                           int cmd, 
441 @@ -356,6 +359,9 @@
442                                             struct msghdr *msg,
443                                             size_t len, int nonblock, 
444                                             int flags, int *addr_len);
445 +extern int                     tcp_recvpackets(struct sock *sk,
446 +                                               struct sk_buff_head *packets,
447 +                                               int len, int nonblock);
448  
449  extern void                    tcp_parse_options(struct sk_buff *skb,
450                                                   struct tcp_options_received *opt_rx,