Whamcloud - gitweb
b=3119
[fs/lustre-release.git] / lustre / kernel_patches / patches / tcp-zero-copy-2.4.21-suse2.patch
1 Index: linux-2.4.21-suse2/include/linux/skbuff.h
2 ===================================================================
3 --- linux-2.4.21-suse2.orig/include/linux/skbuff.h      2004-01-10 12:48:50.000000000 +0300
4 +++ linux-2.4.21-suse2/include/linux/skbuff.h   2004-01-10 16:31:03.000000000 +0300
5 @@ -116,6 +116,30 @@
6         __u16 size;
7  };
8  
9 +/* Support for callback when skb data has been released */
10 +typedef struct zccd                            /* Zero Copy Callback Descriptor */
11 +{                                              /* (embed as first member of custom struct) */
12 +       atomic_t        zccd_count;             /* reference count */
13 +       void           (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
14 +} zccd_t;
15 +
16 +static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
17 +{
18 +       atomic_set (&d->zccd_count, 1);
19 +       d->zccd_destructor = callback;
20 +}
21 +
22 +static inline void zccd_get (zccd_t *d)                /* take a reference */
23 +{
24 +       atomic_inc (&d->zccd_count);
25 +}
26 +
27 +static inline void zccd_put (zccd_t *d)                /* release a reference */
28 +{
29 +       if (atomic_dec_and_test (&d->zccd_count))
30 +               (d->zccd_destructor)(d);
31 +}
32 +
33  /* This data is invariant across clones and lives at
34   * the end of the header data, ie. at skb->end.
35   */
36 @@ -123,6 +147,12 @@
37         atomic_t        dataref;
38         unsigned int    nr_frags;
39         struct sk_buff  *frag_list;
40 +       zccd_t          *zccd;                  /* zero copy descriptor */
41 +       zccd_t          *zccd2;                 /* 2nd zero copy descriptor */
42 +       /* NB we expect zero-copy data to be at least 1 packet, so
43 +        * having 2 zccds means we don't unnecessarily split the packet
44 +        * where consecutive zero-copy sends abut.
45 +        */
46         skb_frag_t      frags[MAX_SKB_FRAGS];
47  };
48  
49 Index: linux-2.4.21-suse2/include/net/tcp.h
50 ===================================================================
51 --- linux-2.4.21-suse2.orig/include/net/tcp.h   2003-11-11 03:47:27.000000000 +0300
52 +++ linux-2.4.21-suse2/include/net/tcp.h        2004-01-10 16:33:16.000000000 +0300
53 @@ -644,6 +644,8 @@
54  
55  extern int                     tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
56  extern ssize_t                 tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
57 +extern ssize_t                 tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
58 +                                                 int flags, zccd_t *zccd);
59  
60  extern int                     tcp_ioctl(struct sock *sk, 
61                                           int cmd, 
62 @@ -740,6 +742,9 @@
63                                             struct msghdr *msg,
64                                             int len, int nonblock, 
65                                             int flags, int *addr_len);
66 +extern int                     tcp_recvpackets(struct sock *sk,
67 +                                               struct sk_buff_head *packets,
68 +                                               int len, int nonblock);
69  extern int tcp_kvec_read(struct sock *sk, kvec_cb_t cb, int len);
70  extern int tcp_kvec_write(struct sock *sk, kvec_cb_t cb, int len);
71  
72 Index: linux-2.4.21-suse2/net/netsyms.c
73 ===================================================================
74 --- linux-2.4.21-suse2.orig/net/netsyms.c       2003-10-28 21:34:24.000000000 +0300
75 +++ linux-2.4.21-suse2/net/netsyms.c    2004-01-10 16:30:46.000000000 +0300
76 @@ -403,6 +403,8 @@
77  
78  #endif
79  
80 +EXPORT_SYMBOL(tcp_sendpage_zccd);
81 +EXPORT_SYMBOL(tcp_recvpackets);
82  EXPORT_SYMBOL(tcp_read_sock);
83  
84  EXPORT_SYMBOL(netlink_set_err);
85 Index: linux-2.4.21-suse2/net/core/skbuff.c
86 ===================================================================
87 --- linux-2.4.21-suse2.orig/net/core/skbuff.c   2003-11-07 02:54:26.000000000 +0300
88 +++ linux-2.4.21-suse2/net/core/skbuff.c        2004-01-10 16:30:46.000000000 +0300
89 @@ -208,6 +208,8 @@
90         atomic_set(&(skb_shinfo(skb)->dataref), 1);
91         skb_shinfo(skb)->nr_frags = 0;
92         skb_shinfo(skb)->frag_list = NULL;
93 +       skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
94 +       skb_shinfo(skb)->zccd2 = NULL;
95         return skb;
96  
97  nodata:
98 @@ -277,6 +279,10 @@
99  {
100         if (!skb->cloned ||
101             atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
102 +               if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
103 +                       zccd_put (skb_shinfo(skb)->zccd); /* release hold */
104 +               if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
105 +                       zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
106                 if (skb_shinfo(skb)->nr_frags) {
107                         int i;
108                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
109 @@ -535,6 +541,8 @@
110         atomic_set(&(skb_shinfo(skb)->dataref), 1);
111         skb_shinfo(skb)->nr_frags = 0;
112         skb_shinfo(skb)->frag_list = NULL;
113 +       skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
114 +       skb_shinfo(skb)->zccd2 = NULL;
115  
116         /* We are no longer a clone, even if we were. */
117         skb->cloned = 0;
118 @@ -581,6 +589,14 @@
119         n->data_len = skb->data_len;
120         n->len = skb->len;
121  
122 +       if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
123 +               zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
124 +       skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
125 +
126 +       if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
127 +               zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
128 +       skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
129 +
130         if (skb_shinfo(skb)->nr_frags) {
131                 int i;
132  
133 @@ -623,6 +639,8 @@
134         u8 *data;
135         int size = nhead + (skb->end - skb->head) + ntail;
136         long off;
137 +       zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
138 +       zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
139  
140         if (skb_shared(skb))
141                 BUG();
142 @@ -644,6 +662,11 @@
143         if (skb_shinfo(skb)->frag_list)
144                 skb_clone_fraglist(skb);
145  
146 +       if (zccd != NULL)                       /* user zero copy descriptor? */
147 +               zccd_get (zccd);                /* extra ref (pages are shared) */
148 +       if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
149 +               zccd_get (zccd2);               /* extra ref (pages are shared) */
150 +
151         skb_release_data(skb);
152  
153         off = (data+nhead) - skb->head;
154 @@ -658,6 +681,8 @@
155         skb->nh.raw += off;
156         skb->cloned = 0;
157         atomic_set(&skb_shinfo(skb)->dataref, 1);
158 +       skb_shinfo(skb)->zccd = zccd;
159 +       skb_shinfo(skb)->zccd2 = zccd2;
160         return 0;
161  
162  nodata:
163 Index: linux-2.4.21-suse2/net/ipv4/tcp.c
164 ===================================================================
165 --- linux-2.4.21-suse2.orig/net/ipv4/tcp.c      2003-10-28 21:34:09.000000000 +0300
166 +++ linux-2.4.21-suse2/net/ipv4/tcp.c   2004-01-10 16:35:26.000000000 +0300
167 @@ -1013,7 +1013,7 @@
168         goto out;
169  }
170  
171 -ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags);
172 +ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags, zccd_t *zccd);
173  
174  static inline int
175  can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
176 @@ -1092,7 +1092,8 @@
177         return err;
178  }
179  
180 -ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags)
181 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
182 +ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags, zccd_t *zccd)
183  {
184         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
185         int mss_now;
186 @@ -1145,6 +1146,17 @@
187                         copy = size;
188  
189                 i = skb_shinfo(skb)->nr_frags;
190 +
191 +               if (zccd != NULL &&             /* this is a zcc I/O */
192 +                   skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
193 +                   skb_shinfo(skb)->zccd2 != NULL &&
194 +                   skb_shinfo(skb)->zccd != zccd && /* not the same one */
195 +                   skb_shinfo(skb)->zccd2 != zccd)
196 +               {
197 +                       tcp_mark_push (tp, skb);
198 +                       goto new_segment;
199 +               }
200 +
201                 if (can_coalesce(skb, i, page, offset)) {
202                         skb_shinfo(skb)->frags[i-1].size += copy;
203                 } else if (i < MAX_SKB_FRAGS) {
204 @@ -1155,6 +1167,20 @@
205                         goto new_segment;
206                 }
207  
208 +               if (zccd != NULL &&     /* this is a zcc I/O */
209 +                   skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
210 +                   skb_shinfo(skb)->zccd2 != zccd)
211 +               {
212 +                       zccd_get (zccd);        /* bump ref count */
213 +
214 +                       BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
215 +
216 +                       if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
217 +                               skb_shinfo(skb)->zccd = zccd;
218 +                       else
219 +                               skb_shinfo(skb)->zccd2 = zccd;
220 +               }
221 +
222                 skb->len += copy;
223                 skb->data_len += copy;
224                 skb->ip_summed = CHECKSUM_HW;
225 @@ -1221,7 +1247,30 @@
226  
227         lock_sock(sk);
228         TCP_CHECK_TIMER(sk);
229 -       res = do_tcp_sendpages(sk, &let, 0, size, flags);
230 +       res = do_tcp_sendpages(sk, &let, 0, size, flags, NULL);
231 +       TCP_CHECK_TIMER(sk);
232 +       release_sock(sk);
233 +       return res;
234 +}
235 +
236 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
237 +                         int flags, zccd_t *zccd)
238 +{
239 +       struct kveclet let = { page, offset, size };
240 +       ssize_t res;
241 +       struct sock *sk = sock->sk;
242 +
243 +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
244 +
245 +       if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
246 +           !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
247 +               BUG ();
248 +
249 +       lock_sock(sk);
250 +       TCP_CHECK_TIMER(sk);
251 +
252 +       res = do_tcp_sendpages(sk, &let, 0, size, flags, zccd);
253 +
254         TCP_CHECK_TIMER(sk);
255         release_sock(sk);
256         return res;
257 @@ -1235,7 +1284,7 @@
258             !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
259                 BUG();
260  
261 -       res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT);
262 +       res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT, NULL);
263         if (res > 0)
264                 info->done += res;
265  
266 @@ -2084,6 +2133,202 @@
267         goto out;
268  }
269  
270 +int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
271 +                    int len, int nonblock)
272 +{
273 +       struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
274 +       int copied;
275 +       long timeo;
276 +
277 +       BUG_TRAP (len > 0);
278 +       /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
279 +
280 +       lock_sock(sk);
281 +
282 +       TCP_CHECK_TIMER(sk);
283 +
284 +       copied = -ENOTCONN;
285 +       if (sk->state == TCP_LISTEN)
286 +               goto out;
287 +
288 +       copied = 0;
289 +       timeo = sock_rcvtimeo(sk, nonblock);
290 +
291 +       do {
292 +               struct sk_buff * skb;
293 +               u32 offset;
294 +               unsigned long used;
295 +               int exhausted;
296 +               int eaten;
297 +
298 +               /* Are we at urgent data? Stop if we have read anything. */
299 +               if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
300 +                       break;
301 +
302 +               /* We need to check signals first, to get correct SIGURG
303 +                * handling. FIXME: Need to check this doesn't impact 1003.1g
304 +                * and move it down to the bottom of the loop
305 +                */
306 +               if (signal_pending(current)) {
307 +                       if (copied)
308 +                               break;
309 +                       copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
310 +                       break;
311 +               }
312 +
313 +               /* Next get a buffer. */
314 +
315 +               skb = skb_peek(&sk->receive_queue);
316 +
317 +               if (skb == NULL)                /* nothing ready */
318 +               {
319 +                       if (copied) {
320 +                               if (sk->err ||
321 +                                   sk->state == TCP_CLOSE ||
322 +                                   (sk->shutdown & RCV_SHUTDOWN) ||
323 +                                   !timeo ||
324 +                                   (0))
325 +                                       break;
326 +                       } else {
327 +                               if (sk->done)
328 +                                       break;
329 +
330 +                               if (sk->err) {
331 +                                       copied = sock_error(sk);
332 +                                       break;
333 +                               }
334 +
335 +                               if (sk->shutdown & RCV_SHUTDOWN)
336 +                                       break;
337 +
338 +                               if (sk->state == TCP_CLOSE) {
339 +                                       if (!sk->done) {
340 +                                               /* This occurs when user tries to read
341 +                                                * from never connected socket.
342 +                                                */
343 +                                               copied = -ENOTCONN;
344 +                                               break;
345 +                                       }
346 +                                       break;
347 +                               }
348 +
349 +                               if (!timeo) {
350 +                                       copied = -EAGAIN;
351 +                                       break;
352 +                               }
353 +                       }
354 +
355 +                       cleanup_rbuf(sk, copied);
356 +                       timeo = tcp_data_wait(sk, timeo);
357 +                       continue;
358 +               }
359 +
360 +               BUG_TRAP (atomic_read (&skb->users) == 1);
361 +
362 +               exhausted = eaten = 0;
363 +
364 +               offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
365 +               if (skb->h.th->syn)
366 +                       offset--;
367 +
368 +               used = skb->len - offset;
369 +
370 +               if (tp->urg_data) {
371 +                       u32 urg_offset = tp->urg_seq - tp->copied_seq;
372 +                       if (urg_offset < used) {
373 +                               if (!urg_offset) { /* at urgent data */
374 +                                       if (!sk->urginline) {
375 +                                               tp->copied_seq++; /* discard the single byte of urgent data */
376 +                                               offset++;
377 +                                               used--;
378 +                                       }
379 +                               } else          /* truncate read */
380 +                                       used = urg_offset;
381 +                       }
382 +               }
383 +
384 +               BUG_TRAP (used >= 0);
385 +               if (len < used)
386 +                       used = len;
387 +
388 +               if (used == 0)
389 +                       exhausted = 1;
390 +               else
391 +               {
392 +                       if (skb_is_nonlinear (skb))
393 +                       {
394 +                               int   rc = skb_linearize (skb, GFP_KERNEL);
395 +
396 +                               printk ("tcp_recvpackets(): linearising: %d\n", rc);
397 +
398 +                               if (rc)
399 +                               {
400 +                                       if (!copied)
401 +                                               copied = rc;
402 +                                       break;
403 +                               }
404 +                       }
405 +
406 +                       if ((offset + used) == skb->len) /* consuming the whole packet */
407 +                       {
408 +                               __skb_unlink (skb, &sk->receive_queue);
409 +                               dst_release (skb->dst);
410 +                               skb_orphan (skb);
411 +                               __skb_pull (skb, offset);
412 +                               __skb_queue_tail (packets, skb);
413 +                               exhausted = eaten = 1;
414 +                       }
415 +                       else                    /* consuming only part of the packet */
416 +                       {
417 +                               struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
418 +
419 +                               if (skb2 == NULL)
420 +                               {
421 +                                       if (!copied)
422 +                                               copied = -ENOMEM;
423 +                                       break;
424 +                               }
425 +
426 +                               dst_release (skb2->dst);
427 +                               __skb_pull (skb2, offset);
428 +                               __skb_trim (skb2, used);
429 +                               __skb_queue_tail (packets, skb2);
430 +                       }
431 +
432 +                       tp->copied_seq += used;
433 +                       copied += used;
434 +                       len -= used;
435 +               }
436 +
437 +               if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
438 +                       tp->urg_data = 0;
439 +                       tcp_fast_path_check(sk, tp);
440 +               }
441 +
442 +               if (!exhausted)
443 +                       continue;
444 +
445 +               if (skb->h.th->fin)
446 +               {
447 +                       tp->copied_seq++;
448 +                       if (!eaten)
449 +                               tcp_eat_skb (sk, skb);
450 +                       break;
451 +               }
452 +
453 +               if (!eaten)
454 +                       tcp_eat_skb (sk, skb);
455 +
456 +       } while (len > 0);
457 +
458 + out:
459 +       /* Clean up data we have read: This will do ACK frames. */
460 +       cleanup_rbuf(sk, copied);
461 +       TCP_CHECK_TIMER(sk);
462 +       release_sock(sk);
463 +       return copied;
464 +}
465 +
466  /*
467   *     State processing on a close. This implements the state shift for
468   *     sending our FIN frame. Note that we only send a FIN for some