1 include/linux/skbuff.h | 30 +++++
3 net/core/skbuff.c | 25 ++++
4 net/ipv4/tcp.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++-
6 5 files changed, 311 insertions(+), 3 deletions(-)
8 --- linux-2.4.22-ac1/include/linux/skbuff.h~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
9 +++ linux-2.4.22-ac1-alexey/include/linux/skbuff.h 2003-09-26 00:38:48.000000000 +0400
10 @@ -116,6 +116,30 @@ struct skb_frag_struct
14 +/* Support for callback when skb data has been released */
15 +typedef struct zccd /* Zero Copy Callback Descriptor */
16 +{ /* (embed as first member of custom struct) */
17 + atomic_t zccd_count; /* reference count */
18 + void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
21 +static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
23 + atomic_set (&d->zccd_count, 1);
24 + d->zccd_destructor = callback;
27 +static inline void zccd_get (zccd_t *d) /* take a reference */
29 + atomic_inc (&d->zccd_count);
32 +static inline void zccd_put (zccd_t *d) /* release a reference */
34 + if (atomic_dec_and_test (&d->zccd_count))
35 + (d->zccd_destructor)(d);
38 /* This data is invariant across clones and lives at
39 * the end of the header data, ie. at skb->end.
41 @@ -123,6 +147,12 @@ struct skb_shared_info {
43 unsigned int nr_frags;
44 struct sk_buff *frag_list;
45 + zccd_t *zccd; /* zero copy descriptor */
46 + zccd_t *zccd2; /* 2nd zero copy descriptor */
47 + /* NB we expect zero-copy data to be at least 1 packet, so
48 + * having 2 zccds means we don't unnecessarily split the packet
49 + * where consecutive zero-copy sends abut.
51 skb_frag_t frags[MAX_SKB_FRAGS];
54 --- linux-2.4.22-ac1/include/net/tcp.h~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
55 +++ linux-2.4.22-ac1-alexey/include/net/tcp.h 2003-09-26 00:38:48.000000000 +0400
56 @@ -643,6 +643,8 @@ extern int tcp_v4_tw_remember_stam
58 extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
59 extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
60 +extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
61 + int flags, zccd_t *zccd);
63 extern int tcp_ioctl(struct sock *sk,
65 @@ -737,6 +739,9 @@ extern int tcp_recvmsg(struct sock *sk
67 int len, int nonblock,
68 int flags, int *addr_len);
69 +extern int tcp_recvpackets(struct sock *sk,
70 + struct sk_buff_head *packets,
71 + int len, int nonblock);
73 extern int tcp_listen_start(struct sock *sk);
75 --- linux-2.4.22-ac1/net/core/skbuff.c~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
76 +++ linux-2.4.22-ac1-alexey/net/core/skbuff.c 2003-09-26 00:38:48.000000000 +0400
77 @@ -208,6 +208,8 @@ struct sk_buff *alloc_skb(unsigned int s
78 atomic_set(&(skb_shinfo(skb)->dataref), 1);
79 skb_shinfo(skb)->nr_frags = 0;
80 skb_shinfo(skb)->frag_list = NULL;
81 + skb_shinfo(skb)->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */
82 + skb_shinfo(skb)->zccd2 = NULL;
86 @@ -277,6 +279,10 @@ static void skb_release_data(struct sk_b
89 atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
90 + if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */
91 + zccd_put (skb_shinfo(skb)->zccd); /* release hold */
92 + if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */
93 + zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
94 if (skb_shinfo(skb)->nr_frags) {
96 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
97 @@ -535,6 +541,8 @@ int skb_linearize(struct sk_buff *skb, i
98 atomic_set(&(skb_shinfo(skb)->dataref), 1);
99 skb_shinfo(skb)->nr_frags = 0;
100 skb_shinfo(skb)->frag_list = NULL;
101 + skb_shinfo(skb)->zccd = NULL; /* copied data => no user zero copy descriptor */
102 + skb_shinfo(skb)->zccd2 = NULL;
104 /* We are no longer a clone, even if we were. */
106 @@ -581,6 +589,14 @@ struct sk_buff *pskb_copy(struct sk_buff
107 n->data_len = skb->data_len;
110 + if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */
111 + zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
112 + skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
114 + if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */
115 + zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
116 + skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
118 if (skb_shinfo(skb)->nr_frags) {
121 @@ -623,6 +639,8 @@ int pskb_expand_head(struct sk_buff *skb
123 int size = nhead + (skb->end - skb->head) + ntail;
125 + zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */
126 + zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
130 @@ -644,6 +662,11 @@ int pskb_expand_head(struct sk_buff *skb
131 if (skb_shinfo(skb)->frag_list)
132 skb_clone_fraglist(skb);
134 + if (zccd != NULL) /* user zero copy descriptor? */
135 + zccd_get (zccd); /* extra ref (pages are shared) */
136 + if (zccd2 != NULL) /* 2nd user zero copy descriptor? */
137 + zccd_get (zccd2); /* extra ref (pages are shared) */
139 skb_release_data(skb);
141 off = (data+nhead) - skb->head;
142 @@ -658,6 +681,8 @@ int pskb_expand_head(struct sk_buff *skb
145 atomic_set(&skb_shinfo(skb)->dataref, 1);
146 + skb_shinfo(skb)->zccd = zccd;
147 + skb_shinfo(skb)->zccd2 = zccd2;
151 --- linux-2.4.22-ac1/net/ipv4/tcp.c~tcp-zero-copy-2.4.22-rh 2003-08-25 15:44:44.000000000 +0400
152 +++ linux-2.4.22-ac1-alexey/net/ipv4/tcp.c 2003-09-26 00:38:48.000000000 +0400
153 @@ -747,7 +747,7 @@ do_interrupted:
157 -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
158 +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
161 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
162 @@ -826,7 +826,8 @@ static int tcp_error(struct sock *sk, in
166 -ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
167 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
168 +ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
170 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
172 @@ -874,6 +875,17 @@ new_segment:
175 i = skb_shinfo(skb)->nr_frags;
177 + if (zccd != NULL && /* this is a zcc I/O */
178 + skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */
179 + skb_shinfo(skb)->zccd2 != NULL &&
180 + skb_shinfo(skb)->zccd != zccd && /* not the same one */
181 + skb_shinfo(skb)->zccd2 != zccd)
183 + tcp_mark_push (tp, skb);
187 if (can_coalesce(skb, i, page, offset)) {
188 skb_shinfo(skb)->frags[i-1].size += copy;
189 } else if (i < MAX_SKB_FRAGS) {
190 @@ -884,6 +896,20 @@ new_segment:
194 + if (zccd != NULL && /* this is a zcc I/O */
195 + skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */
196 + skb_shinfo(skb)->zccd2 != zccd)
198 + zccd_get (zccd); /* bump ref count */
200 + BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
202 + if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
203 + skb_shinfo(skb)->zccd = zccd;
205 + skb_shinfo(skb)->zccd2 = zccd;
209 skb->data_len += copy;
210 skb->ip_summed = CHECKSUM_HW;
211 @@ -947,7 +973,31 @@ ssize_t tcp_sendpage(struct socket *sock
215 - res = do_tcp_sendpages(sk, &page, offset, size, flags);
216 + res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
217 + TCP_CHECK_TIMER(sk);
222 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
223 + int flags, zccd_t *zccd)
226 + struct sock *sk = sock->sk;
228 +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
230 + if (!(sk->route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */
231 + !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
234 +#undef TCP_ZC_CSUM_FLAGS
237 + TCP_CHECK_TIMER(sk);
239 + res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
244 @@ -1771,6 +1821,202 @@ recv_urg:
248 +int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
249 + int len, int nonblock)
251 + struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
255 + BUG_TRAP (len > 0);
256 + /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
260 + TCP_CHECK_TIMER(sk);
262 + copied = -ENOTCONN;
263 + if (sk->state == TCP_LISTEN)
267 + timeo = sock_rcvtimeo(sk, nonblock);
270 + struct sk_buff * skb;
272 + unsigned long used;
276 + /* Are we at urgent data? Stop if we have read anything. */
277 + if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
280 + /* We need to check signals first, to get correct SIGURG
281 + * handling. FIXME: Need to check this doesn't impact 1003.1g
282 + * and move it down to the bottom of the loop
284 + if (signal_pending(current)) {
287 + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
291 + /* Next get a buffer. */
293 + skb = skb_peek(&sk->receive_queue);
295 + if (skb == NULL) /* nothing ready */
299 + sk->state == TCP_CLOSE ||
300 + (sk->shutdown & RCV_SHUTDOWN) ||
309 + copied = sock_error(sk);
313 + if (sk->shutdown & RCV_SHUTDOWN)
316 + if (sk->state == TCP_CLOSE) {
318 + /* This occurs when user tries to read
319 + * from never connected socket.
321 + copied = -ENOTCONN;
333 + cleanup_rbuf(sk, copied);
334 + timeo = tcp_data_wait(sk, timeo);
338 + BUG_TRAP (atomic_read (&skb->users) == 1);
340 + exhausted = eaten = 0;
342 + offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
343 + if (skb->h.th->syn)
346 + used = skb->len - offset;
348 + if (tp->urg_data) {
349 + u32 urg_offset = tp->urg_seq - tp->copied_seq;
350 + if (urg_offset < used) {
351 + if (!urg_offset) { /* at urgent data */
352 + if (!sk->urginline) {
353 + tp->copied_seq++; /* discard the single byte of urgent data */
357 + } else /* truncate read */
362 + BUG_TRAP (used >= 0);
370 + if (skb_is_nonlinear (skb))
372 + int rc = skb_linearize (skb, GFP_KERNEL);
374 + printk ("tcp_recvpackets(): linearising: %d\n", rc);
384 + if ((offset + used) == skb->len) /* consuming the whole packet */
386 + __skb_unlink (skb, &sk->receive_queue);
387 + dst_release (skb->dst);
389 + __skb_pull (skb, offset);
390 + __skb_queue_tail (packets, skb);
391 + exhausted = eaten = 1;
393 + else /* consuming only part of the packet */
395 + struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
404 + dst_release (skb2->dst);
405 + __skb_pull (skb2, offset);
406 + __skb_trim (skb2, used);
407 + __skb_queue_tail (packets, skb2);
410 + tp->copied_seq += used;
415 + if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
417 + tcp_fast_path_check(sk, tp);
423 + if (skb->h.th->fin)
427 + tcp_eat_skb (sk, skb);
432 + tcp_eat_skb (sk, skb);
437 + /* Clean up data we have read: This will do ACK frames. */
438 + cleanup_rbuf(sk, copied);
439 + TCP_CHECK_TIMER(sk);
445 * State processing on a close. This implements the state shift for
446 * sending our FIN frame. Note that we only send a FIN for some
447 --- linux-2.4.22-ac1/net/netsyms.c~tcp-zero-copy-2.4.22-rh 2003-09-25 14:16:26.000000000 +0400
448 +++ linux-2.4.22-ac1-alexey/net/netsyms.c 2003-09-26 00:39:16.000000000 +0400
449 @@ -396,6 +396,8 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
450 EXPORT_SYMBOL(sysctl_tcp_ecn);
451 EXPORT_SYMBOL(tcp_cwnd_application_limited);
452 EXPORT_SYMBOL(tcp_sendpage);
453 +EXPORT_SYMBOL(tcp_sendpage_zccd);
454 +EXPORT_SYMBOL(tcp_recvpackets);
455 EXPORT_SYMBOL(sysctl_tcp_low_latency);
457 EXPORT_SYMBOL(tcp_write_xmit);