 include/linux/skbuff.h |   30 +++++
 include/net/tcp.h      |    5 +
 net/core/skbuff.c      |   25 ++++
 net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++++-
 net/netsyms.c          |    2 +
 5 files changed, 311 insertions(+), 3 deletions(-)

--- kernel-2.4.20-6chaos_18_7/include/linux/skbuff.h~tcp_zero_copy_2.4.20_chaos	2003-06-24 11:31:17.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/include/linux/skbuff.h	2003-07-12 15:38:07.000000000 -0600
@@ -116,6 +116,30 @@ struct skb_frag_struct
+/* Support for callback when skb data has been released */
+typedef struct zccd			/* Zero Copy Callback Descriptor */
+{					/* (embed as first member of custom struct) */
+	atomic_t	zccd_count;	/* reference count */
+	void	       (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+	atomic_set (&d->zccd_count, 1);
+	d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)		/* take a reference */
+{
+	atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)		/* release a reference */
+{
+	if (atomic_dec_and_test (&d->zccd_count))
+		(d->zccd_destructor)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +147,12 @@ struct skb_shared_info {
 	unsigned int	nr_frags;
 	struct sk_buff	*frag_list;
+	zccd_t		*zccd;		/* zero copy descriptor */
+	zccd_t		*zccd2;		/* 2nd zero copy descriptor */
+	/* NB we expect zero-copy data to be at least 1 packet, so
+	 * having 2 zccds means we don't unnecessarily split the packet
+	 * where consecutive zero-copy sends abut
+	 */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
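
Not part of the patch: a minimal caller-side sketch of the intended usage. The descriptor is embedded as the first member of a private struct, as the comment above requires, so the destructor can recover its container and notify a sleeping sender. All my_* names are hypothetical.

#include <linux/skbuff.h>	/* zccd_t, once this patch is applied */
#include <linux/sched.h>	/* wait queues, wake_up() */

struct my_zc_completion {
	zccd_t			zccd;	/* must be the first member */
	wait_queue_head_t	waitq;	/* sender sleeps here */
	int			done;	/* set once the net stack lets go */
};

/* Runs when the last skb referencing the send's pages is freed,
 * possibly in softirq context, so do nothing beyond flag + wake_up. */
static void my_zc_callback (zccd_t *d)
{
	struct my_zc_completion *c = (struct my_zc_completion *)d;

	c->done = 1;
	wake_up (&c->waitq);
}

zccd_init() starts the count at 1; that initial reference belongs to the submitter and is dropped with zccd_put() once the send has been queued (see the send sketch further down).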
--- kernel-2.4.20-6chaos_18_7/include/net/tcp.h~tcp_zero_copy_2.4.20_chaos	2003-06-24 11:31:17.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/include/net/tcp.h	2003-07-12 15:38:07.000000000 -0600
@@ -643,6 +643,8 @@ extern int tcp_v4_tw_remember_stam
 extern int		tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t		tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t		tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+					  int flags, zccd_t *zccd);
 
 extern int		tcp_ioctl(struct sock *sk,
@@ -737,6 +739,9 @@ extern int		tcp_recvmsg(struct sock *sk
					    struct msghdr *msg,
					    int len, int nonblock,
					    int flags, int *addr_len);
+extern int		tcp_recvpackets(struct sock *sk,
+					struct sk_buff_head *packets,
+					int len, int nonblock);
 
 extern int		tcp_listen_start(struct sock *sk);
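
Again not part of the patch: how the my_zc_completion helpers sketched above might drive a zero-copy send through tcp_sendpage_zccd(). The page may only be reused after the callback fires, since retransmissions reference it until the data is ACKed.

static int my_zc_send_page (struct socket *sock, struct page *page, int len)
{
	struct my_zc_completion c;
	ssize_t rc;

	init_waitqueue_head (&c.waitq);
	c.done = 0;
	zccd_init (&c.zccd, my_zc_callback);	/* refcount = 1 (ours) */

	rc = tcp_sendpage_zccd (sock, page, 0, len, 0, &c.zccd);

	zccd_put (&c.zccd);		/* drop the submitter's reference */
	wait_event (c.waitq, c.done);	/* final zccd_put() fires the callback */

	return rc < 0 ? (int)rc : 0;
}

If the send fails before any skb takes a reference, the zccd_put() above drops the count to zero, the callback runs immediately, and wait_event() returns without sleeping.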
--- kernel-2.4.20-6chaos_18_7/net/netsyms.c~tcp_zero_copy_2.4.20_chaos	2003-05-15 21:15:18.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/net/netsyms.c	2003-07-12 15:38:54.000000000 -0600
@@ -397,6 +397,8 @@ EXPORT_SYMBOL(sysctl_tcp_wmem);
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(tcp_cwnd_application_limited);
 EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 
 EXPORT_SYMBOL(tcp_write_xmit);
--- kernel-2.4.20-6chaos_18_7/net/core/skbuff.c~tcp_zero_copy_2.4.20_chaos	2003-05-15 21:15:21.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/net/core/skbuff.c	2003-07-12 15:38:07.000000000 -0600
@@ -208,6 +208,8 @@ struct sk_buff *alloc_skb(unsigned int s
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd = NULL;		/* skbuffs kick off with NO user zero copy descriptors */
+	skb_shinfo(skb)->zccd2 = NULL;
@@ -276,6 +278,10 @@ static void skb_release_data(struct sk_b
 	if (!skb->cloned ||
 	    atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+		if (skb_shinfo(skb)->zccd != NULL)	/* zero copy callback descriptor? */
+			zccd_put (skb_shinfo(skb)->zccd);	/* release hold */
+		if (skb_shinfo(skb)->zccd2 != NULL)	/* 2nd zero copy callback descriptor? */
+			zccd_put (skb_shinfo(skb)->zccd2);	/* release hold */
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
 			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -532,6 +538,8 @@ int skb_linearize(struct sk_buff *skb, i
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd = NULL;		/* copied data => no user zero copy descriptor */
+	skb_shinfo(skb)->zccd2 = NULL;
 
 	/* We are no longer a clone, even if we were. */
@@ -578,6 +586,14 @@ struct sk_buff *pskb_copy(struct sk_buff
 	n->data_len = skb->data_len;
 	n->len = skb->len;
 
+	if (skb_shinfo(skb)->zccd != NULL)	/* user zero copy descriptor? */
+		zccd_get (skb_shinfo(skb)->zccd);	/* 1 more ref (pages are shared) */
+	skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+	if (skb_shinfo(skb)->zccd2 != NULL)	/* 2nd user zero copy descriptor? */
+		zccd_get (skb_shinfo(skb)->zccd2);	/* 1 more ref (pages are shared) */
+	skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
 	if (skb_shinfo(skb)->nr_frags) {
@@ -620,6 +636,8 @@ int pskb_expand_head(struct sk_buff *skb
 	u8 *data;
 	int size = nhead + (skb->end - skb->head) + ntail;
 	long off;
+	zccd_t *zccd = skb_shinfo(skb)->zccd;	/* stash user zero copy descriptor */
+	zccd_t *zccd2 = skb_shinfo(skb)->zccd2;	/* stash 2nd user zero copy descriptor */
@@ -641,6 +659,11 @@ int pskb_expand_head(struct sk_buff *skb
 	if (skb_shinfo(skb)->frag_list)
 		skb_clone_fraglist(skb);
 
+	if (zccd != NULL)			/* user zero copy descriptor? */
+		zccd_get (zccd);		/* extra ref (pages are shared) */
+	if (zccd2 != NULL)			/* 2nd user zero copy descriptor? */
+		zccd_get (zccd2);		/* extra ref (pages are shared) */
+
 	skb_release_data(skb);
 
 	off = (data+nhead) - skb->head;
@@ -655,6 +678,8 @@ int pskb_expand_head(struct sk_buff *skb
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
+	skb_shinfo(skb)->zccd = zccd;
+	skb_shinfo(skb)->zccd2 = zccd2;
--- kernel-2.4.20-6chaos_18_7/net/ipv4/tcp.c~tcp_zero_copy_2.4.20_chaos	2003-05-15 21:15:21.000000000 -0600
+++ kernel-2.4.20-6chaos_18_7-braam/net/ipv4/tcp.c	2003-07-12 15:38:07.000000000 -0600
@@ -747,7 +747,7 @@ do_interrupted:
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -826,7 +826,8 @@ static int tcp_error(struct sock *sk, in
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int mss_now;
@@ -874,6 +875,17 @@ new_segment:
 		i = skb_shinfo(skb)->nr_frags;
+
+		if (zccd != NULL &&			/* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd != NULL &&	/* skb is part of a zcc I/O */
+		    skb_shinfo(skb)->zccd2 != NULL &&
+		    skb_shinfo(skb)->zccd != zccd &&	/* not the same one */
+		    skb_shinfo(skb)->zccd2 != zccd)
+		{
+			tcp_mark_push (tp, skb);
+			goto new_segment;
+		}
+
 		if (can_coalesce(skb, i, page, offset)) {
 			skb_shinfo(skb)->frags[i-1].size += copy;
 		} else if (i < MAX_SKB_FRAGS) {
@@ -884,6 +896,20 @@ new_segment:
 			goto new_segment;
 		}
 
+		if (zccd != NULL &&			/* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd != zccd &&	/* not already referencing this zccd */
+		    skb_shinfo(skb)->zccd2 != zccd)
+		{
+			zccd_get (zccd);		/* bump ref count */
+
+			BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+			if (skb_shinfo(skb)->zccd == NULL)	/* reference this zccd */
+				skb_shinfo(skb)->zccd = zccd;
+			else
+				skb_shinfo(skb)->zccd2 = zccd;
+		}
+
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->ip_summed = CHECKSUM_HW;
@@ -947,7 +973,31 @@ ssize_t tcp_sendpage(struct socket *sock
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
-	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return res;
+}
+
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+			  int flags, zccd_t *zccd)
+{
+	ssize_t res;
+	struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+
+	if (!(sk->route_caps & NETIF_F_SG) ||		/* caller shouldn't waste her time */
+	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS))	/* on double mapping */
+		return -ENOSYS;
+
+#undef TCP_ZC_CSUM_FLAGS
+
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
+
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
 	return res;
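
Note the refusal above: when the route's device cannot do scatter/gather plus hardware checksumming, zero copy buys nothing, since the stack would have to touch every byte anyway. The caller is expected to fall back to an ordinary copying send. A hypothetical caller-side fallback, reusing the names from the send sketch earlier and assuming the -ENOSYS return reconstructed above:

	rc = tcp_sendpage_zccd (sock, page, off, len, 0, &c.zccd);
	if (rc == -ENOSYS)
		/* no SG + hw csum on this route: plain copying sendpage */
		rc = tcp_sendpage (sock, page, off, len, 0);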
@@ -1771,6 +1821,202 @@ recv_urg:
 	goto out;
 }
 
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+		     int len, int nonblock)
+{
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	int copied;
+	long timeo;
+
+	BUG_TRAP (len > 0);
+	/*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+
+	lock_sock(sk);
+
+	TCP_CHECK_TIMER(sk);
+
+	copied = -ENOTCONN;
+	if (sk->state == TCP_LISTEN)
+		goto out;
+
+	copied = 0;
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	do {
+		struct sk_buff * skb;
+		u32 offset;
+		unsigned long used;
+		int exhausted;
+		int eaten;
+
+		/* Are we at urgent data? Stop if we have read anything. */
+		if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+			break;
+
+		/* We need to check signals first, to get correct SIGURG
+		 * handling. FIXME: Need to check this doesn't impact 1003.1g
+		 * and move it down to the bottom of the loop
+		 */
+		if (signal_pending(current)) {
+			if (copied)
+				break;
+			copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+			break;
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->receive_queue);
+
+		if (skb == NULL)		/* nothing ready */
+		{
+			if (copied) {
+				if (sk->err ||
+				    sk->state == TCP_CLOSE ||
+				    (sk->shutdown & RCV_SHUTDOWN) ||
+				    !timeo)
+					break;
+			} else {
+				if (sk->done)
+					break;
+
+				if (sk->err) {
+					copied = sock_error(sk);
+					break;
+				}
+
+				if (sk->shutdown & RCV_SHUTDOWN)
+					break;
+
+				if (sk->state == TCP_CLOSE) {
+					if (!sk->done) {
+						/* This occurs when user tries to read
+						 * from never connected socket.
+						 */
+						copied = -ENOTCONN;
+						break;
+					}
+					break;
+				}
+
+				if (!timeo) {
+					copied = -EAGAIN;
+					break;
+				}
+			}
+
+			cleanup_rbuf(sk, copied);
+			timeo = tcp_data_wait(sk, timeo);
+			continue;
+		}
+
+		BUG_TRAP (atomic_read (&skb->users) == 1);
+
+		exhausted = eaten = 0;
+
+		offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+		if (skb->h.th->syn)
+			offset--;
+
+		used = skb->len - offset;
+
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - tp->copied_seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {		/* at urgent data */
+					if (!sk->urginline) {
+						tp->copied_seq++;	/* discard the single byte of urgent data */
+						offset++;
+						used--;
+					}
+				} else				/* truncate read */
+					used = urg_offset;
+			}
+		}
+
+		BUG_TRAP (used >= 0);
+		if (len < used)
+			used = len;
+
+		if (used == 0)
+			exhausted = 1;
+		else
+		{
+			if (skb_is_nonlinear (skb))
+			{
+				int   rc = skb_linearize (skb, GFP_KERNEL);
+
+				printk ("tcp_recvpackets(): linearising: %d\n", rc);
+
+				if (rc)
+					break;
+			}
+
+			if ((offset + used) == skb->len)	/* consuming the whole packet */
+			{
+				__skb_unlink (skb, &sk->receive_queue);
+				dst_release (skb->dst);
+				__skb_pull (skb, offset);
+				__skb_queue_tail (packets, skb);
+				exhausted = eaten = 1;
+			}
+			else				/* consuming only part of the packet */
+			{
+				struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+
+				if (skb2 == NULL)
+					break;
+
+				dst_release (skb2->dst);
+				__skb_pull (skb2, offset);
+				__skb_trim (skb2, used);
+				__skb_queue_tail (packets, skb2);
+			}
+
+			tp->copied_seq += used;
+			copied += used;
+			len -= used;
+		}
+
+		if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+
+		if (!exhausted)
+			continue;
+
+		if (skb->h.th->fin)
+		{
+			tp->copied_seq++;
+			if (!eaten)
+				tcp_eat_skb (sk, skb);
+			break;
+		}
+
+		if (!eaten)
+			tcp_eat_skb (sk, skb);
+
+	} while (len > 0);
+
+ out:
+	/* Clean up data we have read: This will do ACK frames. */
+	cleanup_rbuf(sk, copied);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return copied;
+}
+
 /*
  * State processing on a close. This implements the state shift for
  * sending our FIN frame. Note that we only send a FIN for some
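
Finally, a hypothetical consumer of tcp_recvpackets(), not part of the patch: the queue receives one reference per packet, and each queued skb is linear (see the skb_linearize() call above) with its data already pulled to the first unread payload byte, so skb->data and skb->len frame exactly the usable bytes. my_consume() stands in for the real packet handler.

static int my_recv_packets (struct sock *sk)
{
	struct sk_buff_head packets;
	struct sk_buff *skb;
	int rc;

	skb_queue_head_init (&packets);

	rc = tcp_recvpackets (sk, &packets, 16 * 1024, /* nonblock */ 0);

	/* process whatever was queued, even on a late error */
	while ((skb = skb_dequeue (&packets)) != NULL) {
		my_consume (skb->data, skb->len);	/* hypothetical handler */
		kfree_skb (skb);	/* drop the queue's reference */
	}

	return rc < 0 ? rc : 0;
}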