 include/linux/skbuff.h |   30 +++++
 net/core/skbuff.c      |   25 ++++
 net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 311 insertions(+), 3 deletions(-)
Index: linux-2.4.20-rh-20.9/include/linux/skbuff.h
===================================================================
--- linux-2.4.20-rh-20.9.orig/include/linux/skbuff.h	2003-09-13 19:34:24.000000000 +0400
+++ linux-2.4.20-rh-20.9/include/linux/skbuff.h	2003-12-19 14:14:55.000000000 +0300
+/* Support for callback when skb data has been released */
+typedef struct zccd			/* Zero Copy Callback Descriptor */
+{					/* (embed as first member of custom struct) */
+	atomic_t	zccd_count;	/* reference count */
+	void		(*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
+} zccd_t;
+
+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
+{
+	atomic_set (&d->zccd_count, 1);
+	d->zccd_destructor = callback;
+}
+
+static inline void zccd_get (zccd_t *d)		/* take a reference */
+{
+	atomic_inc (&d->zccd_count);
+}
+
+static inline void zccd_put (zccd_t *d)		/* release a reference */
+{
+	if (atomic_dec_and_test (&d->zccd_count))
+		(d->zccd_destructor)(d);
+}
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
 struct skb_shared_info {
 	atomic_t	dataref;
 	unsigned int	nr_frags;
 	struct sk_buff	*frag_list;
+	zccd_t		*zccd;			/* zero copy descriptor */
+	zccd_t		*zccd2;			/* 2nd zero copy descriptor */
+	/* NB we expect zero-copy data to be at least 1 packet, so
+	 * having 2 zccds means we don't unnecessarily split the packet
+	 * where consecutive zero-copy sends abut.
+	 */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
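
The zccd is meant to be embedded as the first member of a caller-defined structure, so the destructor can recover the enclosing object once the network stack drops its last reference. A minimal caller-side sketch (not part of the patch; my_zcc_desc, my_zcc_destructor and my_zcc_desc_init are hypothetical names, and <linux/completion.h> is assumed for the wakeup):

#include <linux/skbuff.h>		/* zccd_t and zccd_init() from this patch */
#include <linux/completion.h>

struct my_zcc_desc {
	zccd_t			zccd;	/* must be the first member */
	struct completion	done;	/* completed when the last reference is dropped */
};

static void my_zcc_destructor(zccd_t *d)
{
	/* zccd is first, so the zccd_t pointer is also the enclosing struct */
	struct my_zcc_desc *desc = (struct my_zcc_desc *)d;

	complete(&desc->done);		/* pages may now be reused or unpinned */
}

static void my_zcc_desc_init(struct my_zcc_desc *desc)
{
	zccd_init(&desc->zccd, my_zcc_destructor);	/* refcount starts at 1 (caller's ref) */
	init_completion(&desc->done);
}

The reference taken by zccd_init() belongs to the caller; the destructor only runs after that reference and every skb-held reference have been put.
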
Index: linux-2.4.20-rh-20.9/include/net/tcp.h
===================================================================
--- linux-2.4.20-rh-20.9.orig/include/net/tcp.h	2003-09-13 19:34:25.000000000 +0400
+++ linux-2.4.20-rh-20.9/include/net/tcp.h	2003-12-19 14:14:55.000000000 +0300
 extern int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+				 int flags, zccd_t *zccd);
 extern int tcp_ioctl(struct sock *sk,
			int len, int nonblock,
			int flags, int *addr_len);
+extern int tcp_recvpackets(struct sock *sk,
+			   struct sk_buff_head *packets,
+			   int len, int nonblock);
 extern int tcp_listen_start(struct sock *sk);
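
A hedged usage sketch for tcp_sendpage_zccd(), reusing the hypothetical my_zcc_desc above: send one page without copying, then wait until every skb that still references the page (including any copies the stack makes) has been released before reusing the page.

static ssize_t my_send_page_zc(struct socket *sock, struct page *page,
			       int offset, size_t size)
{
	struct my_zcc_desc desc;
	ssize_t rc;

	my_zcc_desc_init(&desc);

	rc = tcp_sendpage_zccd(sock, page, offset, size, 0, &desc.zccd);

	zccd_put(&desc.zccd);			/* drop the initial reference from zccd_init() */
	wait_for_completion(&desc.done);	/* returns once no skb references the page */

	return rc;
}

If nothing was queued and the descriptor was never attached to an skb, the zccd_put() above drops the only reference and the wait returns immediately.
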
Index: linux-2.4.20-rh-20.9/net/netsyms.c
===================================================================
--- linux-2.4.20-rh-20.9.orig/net/netsyms.c	2003-09-13 19:34:24.000000000 +0400
+++ linux-2.4.20-rh-20.9/net/netsyms.c	2003-12-19 14:15:24.000000000 +0300
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(tcp_cwnd_application_limited);
-EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_recvpackets);
 EXPORT_SYMBOL(sysctl_tcp_low_latency);
 EXPORT_SYMBOL(tcp_write_xmit);
+EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_sendpage_zccd);
 EXPORT_SYMBOL(tcp_read_sock);
 EXPORT_SYMBOL(netlink_set_err);
Index: linux-2.4.20-rh-20.9/net/core/skbuff.c
===================================================================
--- linux-2.4.20-rh-20.9.orig/net/core/skbuff.c	2003-09-13 19:34:19.000000000 +0400
+++ linux-2.4.20-rh-20.9/net/core/skbuff.c	2003-12-19 14:14:56.000000000 +0300
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd = NULL;		/* skbuffs kick off with NO user zero copy descriptors */
+	skb_shinfo(skb)->zccd2 = NULL;
 	    atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+		if (skb_shinfo(skb)->zccd != NULL)	/* zero copy callback descriptor? */
+			zccd_put (skb_shinfo(skb)->zccd);	/* release hold */
+		if (skb_shinfo(skb)->zccd2 != NULL)	/* 2nd zero copy callback descriptor? */
+			zccd_put (skb_shinfo(skb)->zccd2);	/* release hold */
 		if (skb_shinfo(skb)->nr_frags) {
 			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd = NULL;		/* copied data => no user zero copy descriptor */
+	skb_shinfo(skb)->zccd2 = NULL;
 	/* We are no longer a clone, even if we were. */
 	n->data_len = skb->data_len;
+	if (skb_shinfo(skb)->zccd != NULL)	/* user zero copy descriptor? */
+		zccd_get (skb_shinfo(skb)->zccd);	/* 1 more ref (pages are shared) */
+	skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
+
+	if (skb_shinfo(skb)->zccd2 != NULL)	/* 2nd user zero copy descriptor? */
+		zccd_get (skb_shinfo(skb)->zccd2);	/* 1 more ref (pages are shared) */
+	skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+
 	if (skb_shinfo(skb)->nr_frags) {
 	int size = nhead + (skb->end - skb->head) + ntail;
+	zccd_t *zccd = skb_shinfo(skb)->zccd;		/* stash user zero copy descriptor */
+	zccd_t *zccd2 = skb_shinfo(skb)->zccd2;		/* stash 2nd user zero copy descriptor */
 	if (skb_shinfo(skb)->frag_list)
 		skb_clone_fraglist(skb);
+	if (zccd != NULL)			/* user zero copy descriptor? */
+		zccd_get (zccd);		/* extra ref (pages are shared) */
+	if (zccd2 != NULL)			/* 2nd user zero copy descriptor? */
+		zccd_get (zccd2);		/* extra ref (pages are shared) */
+
 	skb_release_data(skb);
 	off = (data+nhead) - skb->head;
 	atomic_set(&skb_shinfo(skb)->dataref, 1);
+	skb_shinfo(skb)->zccd = zccd;
+	skb_shinfo(skb)->zccd2 = zccd2;
Index: linux-2.4.20-rh-20.9/net/ipv4/tcp.c
===================================================================
--- linux-2.4.20-rh-20.9.orig/net/ipv4/tcp.c	2003-09-13 19:34:25.000000000 +0400
+++ linux-2.4.20-rh-20.9/net/ipv4/tcp.c	2003-12-19 14:14:56.000000000 +0300
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd);
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
-ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
+ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags, zccd_t *zccd)
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 		i = skb_shinfo(skb)->nr_frags;
+		if (zccd != NULL &&			/* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd != NULL &&	/* skb is part of a zcc I/O */
+		    skb_shinfo(skb)->zccd2 != NULL &&
+		    skb_shinfo(skb)->zccd != zccd &&	/* not the same one */
+		    skb_shinfo(skb)->zccd2 != zccd)
+			tcp_mark_push (tp, skb);
 		if (can_coalesce(skb, i, page, offset)) {
 			skb_shinfo(skb)->frags[i-1].size += copy;
 		} else if (i < MAX_SKB_FRAGS) {
+		if (zccd != NULL &&			/* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd != zccd &&	/* not already referencing this zccd */
+		    skb_shinfo(skb)->zccd2 != zccd)
+		{
+			zccd_get (zccd);		/* bump ref count */
+
+			BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+
+			if (skb_shinfo(skb)->zccd == NULL)	/* reference this zccd */
+				skb_shinfo(skb)->zccd = zccd;
+			else
+				skb_shinfo(skb)->zccd2 = zccd;
+		}
 		skb->data_len += copy;
 		skb->ip_summed = CHECKSUM_HW;
-	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, NULL);
+	TCP_CHECK_TIMER(sk);
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+			  int flags, zccd_t *zccd)
+	struct sock *sk = sock->sk;
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
+	if (!(sk->route_caps & NETIF_F_SG) ||		/* caller shouldn't waste her time */
+	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS))	/* on double mapping */
+#undef TCP_ZC_CSUM_FLAGS
+	TCP_CHECK_TIMER(sk);
+	res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
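
Because skb_shared_info carries at most two descriptors, do_tcp_sendpages() marks the skb for push as soon as a third distinct zccd would have to be attached to it; consecutive sends that share one descriptor can keep coalescing into the same skb. A hedged sketch of a caller batching several pages under a single descriptor (again using the hypothetical my_zcc_desc; MSG_MORE is the stock flag, present since 2.4.4):

static ssize_t my_send_pages_zc(struct socket *sock, struct page **pages,
				int npages, size_t bytes_per_page)
{
	struct my_zcc_desc desc;
	ssize_t rc = 0, sent = 0;
	int i;

	my_zcc_desc_init(&desc);

	for (i = 0; i < npages; i++) {
		/* one zccd for the whole burst: consecutive sends may share skbs */
		rc = tcp_sendpage_zccd(sock, pages[i], 0, bytes_per_page,
				       i + 1 < npages ? MSG_MORE : 0, &desc.zccd);
		if (rc < 0)
			break;
		sent += rc;
	}

	zccd_put(&desc.zccd);			/* drop the caller's initial reference */
	wait_for_completion(&desc.done);	/* all pages released by the stack */

	return rc < 0 ? rc : sent;
}
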
@@ -1771,6 +1821,202 @@
+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
+		     int len, int nonblock)
+	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+	BUG_TRAP (len > 0);
+	/*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
+	TCP_CHECK_TIMER(sk);
+	copied = -ENOTCONN;
+	if (sk->state == TCP_LISTEN)
+	timeo = sock_rcvtimeo(sk, nonblock);
+		struct sk_buff * skb;
+		unsigned long used;
+		/* Are we at urgent data? Stop if we have read anything. */
+		if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
+		/* We need to check signals first, to get correct SIGURG
+		 * handling. FIXME: Need to check this doesn't impact 1003.1g
+		 * and move it down to the bottom of the loop
+		 */
+		if (signal_pending(current)) {
+			copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+		/* Next get a buffer. */
+		skb = skb_peek(&sk->receive_queue);
+		if (skb == NULL)		/* nothing ready */
+			    sk->state == TCP_CLOSE ||
+			    (sk->shutdown & RCV_SHUTDOWN) ||
+			copied = sock_error(sk);
+			if (sk->shutdown & RCV_SHUTDOWN)
+			if (sk->state == TCP_CLOSE) {
+				/* This occurs when user tries to read
+				 * from never connected socket.
+				 */
+				copied = -ENOTCONN;
+			cleanup_rbuf(sk, copied);
+			timeo = tcp_data_wait(sk, timeo);
+		BUG_TRAP (atomic_read (&skb->users) == 1);
+		exhausted = eaten = 0;
+		offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
+		if (skb->h.th->syn)
+		used = skb->len - offset;
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - tp->copied_seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {		/* at urgent data */
+					if (!sk->urginline) {
+						tp->copied_seq++;	/* discard the single byte of urgent data */
+				} else				/* truncate read */
+		BUG_TRAP (used >= 0);
+		if (skb_is_nonlinear (skb))
+			int rc = skb_linearize (skb, GFP_KERNEL);
+			printk ("tcp_recvpackets(): linearising: %d\n", rc);
+		if ((offset + used) == skb->len)	/* consuming the whole packet */
+			__skb_unlink (skb, &sk->receive_queue);
+			dst_release (skb->dst);
+			__skb_pull (skb, offset);
+			__skb_queue_tail (packets, skb);
+			exhausted = eaten = 1;
+		else					/* consuming only part of the packet */
+			struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
+			dst_release (skb2->dst);
+			__skb_pull (skb2, offset);
+			__skb_trim (skb2, used);
+			__skb_queue_tail (packets, skb2);
+		tp->copied_seq += used;
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tcp_fast_path_check(sk, tp);
+		if (skb->h.th->fin)
+			tcp_eat_skb (sk, skb);
+			tcp_eat_skb (sk, skb);
+	/* Clean up data we have read: This will do ACK frames. */
+	cleanup_rbuf(sk, copied);
+	TCP_CHECK_TIMER(sk);
  * State processing on a close. This implements the state shift for
  * sending our FIN frame. Note that we only send a FIN for some
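
On the receive side, tcp_recvpackets() hands back whole, linearised packets on the caller's queue instead of copying into an iovec; each queued skb is already pulled past previously consumed bytes and trimmed to this read, and its sequence space is consumed so cleanup_rbuf() can ACK it. A hedged caller sketch (my_recv_packets is hypothetical; the return value is assumed to be the byte count on success):

static int my_recv_packets(struct socket *sock, int want)
{
	struct sk_buff_head pkts;
	struct sk_buff *skb;
	int rc;

	skb_queue_head_init(&pkts);

	rc = tcp_recvpackets(sock->sk, &pkts, want, 0 /* block */);

	while ((skb = skb_dequeue(&pkts)) != NULL) {
		/* ... consume skb->data .. skb->data + skb->len ... */
		kfree_skb(skb);
	}

	return rc;
}
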