1 diff -ur linux-2.6.5-7.252/include/linux/skbuff.h linux-2.6.5-7.252-tcp/include/linux/skbuff.h
2 --- linux-2.6.5-7.252/include/linux/skbuff.h 2006-10-11 21:46:38.000000000 +0300
3 +++ linux-2.6.5-7.252-tcp/include/linux/skbuff.h 2006-10-11 21:52:56.000000000 +0300
8 +/* Zero Copy Callback Descriptor
9 + * This struct supports receiving notification when zero-copy network I/O has
10 + * completed. The ZCCD can be embedded in a struct containing the state of a
11 + * zero-copy network send. Every skbuff that references that send's pages also
12 + * keeps a reference on the ZCCD. When they have all been disposed of, the
13 + * reference count on the ZCCD drops to zero and the callback is made, telling
14 + * the original caller that the pages may now be overwritten. */
17 + atomic_t zccd_refcount;
18 + void (*zccd_callback)(struct zccd *);
21 +static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *))
23 + atomic_set (&d->zccd_refcount, 1);
24 + d->zccd_callback = callback;
27 +static inline void zccd_incref (struct zccd *d) /* take a reference */
29 + atomic_inc (&d->zccd_refcount);
32 +static inline void zccd_decref (struct zccd *d) /* release a reference */
34 + if (atomic_dec_and_test (&d->zccd_refcount))
35 + (d->zccd_callback)(d);
38 /* This data is invariant across clones and lives at
39 * the end of the header data, ie. at skb->end.
42 unsigned short tso_size;
43 unsigned short tso_segs;
44 struct sk_buff *frag_list;
47 + /* NB zero-copy data is normally whole pages. We have 2 zccds in an
48 + * skbuff so we don't unnecessarily split the packet where pages fall
49 + * into the same packet. */
50 skb_frag_t frags[MAX_SKB_FRAGS];
53 @@ -1152,6 +1187,23 @@
57 +/* This skbuff has dropped its pages: drop refs on any zero-copy callback
58 + * descriptors it has. */
59 +static inline void skb_complete_zccd (struct sk_buff *skb)
61 + struct skb_shared_info *info = skb_shinfo(skb);
63 + if (info->zccd1 != NULL) {
64 + zccd_decref(info->zccd1);
68 + if (info->zccd2 != NULL) {
69 + zccd_decref(info->zccd2);
74 #define skb_queue_walk(queue, skb) \
75 for (skb = (queue)->next, prefetch(skb->next); \
76 (skb != (struct sk_buff *)(queue)); \
77 diff -ur linux-2.6.5-7.252/include/net/tcp.h linux-2.6.5-7.252-tcp/include/net/tcp.h
78 --- linux-2.6.5-7.252/include/net/tcp.h 2006-10-11 21:46:38.000000000 +0300
79 +++ linux-2.6.5-7.252-tcp/include/net/tcp.h 2006-10-11 21:52:56.000000000 +0300
81 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
82 struct msghdr *msg, size_t size);
83 extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
84 +extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
85 + int flags, struct zccd *zccd);
87 extern int tcp_ioctl(struct sock *sk,
89 diff -ur linux-2.6.5-7.252/net/core/dev.c linux-2.6.5-7.252-tcp/net/core/dev.c
90 --- linux-2.6.5-7.252/net/core/dev.c 2006-10-11 21:46:38.000000000 +0300
91 +++ linux-2.6.5-7.252-tcp/net/core/dev.c 2006-10-11 21:52:56.000000000 +0300
93 ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
95 ninfo->frag_list = NULL;
96 + ninfo->zccd1 = NULL; /* zero copy completion callback */
97 + ninfo->zccd2 = NULL; /* not required */
99 /* Offset between the two in bytes */
100 offset = data - skb->head;
101 diff -ur linux-2.6.5-7.252/net/core/skbuff.c linux-2.6.5-7.252-tcp/net/core/skbuff.c
102 --- linux-2.6.5-7.252/net/core/skbuff.c 2006-10-11 21:46:38.000000000 +0300
103 +++ linux-2.6.5-7.252-tcp/net/core/skbuff.c 2006-10-11 22:06:31.000000000 +0300
105 skb_shinfo(skb)->tso_size = 0;
106 skb_shinfo(skb)->tso_segs = 0;
107 skb_shinfo(skb)->frag_list = NULL;
108 + skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */
109 + skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */
116 atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
117 + /* complete zero-copy callbacks (if any) */
118 + skb_complete_zccd(skb);
120 if (skb_shinfo(skb)->nr_frags) {
122 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
124 skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
125 get_page(skb_shinfo(n)->frags[i].page);
128 + /* Transfer zero-copy callback descriptors */
129 + BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
130 + skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
131 + skb_shinfo(skb)->zccd1 = NULL;
133 + BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
134 + skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
135 + skb_shinfo(skb)->zccd2 = NULL;
137 skb_shinfo(n)->nr_frags = i;
139 + if (skb_shinfo(skb)->zccd1 != NULL) {
140 + BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
141 + skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
142 + zccd_incref(skb_shinfo(n)->zccd1);
145 + if (skb_shinfo(skb)->zccd2 != NULL) {
146 + BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
147 + skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
148 + zccd_incref(skb_shinfo(n)->zccd2);
151 skb_shinfo(n)->tso_size = skb_shinfo(skb)->tso_size;
152 skb_shinfo(n)->tso_segs = skb_shinfo(skb)->tso_segs;
154 memcpy(data + nhead, skb->head, skb->tail - skb->head);
155 memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
157 + /* zero-copy descriptors have been copied into the new shinfo -
158 + * account the new references */
159 + if (skb_shinfo(skb)->zccd1 != NULL)
160 + zccd_incref(skb_shinfo(skb)->zccd1);
161 + if (skb_shinfo(skb)->zccd2 != NULL)
162 + zccd_incref(skb_shinfo(skb)->zccd2);
164 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
165 get_page(skb_shinfo(skb)->frags[i].page);
171 + if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */
172 + skb_complete_zccd(skb); /* drop zccd refs */
175 skb->data_len -= skb->len - len;
179 skb_shinfo(skb)->nr_frags = k;
181 + if (k == 0) /* dropped all the pages */
182 + skb_complete_zccd(skb); /* drop zccd refs */
185 skb->data_len -= delta;
187 diff -ur linux-2.6.5-7.252/net/ipv4/tcp.c linux-2.6.5-7.252-tcp/net/ipv4/tcp.c
188 --- linux-2.6.5-7.252/net/ipv4/tcp.c 2006-10-11 21:46:38.000000000 +0300
189 +++ linux-2.6.5-7.252-tcp/net/ipv4/tcp.c 2006-10-11 23:15:24.000000000 +0300
193 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
194 - size_t psize, int flags);
195 + size_t psize, int flags, struct zccd *zccd);
197 static inline int can_coalesce(struct sk_buff *skb, int i, struct page *page,
203 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
204 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
205 - size_t psize, int flags)
206 + size_t psize, int flags, struct zccd *zccd)
208 struct tcp_opt *tp = tcp_sk(sk);
213 i = skb_shinfo(skb)->nr_frags;
215 + if (zccd != NULL && /* completion callback wanted */
216 + skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */
217 + skb_shinfo(skb)->zccd2 != NULL &&
218 + skb_shinfo(skb)->zccd1 != zccd && /* room needed */
219 + skb_shinfo(skb)->zccd2 != zccd) {
220 + tcp_mark_push (tp, skb);
224 if (can_coalesce(skb, i, page, offset)) {
225 skb_shinfo(skb)->frags[i - 1].size += copy;
226 } else if (i < MAX_SKB_FRAGS) {
231 + if (zccd != NULL && /* completion callback wanted */
232 + skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */
233 + skb_shinfo(skb)->zccd2 != zccd) {
234 + if (skb_shinfo(skb)->zccd1 == NULL) {
235 + skb_shinfo(skb)->zccd1 = zccd;
237 + BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
238 + skb_shinfo(skb)->zccd2 = zccd;
240 + zccd_incref(zccd); /* new reference */
244 skb->data_len += copy;
245 skb->ip_summed = CHECKSUM_HW;
247 return tcp_error(sk, flags, err);
250 -ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
251 - size_t size, int flags)
252 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset,
253 + size_t size, int flags, struct zccd *zccd)
256 struct sock *sk = sock->sk;
257 @@ -1003,12 +1026,19 @@
261 - res = do_tcp_sendpages(sk, &page, offset, size, flags);
262 + res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
268 +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
269 + size_t size, int flags)
271 + return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL);
275 #define TCP_PAGE(sk) (inet_sk(sk)->sndmsg_page)
276 #define TCP_OFF(sk) (inet_sk(sk)->sndmsg_off)
278 @@ -2872,6 +2902,7 @@
279 EXPORT_SYMBOL(tcp_recvmsg);
280 EXPORT_SYMBOL(tcp_sendmsg);
281 EXPORT_SYMBOL(tcp_sendpage);
282 +EXPORT_SYMBOL(tcp_sendpage_zccd);
283 EXPORT_SYMBOL(tcp_setsockopt);
284 EXPORT_SYMBOL(tcp_shutdown);
285 EXPORT_SYMBOL(tcp_sockets_allocated);
286 diff -ur linux-2.6.5-7.252/net/ipv4/tcp_output.c linux-2.6.5-7.252-tcp/net/ipv4/tcp_output.c
287 --- linux-2.6.5-7.252/net/ipv4/tcp_output.c 2006-10-11 21:46:38.000000000 +0300
288 +++ linux-2.6.5-7.252-tcp/net/ipv4/tcp_output.c 2006-10-11 22:14:04.000000000 +0300
292 skb_shinfo(skb1)->nr_frags = k;
295 + /* skb1 has pages. Transfer or clone the zccds */
297 + if (skb_shinfo(skb)->zccd1 != NULL) {
298 + BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
299 + skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
301 + if (skb_shinfo(skb)->nr_frags == 0)
302 + skb_shinfo(skb)->zccd1 = NULL;
304 + zccd_incref(skb_shinfo(skb)->zccd1);
307 + if (skb_shinfo(skb)->zccd2 != NULL) {
308 + BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
309 + skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
311 + if (skb_shinfo(skb)->nr_frags == 0)
312 + skb_shinfo(skb)->zccd2 = NULL;
314 + zccd_incref(skb_shinfo(skb)->zccd2);
322 skb_shinfo(skb)->nr_frags = k;
324 + if (k == 0) /* dropped all pages */
325 + skb_complete_zccd(skb);
327 skb->tail = skb->data;
328 skb->data_len -= len;
329 skb->len = skb->data_len;