1 --- linux/./include/net/tcp.h 2006-10-10 01:49:23.000000000 +0100
2 +++ ../2.6.9-41.2chaos/linux/./include/net/tcp.h 2006-09-21 17:15:21.000000000 +0100
3 @@ -787,6 +787,8 @@ extern int tcp_v4_tw_remember_stam
4 extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk,
5 struct msghdr *msg, size_t size);
6 extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
7 +extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
8 + int flags, struct zccd *zccd);
10 extern int tcp_ioctl(struct sock *sk,
12 --- linux/./include/linux/skbuff.h 2006-10-10 01:49:23.000000000 +0100
13 +++ ../2.6.9-41.2chaos/linux/./include/linux/skbuff.h 2006-10-06 18:09:35.000000000 +0100
14 @@ -134,6 +134,36 @@ struct skb_frag_struct {
18 +/* Zero Copy Callback Descriptor
19 + * This struct supports receiving notification when zero-copy network I/O has
20 + * completed. The ZCCD can be embedded in a struct containing the state of a
21 + * zero-copy network send. Every skbuff that references that send's pages also
22 + * keeps a reference on the ZCCD. When they have all been disposed of, the
23 + * reference count on the ZCCD drops to zero and the callback is made, telling
24 + * the original caller that the pages may now be overwritten. */
27 + atomic_t zccd_refcount;
28 + void (*zccd_callback)(struct zccd *);
31 +static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *))
33 + atomic_set (&d->zccd_refcount, 1);
34 + d->zccd_callback = callback;
37 +static inline void zccd_incref (struct zccd *d) /* take a reference */
39 + atomic_inc (&d->zccd_refcount);
42 +static inline void zccd_decref (struct zccd *d) /* release a reference */
44 + if (atomic_dec_and_test (&d->zccd_refcount))
45 + (d->zccd_callback)(d);
48 /* This data is invariant across clones and lives at
49 * the end of the header data, ie. at skb->end.
51 @@ -143,6 +173,11 @@ struct skb_shared_info {
52 unsigned short tso_size;
53 unsigned short tso_segs;
54 struct sk_buff *frag_list;
57 + /* NB zero-copy data is normally whole pages. We have 2 zccds in an
58 + * skbuff so we don't unnecessarily split the packet where pages fall
59 + * into the same packet. */
60 skb_frag_t frags[MAX_SKB_FRAGS];
63 @@ -1070,6 +1105,23 @@ static inline void kunmap_skb_frag(void
67 +/* This skbuff has dropped its pages: drop refs on any zero-copy callback
68 + * descriptors it has. */
69 +static inline void skb_complete_zccd (struct sk_buff *skb)
71 + struct skb_shared_info *info = skb_shinfo(skb);
73 + if (info->zccd1 != NULL) {
74 + zccd_decref(info->zccd1);
78 + if (info->zccd2 != NULL) {
79 + zccd_decref(info->zccd2);
84 #define skb_queue_walk(queue, skb) \
85 for (skb = (queue)->next, prefetch(skb->next); \
86 (skb != (struct sk_buff *)(queue)); \
87 --- linux/./net/core/dev.c 2006-10-10 01:49:23.000000000 +0100
88 +++ ../2.6.9-41.2chaos/linux/./net/core/dev.c 2006-09-21 16:53:45.000000000 +0100
89 @@ -1140,6 +1140,8 @@ int __skb_linearize(struct sk_buff *skb,
90 ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
92 ninfo->frag_list = NULL;
93 + ninfo->zccd1 = NULL; /* zero copy completion callback */
94 + ninfo->zccd2 = NULL; /* not required */
96 /* Offset between the two in bytes */
97 offset = data - skb->head;
98 --- linux/./net/core/skbuff.c 2006-10-10 01:49:23.000000000 +0100
99 +++ ../2.6.9-41.2chaos/linux/./net/core/skbuff.c 2006-10-10 01:46:16.000000000 +0100
100 @@ -155,6 +155,8 @@ struct sk_buff *alloc_skb(unsigned int s
101 skb_shinfo(skb)->tso_size = 0;
102 skb_shinfo(skb)->tso_segs = 0;
103 skb_shinfo(skb)->frag_list = NULL;
104 + skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */
105 + skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */
109 @@ -189,6 +191,9 @@ void skb_release_data(struct sk_buff *sk
112 atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
113 + /* complete zero-copy callbacks (if any) */
114 + skb_complete_zccd(skb);
116 if (skb_shinfo(skb)->nr_frags) {
118 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
119 @@ -484,6 +489,18 @@ struct sk_buff *pskb_copy(struct sk_buff
120 get_page(skb_shinfo(n)->frags[i].page);
122 skb_shinfo(n)->nr_frags = i;
124 + if (skb_shinfo(skb)->zccd1 != NULL) {
125 + BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
126 + skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
127 + zccd_incref(skb_shinfo(n)->zccd1);
130 + if (skb_shinfo(skb)->zccd2 != NULL) {
131 + BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
132 + skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
133 + zccd_incref(skb_shinfo(n)->zccd2);
137 if (skb_shinfo(skb)->frag_list) {
138 @@ -533,6 +550,13 @@ int pskb_expand_head(struct sk_buff *skb
139 memcpy(data + nhead, skb->head, skb->tail - skb->head);
140 memcpy(data + size, skb->end, sizeof(struct skb_shared_info));
142 + /* zero-copy descriptors have been copied into the new shinfo -
143 + * account the new references */
144 + if (skb_shinfo(skb)->zccd1 != NULL)
145 + zccd_incref(skb_shinfo(skb)->zccd1);
146 + if (skb_shinfo(skb)->zccd2 != NULL)
147 + zccd_incref(skb_shinfo(skb)->zccd2);
149 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
150 get_page(skb_shinfo(skb)->frags[i].page);
152 @@ -694,6 +718,9 @@ int ___pskb_trim(struct sk_buff *skb, un
156 + if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */
157 + skb_complete_zccd(skb); /* drop zccd refs */
160 skb->data_len -= skb->len - len;
162 @@ -846,6 +873,9 @@ pull_pages:
164 skb_shinfo(skb)->nr_frags = k;
166 + if (k == 0) /* dropped all the pages */
167 + skb_complete_zccd(skb); /* drop zccd refs */
170 skb->data_len -= delta;
172 @@ -1362,6 +1392,15 @@ static void inline skb_split_inside_head
173 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
174 skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
176 + /* Transfer zero-copy callback descriptors */
177 + BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
178 + skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
179 + skb_shinfo(skb)->zccd1 = NULL;
181 + BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
182 + skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
183 + skb_shinfo(skb)->zccd2 = NULL;
185 skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
186 skb_shinfo(skb)->nr_frags = 0;
187 skb1->data_len = skb->data_len;
188 @@ -1410,6 +1449,30 @@ static void inline skb_split_no_header(s
191 skb_shinfo(skb1)->nr_frags = k;
194 + /* skb1 has pages. Transfer or clone the zccds */
196 + if (skb_shinfo(skb)->zccd1 != NULL) {
197 + BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
198 + skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
200 + if (skb_shinfo(skb)->nr_frags == 0)
201 + skb_shinfo(skb)->zccd1 = NULL;
203 + zccd_incref(skb_shinfo(skb)->zccd1);
206 + if (skb_shinfo(skb)->zccd2 != NULL) {
207 + BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
208 + skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
210 + if (skb_shinfo(skb)->nr_frags == 0)
211 + skb_shinfo(skb)->zccd2 = NULL;
213 + zccd_incref(skb_shinfo(skb)->zccd2);
219 --- linux/./net/ipv4/tcp_output.c 2006-09-21 00:13:11.000000000 +0100
220 +++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp_output.c 2006-09-21 18:24:26.000000000 +0100
221 @@ -562,6 +562,9 @@ static unsigned char *__pskb_trim_head(s
223 skb_shinfo(skb)->nr_frags = k;
225 + if (k == 0) /* dropped all pages */
226 + skb_complete_zccd(skb);
228 skb->tail = skb->data;
229 skb->data_len -= len;
230 skb->len = skb->data_len;
231 --- linux/./net/ipv4/tcp.c 2006-10-10 01:49:23.000000000 +0100
232 +++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp.c 2006-10-09 19:03:15.000000000 +0100
233 @@ -628,8 +628,9 @@ static inline void tcp_push(struct sock
237 +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */
238 static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
239 - size_t psize, int flags)
240 + size_t psize, int flags, struct zccd *zccd)
242 struct tcp_opt *tp = tcp_sk(sk);
244 @@ -676,6 +677,16 @@ new_segment:
247 i = skb_shinfo(skb)->nr_frags;
249 + if (zccd != NULL && /* completion callback wanted */
250 + skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */
251 + skb_shinfo(skb)->zccd2 != NULL &&
252 + skb_shinfo(skb)->zccd1 != zccd && /* room needed */
253 + skb_shinfo(skb)->zccd2 != zccd) {
254 + tcp_mark_push (tp, skb);
258 can_coalesce = skb_can_coalesce(skb, i, page, offset);
259 if (!can_coalesce && i >= MAX_SKB_FRAGS) {
260 tcp_mark_push(tp, skb);
261 @@ -692,6 +703,18 @@ new_segment:
262 skb_fill_page_desc(skb, i, page, offset, copy);
265 + if (zccd != NULL && /* completion callback wanted */
266 + skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */
267 + skb_shinfo(skb)->zccd2 != zccd) {
268 + if (skb_shinfo(skb)->zccd1 == NULL) {
269 + skb_shinfo(skb)->zccd1 = zccd;
271 + BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
272 + skb_shinfo(skb)->zccd2 = zccd;
274 + zccd_incref(zccd); /* new reference */
278 skb->data_len += copy;
279 skb->truesize += copy;
280 @@ -744,8 +767,8 @@ out_err:
281 return sk_stream_error(sk, flags, err);
284 -ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
285 - size_t size, int flags)
286 +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset,
287 + size_t size, int flags, struct zccd *zccd)
290 struct sock *sk = sock->sk;
291 @@ -760,12 +783,18 @@ ssize_t tcp_sendpage(struct socket *sock
295 - res = do_tcp_sendpages(sk, &page, offset, size, flags);
296 + res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd);
302 +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
303 + size_t size, int flags)
305 + return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL);
308 #define TCP_PAGE(sk) (sk->sk_sndmsg_page)
309 #define TCP_OFF(sk) (sk->sk_sndmsg_off)
311 @@ -2343,6 +2372,7 @@ EXPORT_SYMBOL(tcp_read_sock);
312 EXPORT_SYMBOL(tcp_recvmsg);
313 EXPORT_SYMBOL(tcp_sendmsg);
314 EXPORT_SYMBOL(tcp_sendpage);
315 +EXPORT_SYMBOL(tcp_sendpage_zccd);
316 EXPORT_SYMBOL(tcp_setsockopt);
317 EXPORT_SYMBOL(tcp_shutdown);
318 EXPORT_SYMBOL(tcp_statistics);