--- linux/./include/net/tcp.h 2006-10-10 01:49:23.000000000 +0100 +++ ../2.6.9-41.2chaos/linux/./include/net/tcp.h 2006-09-21 17:15:21.000000000 +0100 @@ -787,6 +787,8 @@ extern int tcp_v4_tw_remember_stam extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size); extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); +extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, + int flags, struct zccd *zccd); extern int tcp_ioctl(struct sock *sk, int cmd, --- linux/./include/linux/skbuff.h 2006-10-10 01:49:23.000000000 +0100 +++ ../2.6.9-41.2chaos/linux/./include/linux/skbuff.h 2006-10-06 18:09:35.000000000 +0100 @@ -134,6 +134,36 @@ struct skb_frag_struct { __u16 size; }; +/* Zero Copy Callback Descriptor + * This struct supports receiving notification when zero-copy network I/O has + * completed. The ZCCD can be embedded in a struct containing the state of a + * zero-copy network send. Every skbuff that references that send's pages also + * keeps a reference on the ZCCD. When they have all been disposed of, the + * reference count on the ZCCD drops to zero and the callback is made, telling + * the original caller that the pages may now be overwritten. */ +struct zccd +{ + atomic_t zccd_refcount; + void (*zccd_callback)(struct zccd *); +}; + +static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *)) +{ + atomic_set (&d->zccd_refcount, 1); + d->zccd_callback = callback; +} + +static inline void zccd_incref (struct zccd *d) /* take a reference */ +{ + atomic_inc (&d->zccd_refcount); +} + +static inline void zccd_decref (struct zccd *d) /* release a reference */ +{ + if (atomic_dec_and_test (&d->zccd_refcount)) + (d->zccd_callback)(d); +} + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. 
*/ @@ -143,6 +173,11 @@ struct skb_shared_info { unsigned short tso_size; unsigned short tso_segs; struct sk_buff *frag_list; + struct zccd *zccd1; + struct zccd *zccd2; + /* NB zero-copy data is normally whole pages. We have 2 zccds in an + * skbuff so we don't unnecessarily split the packet where pages fall + * into the same packet. */ skb_frag_t frags[MAX_SKB_FRAGS]; }; @@ -1070,6 +1105,23 @@ static inline void kunmap_skb_frag(void #endif } +/* This skbuff has dropped its pages: drop refs on any zero-copy callback + * descriptors it has. */ +static inline void skb_complete_zccd (struct sk_buff *skb) +{ + struct skb_shared_info *info = skb_shinfo(skb); + + if (info->zccd1 != NULL) { + zccd_decref(info->zccd1); + info->zccd1 = NULL; + } + + if (info->zccd2 != NULL) { + zccd_decref(info->zccd2); + info->zccd2 = NULL; + } +} + #define skb_queue_walk(queue, skb) \ for (skb = (queue)->next, prefetch(skb->next); \ (skb != (struct sk_buff *)(queue)); \ --- linux/./net/core/dev.c 2006-10-10 01:49:23.000000000 +0100 +++ ../2.6.9-41.2chaos/linux/./net/core/dev.c 2006-09-21 16:53:45.000000000 +0100 @@ -1140,6 +1140,8 @@ int __skb_linearize(struct sk_buff *skb, ninfo->tso_segs = skb_shinfo(skb)->tso_segs; ninfo->nr_frags = 0; ninfo->frag_list = NULL; + ninfo->zccd1 = NULL; /* zero copy completion callback */ + ninfo->zccd2 = NULL; /* not required */ /* Offset between the two in bytes */ offset = data - skb->head; --- linux/./net/core/skbuff.c 2006-10-10 01:49:23.000000000 +0100 +++ ../2.6.9-41.2chaos/linux/./net/core/skbuff.c 2006-10-10 01:46:16.000000000 +0100 @@ -155,6 +155,8 @@ struct sk_buff *alloc_skb(unsigned int s skb_shinfo(skb)->tso_size = 0; skb_shinfo(skb)->tso_segs = 0; skb_shinfo(skb)->frag_list = NULL; + skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */ + skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */ out: return skb; nodata: @@ -189,6 +191,9 @@ void skb_release_data(struct sk_buff *sk { if (!skb->cloned || 
atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) { + /* complete zero-copy callbacks (if any) */ + skb_complete_zccd(skb); + if (skb_shinfo(skb)->nr_frags) { int i; for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) @@ -484,6 +489,18 @@ struct sk_buff *pskb_copy(struct sk_buff get_page(skb_shinfo(n)->frags[i].page); } skb_shinfo(n)->nr_frags = i; + + if (skb_shinfo(skb)->zccd1 != NULL) { + BUG_TRAP(skb_shinfo(n)->zccd1 == NULL); + skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1; + zccd_incref(skb_shinfo(n)->zccd1); + } + + if (skb_shinfo(skb)->zccd2 != NULL) { + BUG_TRAP(skb_shinfo(n)->zccd2 == NULL); + skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; + zccd_incref(skb_shinfo(n)->zccd2); + } } if (skb_shinfo(skb)->frag_list) { @@ -533,6 +550,13 @@ int pskb_expand_head(struct sk_buff *skb memcpy(data + nhead, skb->head, skb->tail - skb->head); memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + /* zero-copy descriptors have been copied into the new shinfo - + * account the new references */ + if (skb_shinfo(skb)->zccd1 != NULL) + zccd_incref(skb_shinfo(skb)->zccd1); + if (skb_shinfo(skb)->zccd2 != NULL) + zccd_incref(skb_shinfo(skb)->zccd2); + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); @@ -694,6 +718,9 @@ int ___pskb_trim(struct sk_buff *skb, un offset = end; } + if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */ + skb_complete_zccd(skb); /* drop zccd refs */ + if (offset < len) { skb->data_len -= skb->len - len; skb->len = len; @@ -846,6 +873,9 @@ pull_pages: } skb_shinfo(skb)->nr_frags = k; + if (k == 0) /* dropped all the pages */ + skb_complete_zccd(skb); /* drop zccd refs */ + skb->tail += delta; skb->data_len -= delta; @@ -1362,6 +1392,15 @@ static void inline skb_split_inside_head for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + /* Transfer zero-copy callback descriptors */ + BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); + 
skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; + skb_shinfo(skb)->zccd1 = NULL; + + BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); + skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; + skb_shinfo(skb)->zccd2 = NULL; + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; skb_shinfo(skb)->nr_frags = 0; skb1->data_len = skb->data_len; @@ -1410,6 +1449,30 @@ static void inline skb_split_no_header(s pos += size; } skb_shinfo(skb1)->nr_frags = k; + + if (k != 0) { + /* skb1 has pages. Transfer or clone the zccds */ + + if (skb_shinfo(skb)->zccd1 != NULL) { + BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL); + skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1; + + if (skb_shinfo(skb)->nr_frags == 0) + skb_shinfo(skb)->zccd1 = NULL; + else + zccd_incref(skb_shinfo(skb)->zccd1); + } + + if (skb_shinfo(skb)->zccd2 != NULL) { + BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL); + skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2; + + if (skb_shinfo(skb)->nr_frags == 0) + skb_shinfo(skb)->zccd2 = NULL; + else + zccd_incref(skb_shinfo(skb)->zccd2); + } + } } /** --- linux/./net/ipv4/tcp_output.c 2006-09-21 00:13:11.000000000 +0100 +++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp_output.c 2006-09-21 18:24:26.000000000 +0100 @@ -562,6 +562,9 @@ static unsigned char *__pskb_trim_head(s } skb_shinfo(skb)->nr_frags = k; + if (k == 0) /* dropped all pages */ + skb_complete_zccd(skb); + skb->tail = skb->data; skb->data_len -= len; skb->len = skb->data_len; --- linux/./net/ipv4/tcp.c 2006-10-10 01:49:23.000000000 +0100 +++ ../2.6.9-41.2chaos/linux/./net/ipv4/tcp.c 2006-10-09 19:03:15.000000000 +0100 @@ -628,8 +628,9 @@ static inline void tcp_push(struct sock } } +/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, - size_t psize, int flags) + size_t psize, int flags, struct zccd *zccd) { struct tcp_opt *tp = tcp_sk(sk); int mss_now; @@ -676,6 +677,16 @@ new_segment: copy = size; i = 
skb_shinfo(skb)->nr_frags; + + if (zccd != NULL && /* completion callback wanted */ + skb_shinfo(skb)->zccd1 != NULL && /* no room for zccd */ + skb_shinfo(skb)->zccd2 != NULL && + skb_shinfo(skb)->zccd1 != zccd && /* room needed */ + skb_shinfo(skb)->zccd2 != zccd) { + tcp_mark_push (tp, skb); + goto new_segment; + } + can_coalesce = skb_can_coalesce(skb, i, page, offset); if (!can_coalesce && i >= MAX_SKB_FRAGS) { tcp_mark_push(tp, skb); @@ -692,6 +703,18 @@ new_segment: skb_fill_page_desc(skb, i, page, offset, copy); } + if (zccd != NULL && /* completion callback wanted */ + skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */ + skb_shinfo(skb)->zccd2 != zccd) { + if (skb_shinfo(skb)->zccd1 == NULL) { + skb_shinfo(skb)->zccd1 = zccd; + } else { + BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); + skb_shinfo(skb)->zccd2 = zccd; + } + zccd_incref(zccd); /* new reference */ + } + skb->len += copy; skb->data_len += copy; skb->truesize += copy; @@ -744,8 +767,8 @@ out_err: return sk_stream_error(sk, flags, err); } -ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, - size_t size, int flags) +ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, + size_t size, int flags, struct zccd *zccd) { ssize_t res; struct sock *sk = sock->sk; @@ -760,12 +783,18 @@ ssize_t tcp_sendpage(struct socket *sock lock_sock(sk); TCP_CHECK_TIMER(sk); - res = do_tcp_sendpages(sk, &page, offset, size, flags); + res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); TCP_CHECK_TIMER(sk); release_sock(sk); return res; } +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) +{ + return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL); +} + #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) @@ -2343,6 +2372,7 @@ EXPORT_SYMBOL(tcp_read_sock); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_sendmsg); EXPORT_SYMBOL(tcp_sendpage); +EXPORT_SYMBOL(tcp_sendpage_zccd); 
EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); EXPORT_SYMBOL(tcp_statistics);