Zero-copy completion callbacks for TCP sendpage (linux-2.4.21-32.0.1.EL).

Adds a reference-counted "zero-copy callback descriptor" (struct zccd) that
can be attached to an skbuff's shared info (two slots per skb: zccd1, zccd2).
Every skb that references the pages of a zero-copy send holds a reference on
the descriptor; when the last reference is dropped the descriptor's callback
is invoked.  tcp_sendpage_zccd() is the new exported entry point and
tcp_sendpage() becomes a wrapper that passes a NULL descriptor.

NOTE(review): this copy was rebuilt from a whitespace-collapsed paste.
Indentation and the position of blank context lines were inferred from the
hunk line counts; verify against the target tree before applying.

diff -uNr linux-2.4.21-32.0.1.EL/include/linux/skbuff.h linux-2.4.21-32.0.1.EL-newzc/include/linux/skbuff.h
--- linux-2.4.21-32.0.1.EL/include/linux/skbuff.h	2005-06-02 05:51:57.000000000 +0300
+++ linux-2.4.21-32.0.1.EL-newzc/include/linux/skbuff.h	2006-10-23 23:54:31.000000000 +0300
@@ -116,6 +116,36 @@
 	__u16 size;
 };
 
+/* Zero Copy Callback Descriptor
+ * This struct supports receiving notification when zero-copy network I/O has
+ * completed. The ZCCD can be embedded in a struct containing the state of a
+ * zero-copy network send. Every skbuff that references that send's pages also
+ * keeps a reference on the ZCCD. When they have all been disposed of, the
+ * reference count on the ZCCD drops to zero and the callback is made, telling
+ * the original caller that the pages may now be overwritten. */
+struct zccd
+{
+	atomic_t zccd_refcount;
+	void (*zccd_callback)(struct zccd *);
+};
+
+static inline void zccd_init (struct zccd *d, void (*callback)(struct zccd *))
+{
+	atomic_set (&d->zccd_refcount, 1);
+	d->zccd_callback = callback;
+}
+
+static inline void zccd_incref (struct zccd *d) /* take a reference */
+{
+	atomic_inc (&d->zccd_refcount);
+}
+
+static inline void zccd_decref (struct zccd *d) /* release a reference */
+{
+	if (atomic_dec_and_test (&d->zccd_refcount))
+		(d->zccd_callback)(d);
+}
+
 /* This data is invariant across clones and lives at
  * the end of the header data, ie. at skb->end.
  */
@@ -123,6 +153,11 @@
 	atomic_t	dataref;
 	unsigned int	nr_frags;
 	struct sk_buff	*frag_list;
+	struct zccd *zccd1;
+	struct zccd *zccd2;
+	/* NB zero-copy data is normally whole pages. We have 2 zccds in an
+	 * skbuff so we don't unnecessarily split the packet where pages fall
+	 * into the same packet. */
 	skb_frag_t	frags[MAX_SKB_FRAGS];
 };
 
@@ -1153,6 +1188,23 @@
 #endif
 }
 
+/* This skbuf has dropped its pages: drop refs on any zero-copy callback
+ * descriptors it has. */
+static inline void skb_complete_zccd (struct sk_buff *skb)
+{
+	struct skb_shared_info *info = skb_shinfo(skb);
+
+	if (info->zccd1 != NULL) {
+		zccd_decref(info->zccd1);
+		info->zccd1 = NULL;
+	}
+
+	if (info->zccd2 != NULL) {
+		zccd_decref(info->zccd2);
+		info->zccd2 = NULL;
+	}
+}
+
 #define skb_queue_walk(queue, skb) \
 		for (skb = (queue)->next;			\
 		     (skb != (struct sk_buff *)(queue));	\
diff -uNr linux-2.4.21-32.0.1.EL/include/net/tcp.h linux-2.4.21-32.0.1.EL-newzc/include/net/tcp.h
--- linux-2.4.21-32.0.1.EL/include/net/tcp.h	2005-06-02 05:51:51.000000000 +0300
+++ linux-2.4.21-32.0.1.EL-newzc/include/net/tcp.h	2006-10-23 23:54:37.000000000 +0300
@@ -636,6 +636,8 @@
 
 extern int			tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
 extern ssize_t			tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
+				 int flags, struct zccd *zccd);
 
 extern int			tcp_ioctl(struct sock *sk,
 					  int cmd,
diff -uNr linux-2.4.21-32.0.1.EL/net/core/skbuff.c linux-2.4.21-32.0.1.EL-newzc/net/core/skbuff.c
--- linux-2.4.21-32.0.1.EL/net/core/skbuff.c	2005-06-02 05:51:57.000000000 +0300
+++ linux-2.4.21-32.0.1.EL-newzc/net/core/skbuff.c	2006-10-23 23:44:10.000000000 +0300
@@ -210,6 +210,9 @@
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */
+	skb_shinfo(skb)->zccd2 = NULL; /* not required (yet) */
+
 	return skb;
 
 nodata:
@@ -280,6 +283,9 @@
 {
 	if (!skb->cloned ||
 	    atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
+		/* complete zero-copy callbacks (if any) */
+		skb_complete_zccd(skb);
+
 		if (skb_shinfo(skb)->nr_frags) {
 			int i;
 			for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -550,6 +556,8 @@
 	atomic_set(&(skb_shinfo(skb)->dataref), 1);
 	skb_shinfo(skb)->nr_frags = 0;
 	skb_shinfo(skb)->frag_list = NULL;
+	skb_shinfo(skb)->zccd1 = NULL; /* zero-copy completion callback */
+	skb_shinfo(skb)->zccd2 = NULL; /* not required */
 
 	/* We are no longer a clone, even if we were. */
 	skb->cloned = 0;
@@ -604,6 +612,18 @@
 			get_page(skb_shinfo(n)->frags[i].page);
 		}
 		skb_shinfo(n)->nr_frags = i;
+
+		if (skb_shinfo(skb)->zccd1 != NULL) {
+			BUG_TRAP(skb_shinfo(n)->zccd1 == NULL);
+			skb_shinfo(n)->zccd1 = skb_shinfo(skb)->zccd1;
+			zccd_incref(skb_shinfo(n)->zccd1);
+		}
+
+		if (skb_shinfo(skb)->zccd2 != NULL) {
+			BUG_TRAP(skb_shinfo(n)->zccd2 == NULL);
+			skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
+			zccd_incref(skb_shinfo(n)->zccd2);
+		}
 	}
 
 	if (skb_shinfo(skb)->frag_list) {
@@ -653,6 +673,13 @@
 	memcpy(data+nhead, skb->head, skb->tail-skb->head);
 	memcpy(data+size, skb->end, sizeof(struct skb_shared_info));
 
+	/* zero-copy descriptors have been copied into the new shinfo -
+	 * account the new references */
+	if (skb_shinfo(skb)->zccd1 != NULL)
+		zccd_incref(skb_shinfo(skb)->zccd1);
+	if (skb_shinfo(skb)->zccd2 != NULL)
+		zccd_incref(skb_shinfo(skb)->zccd2);
+
 	for (i=0; i<skb_shinfo(skb)->nr_frags; i++)
 		get_page(skb_shinfo(skb)->frags[i].page);
 
@@ -809,6 +836,9 @@
 		offset = end;
 	}
 
+	if (skb_shinfo(skb)->nr_frags == 0) /* dropped all the pages */
+		skb_complete_zccd(skb); /* drop zccd refs */
+
 	if (offset < len) {
 		skb->data_len -= skb->len - len;
 		skb->len = len;
@@ -962,6 +992,9 @@
 	}
 	skb_shinfo(skb)->nr_frags = k;
 
+	if (k == 0) /* dropped all the pages */
+		skb_complete_zccd(skb); /* drop zccd refs */
+
 	skb->tail += delta;
 	skb->data_len -= delta;
 
diff -uNr linux-2.4.21-32.0.1.EL/net/ipv4/tcp.c linux-2.4.21-32.0.1.EL-newzc/net/ipv4/tcp.c
--- linux-2.4.21-32.0.1.EL/net/ipv4/tcp.c	2005-06-02 05:51:51.000000000 +0300
+++ linux-2.4.21-32.0.1.EL-newzc/net/ipv4/tcp.c	2006-10-24 00:10:34.000000000 +0300
@@ -1015,7 +1015,7 @@
 	goto out;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags);
+ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags, struct zccd *zccd);
 
 static inline int
 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
@@ -1094,7 +1094,7 @@
 	return err;
 }
 
-ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags)
+ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags, struct zccd *zccd)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	int mss_now;
@@ -1147,6 +1147,17 @@
 			copy = size;
 
 		i = skb_shinfo(skb)->nr_frags;
+
+		if (zccd != NULL && /* this is a zcc I/O */
+		    skb_shinfo(skb)->zccd1 != NULL && /* skb is part of a zcc I/O */
+		    skb_shinfo(skb)->zccd2 != NULL &&
+		    skb_shinfo(skb)->zccd1 != zccd && /* not the same one */
+		    skb_shinfo(skb)->zccd2 != zccd)
+		{
+			tcp_mark_push (tp, skb);
+			goto new_segment;
+		}
+
 		if (can_coalesce(skb, i, page, offset)) {
 			skb_shinfo(skb)->frags[i-1].size += copy;
 		} else if (i < MAX_SKB_FRAGS) {
@@ -1157,6 +1168,18 @@
 			goto new_segment;
 		}
 
+		if (zccd != NULL && /* completion callback wanted */
+		    skb_shinfo(skb)->zccd1 != zccd && /* new to this skbuf */
+		    skb_shinfo(skb)->zccd2 != zccd) {
+			if (skb_shinfo(skb)->zccd1 == NULL) {
+				skb_shinfo(skb)->zccd1 = zccd;
+			} else {
+				BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
+				skb_shinfo(skb)->zccd2 = zccd;
+			}
+			zccd_incref(zccd); /* new reference */
+		}
+
 		skb->len += copy;
 		skb->data_len += copy;
 		skb->ip_summed = CHECKSUM_HW;
@@ -1209,7 +1232,8 @@
 	return tcp_error(sk, flags, err);
 }
 
-ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset,
+			  size_t size, int flags, struct zccd *zccd)
 {
 	struct kveclet let = { page, offset, size };
 	ssize_t res;
@@ -1224,12 +1248,18 @@
 
 	lock_sock(sk);
 	TCP_CHECK_TIMER(sk);
-	res = do_tcp_sendpages(sk, &let, 0, size, flags);
+	res = do_tcp_sendpages(sk, &let, 0, size, flags, zccd);
 	TCP_CHECK_TIMER(sk);
 	release_sock(sk);
 	return res;
 }
 
+ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+	return tcp_sendpage_zccd(sock, page, offset, size, flags, NULL);
+}
+
+
 static void tcp_kvec_write_worker(struct tcp_write_async_info *info)
 {
 	struct sock *sk = info->sk;
@@ -1238,7 +1268,7 @@
 	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
 		BUG();
 
-	res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT);
+	res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT, NULL);
 	if (res > 0)
 		info->done += res;
 
diff -uNr linux-2.4.21-32.0.1.EL/net/ipv4/tcp_output.c linux-2.4.21-32.0.1.EL-newzc/net/ipv4/tcp_output.c
--- linux-2.4.21-32.0.1.EL/net/ipv4/tcp_output.c	2005-06-02 05:51:50.000000000 +0300
+++ linux-2.4.21-32.0.1.EL-newzc/net/ipv4/tcp_output.c	2006-10-23 23:44:10.000000000 +0300
@@ -363,6 +363,15 @@
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 			skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
 
+		/* Transfer zero-copy callback descriptors */
+		BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
+		skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
+		skb_shinfo(skb)->zccd1 = NULL;
+
+		BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
+		skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
+		skb_shinfo(skb)->zccd2 = NULL;
+
 		skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
 		skb_shinfo(skb)->nr_frags = 0;
 
@@ -409,6 +418,30 @@
 			pos += size;
 		}
 		skb_shinfo(skb1)->nr_frags = k;
+
+		if (k != 0) {
+			/* skb1 has pages. Transfer or clone the zccds */
+
+			if (skb_shinfo(skb)->zccd1 != NULL) {
+				BUG_TRAP(skb_shinfo(skb1)->zccd1 == NULL);
+				skb_shinfo(skb1)->zccd1 = skb_shinfo(skb)->zccd1;
+
+				if (skb_shinfo(skb)->nr_frags == 0)
+					skb_shinfo(skb)->zccd1 = NULL;
+				else
+					zccd_incref(skb_shinfo(skb)->zccd1);
+			}
+
+			if (skb_shinfo(skb)->zccd2 != NULL) {
+				BUG_TRAP(skb_shinfo(skb1)->zccd2 == NULL);
+				skb_shinfo(skb1)->zccd2 = skb_shinfo(skb)->zccd2;
+
+				if (skb_shinfo(skb)->nr_frags == 0)
+					skb_shinfo(skb)->zccd2 = NULL;
+				else
+					zccd_incref(skb_shinfo(skb)->zccd2);
+			}
+		}
 	}
 }
 
--- linux-2.4.21-32.0.1.EL/net/netsyms.c	2005-06-02 05:51:50.000000000 +0300
+++ linux-2.4.21-32.0.1.EL-newzc/net/netsyms.c	2006-10-23 23:44:10.000000000 +0300
@@ -424,6 +424,7 @@
 EXPORT_SYMBOL(ip_generic_getfrag);
 #endif
 
+EXPORT_SYMBOL(tcp_sendpage_zccd);
 EXPORT_SYMBOL(tcp_read_sock);
 
 EXPORT_SYMBOL(netlink_set_err);