From febbff70564e9bf66984d1b232080fed2cd929a8 Mon Sep 17 00:00:00 2001 From: adilger Date: Thu, 8 Mar 2007 20:14:45 +0000 Subject: [PATCH] Branch HEAD Remove old, dangerous tcp zero copy patches. --- .../patches/tcp-zero-copy-2.6-fc5.patch | 475 --------------------- .../patches/tcp-zero-copy-2.6.18-vanilla.patch | 450 ------------------- lustre/kernel_patches/series/2.6-fc5.series | 1 - 3 files changed, 926 deletions(-) delete mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.6-fc5.patch delete mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6-fc5.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6-fc5.patch deleted file mode 100644 index 2183518..0000000 --- a/lustre/kernel_patches/patches/tcp-zero-copy-2.6-fc5.patch +++ /dev/null @@ -1,475 +0,0 @@ -Index: linux-2.6.16.i686/net/core/dev.c -=================================================================== ---- linux-2.6.16.i686.orig/net/core/dev.c 2006-05-30 15:47:10.000000000 +0800 -+++ linux-2.6.16.i686/net/core/dev.c 2006-05-30 21:24:07.000000000 +0800 -@@ -1181,6 +1181,9 @@ - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; -+ ninfo->zccd = NULL; /* copied data => no user zero copy descriptor */ -+ ninfo->zccd2 = NULL; -+ - - /* Offset between the two in bytes */ - offset = data - skb->head; -Index: linux-2.6.16.i686/net/core/skbuff.c -=================================================================== ---- linux-2.6.16.i686.orig/net/core/skbuff.c 2006-05-30 15:47:12.000000000 +0800 -+++ linux-2.6.16.i686/net/core/skbuff.c 2006-05-30 21:26:35.000000000 +0800 -@@ -170,7 +170,8 @@ - shinfo->ufo_size = 0; - shinfo->ip6_frag_id = 0; - shinfo->frag_list = NULL; -- -+ shinfo->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ -+ shinfo->zccd2 = NULL; - if (fclone) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); -@@ -242,7 +243,9 @@ - shinfo->ufo_size = 0; - shinfo->ip6_frag_id = 0; - shinfo->frag_list = NULL; -- -+ shinfo->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ -+ shinfo->zccd2 = NULL; -+ - if (fclone) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); -@@ -287,6 +290,10 @@ - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { -+ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -@@ -606,6 +613,14 @@ - n->data_len = skb->data_len; - n->len = skb->len; - -+ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; -+ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ -+ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; -+ - if (skb_shinfo(skb)->nr_frags) { - int i; - -@@ -649,6 +664,9 @@ - u8 *data; - int size = nhead + (skb->end - skb->head) + ntail; - long off; -+ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ -+ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ -+ - - if (skb_shared(skb)) - BUG(); -@@ -670,6 +688,11 @@ - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - -+ if (zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (zccd); /* extra ref (pages are shared) */ -+ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ -+ zccd_get (zccd2); /* extra ref (pages are shared) */ -+ - skb_release_data(skb); - - off = (data + nhead) - skb->head; -@@ -684,6 +707,8 @@ - skb->cloned = 0; - skb->nohdr = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); -+ skb_shinfo(skb)->zccd = zccd; -+ skb_shinfo(skb)->zccd2 = zccd2; - return 0; - - nodata: -Index: linux-2.6.16.i686/net/ipv4/tcp.c -=================================================================== ---- linux-2.6.16.i686.orig/net/ipv4/tcp.c 2006-05-30 15:47:12.000000000 +0800 -+++ linux-2.6.16.i686/net/ipv4/tcp.c 2006-05-30 21:24:07.000000000 +0800 -@@ -498,8 +498,10 @@ - } - } - -+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ - static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, -- size_t psize, int flags) -+ size_t psize, int flags, zccd_t *zccd) -+ - { - struct tcp_sock *tp = tcp_sk(sk); - int mss_now, size_goal; -@@ -547,6 +549,17 @@ - copy = size; - - i = skb_shinfo(skb)->nr_frags; -+ -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ -+ skb_shinfo(skb)->zccd2 != NULL && -+ skb_shinfo(skb)->zccd != zccd && /* not the same one */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ tcp_mark_push (tp, skb); -+ goto new_segment; -+ } -+ - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { - tcp_mark_push(tp, skb); -@@ -562,6 +575,20 @@ - skb_fill_page_desc(skb, i, page, offset, copy); - } - -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ zccd_get (zccd); /* bump ref count */ -+ -+ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); -+ -+ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ -+ skb_shinfo(skb)->zccd = zccd; -+ else -+ skb_shinfo(skb)->zccd2 = zccd; -+ } -+ - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; -@@ -631,12 +658,37 @@ - - lock_sock(sk); - TCP_CHECK_TIMER(sk); -- res = do_tcp_sendpages(sk, &page, offset, size, flags); -+ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return res; -+} -+ -+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd) -+{ -+ ssize_t res; -+ struct sock *sk = sock->sk; -+ -+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) -+ -+ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ -+ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ -+ BUG (); -+ -+#undef TCP_ZC_CSUM_FLAGS -+ -+ lock_sock(sk); -+ TCP_CHECK_TIMER(sk); -+ -+ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); -+ - TCP_CHECK_TIMER(sk); - 
release_sock(sk); - return res; - } - -+ - #define TCP_PAGE(sk) (sk->sk_sndmsg_page) - #define TCP_OFF(sk) (sk->sk_sndmsg_off) - -@@ -1406,6 +1458,202 @@ - goto out; - } - -+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, -+ int len, int nonblock) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ int copied; -+ long timeo; -+ -+ BUG_TRAP (len > 0); -+ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ -+ -+ lock_sock(sk); -+ -+ TCP_CHECK_TIMER(sk); -+ -+ copied = -ENOTCONN; -+ if (sk->sk_state == TCP_LISTEN) -+ goto out; -+ -+ copied = 0; -+ timeo = sock_rcvtimeo(sk, nonblock); -+ -+ do { -+ struct sk_buff * skb; -+ u32 offset; -+ unsigned long used; -+ int exhausted; -+ int eaten; -+ -+ /* Are we at urgent data? Stop if we have read anything. */ -+ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) -+ break; -+ -+ /* We need to check signals first, to get correct SIGURG -+ * handling. FIXME: Need to check this doesnt impact 1003.1g -+ * and move it down to the bottom of the loop -+ */ -+ if (signal_pending(current)) { -+ if (copied) -+ break; -+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; -+ break; -+ } -+ -+ /* Next get a buffer. */ -+ -+ skb = skb_peek(&sk->sk_receive_queue); -+ -+ if (skb == NULL) /* nothing ready */ -+ { -+ if (copied) { -+ if (sk->sk_err || -+ sk->sk_state == TCP_CLOSE || -+ (sk->sk_shutdown & RCV_SHUTDOWN) || -+ !timeo || -+ (0)) -+ break; -+ } else { -+ if (sock_flag(sk, SOCK_DONE)) -+ break; -+ -+ if (sk->sk_err) { -+ copied = sock_error(sk); -+ break; -+ } -+ -+ if (sk->sk_shutdown & RCV_SHUTDOWN) -+ break; -+ -+ if (sk->sk_state == TCP_CLOSE) { -+ if (!(sock_flag(sk, SOCK_DONE))) { -+ /* This occurs when user tries to read -+ * from never connected socket. -+ */ -+ copied = -ENOTCONN; -+ break; -+ } -+ break; -+ } -+ -+ if (!timeo) { -+ copied = -EAGAIN; -+ break; -+ } -+ } -+ -+ cleanup_rbuf(sk, copied); -+ sk_wait_data(sk, &timeo); -+ continue; -+ } -+ -+ BUG_TRAP (atomic_read (&skb->users) == 1); -+ -+ exhausted = eaten = 0; -+ -+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; -+ if (skb->h.th->syn) -+ offset--; -+ -+ used = skb->len - offset; -+ -+ if (tp->urg_data) { -+ u32 urg_offset = tp->urg_seq - tp->copied_seq; -+ if (urg_offset < used) { -+ if (!urg_offset) { /* at urgent date */ -+ if (!(sock_flag(sk, SOCK_URGINLINE))) { -+ tp->copied_seq++; /* discard the single byte of urgent data */ -+ offset++; -+ used--; -+ } -+ } else /* truncate read */ -+ used = urg_offset; -+ } -+ } -+ -+ BUG_TRAP (used >= 0); -+ if (len < used) -+ used = len; -+ -+ if (used == 0) -+ exhausted = 1; -+ else -+ { -+ if (skb_is_nonlinear (skb)) -+ { -+ int rc = skb_linearize (skb, GFP_KERNEL); -+ -+ printk ("tcp_recvpackets(): linearising: %d\n", rc); -+ -+ if (rc) -+ { -+ if (!copied) -+ copied = rc; -+ break; -+ } -+ } -+ -+ if ((offset + used) == skb->len) /* consuming the whole packet */ -+ { -+ __skb_unlink (skb, &sk->sk_receive_queue); -+ dst_release (skb->dst); -+ skb_orphan (skb); -+ __skb_pull (skb, offset); -+ __skb_queue_tail (packets, skb); -+ exhausted = eaten = 1; -+ } -+ else /* consuming only part of the packet */ -+ { -+ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); -+ -+ if (skb2 == NULL) -+ { -+ if (!copied) -+ copied = -ENOMEM; -+ break; -+ } -+ -+ dst_release (skb2->dst); -+ __skb_pull (skb2, offset); -+ __skb_trim (skb2, used); -+ __skb_queue_tail (packets, skb2); -+ } -+ -+ tp->copied_seq += used; -+ copied += used; -+ len -= used; -+ } -+ -+ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { -+ 
tp->urg_data = 0; -+ tcp_fast_path_check(sk, tp); -+ } -+ -+ if (!exhausted) -+ continue; -+ -+ if (skb->h.th->fin) -+ { -+ tp->copied_seq++; -+ if (!eaten) -+ sk_eat_skb (sk, skb); -+ break; -+ } -+ -+ if (!eaten) -+ sk_eat_skb (sk, skb); -+ -+ } while (len > 0); -+ -+ out: -+ /* Clean up data we have read: This will do ACK frames. */ -+ cleanup_rbuf(sk, copied); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return copied; -+} -+ - /* - * State processing on a close. This implements the state shift for - * sending our FIN frame. Note that we only send a FIN for some -@@ -2139,6 +2387,8 @@ - EXPORT_SYMBOL(tcp_recvmsg); - EXPORT_SYMBOL(tcp_sendmsg); - EXPORT_SYMBOL(tcp_sendpage); -+EXPORT_SYMBOL(tcp_sendpage_zccd); -+EXPORT_SYMBOL(tcp_recvpackets); - EXPORT_SYMBOL(tcp_setsockopt); - EXPORT_SYMBOL(tcp_shutdown); - EXPORT_SYMBOL(tcp_statistics); -Index: linux-2.6.16.i686/include/linux/skbuff.h -=================================================================== ---- linux-2.6.16.i686.orig/include/linux/skbuff.h 2006-05-30 15:47:11.000000000 +0800 -+++ linux-2.6.16.i686/include/linux/skbuff.h 2006-05-30 21:24:07.000000000 +0800 -@@ -128,6 +128,30 @@ - __u16 size; - }; - -+/* Support for callback when skb data has been released */ -+typedef struct zccd /* Zero Copy Callback Descriptor */ -+{ /* (embed as first member of custom struct) */ -+ atomic_t zccd_count; /* reference count */ -+ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ -+} zccd_t; -+ -+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) -+{ -+ atomic_set (&d->zccd_count, 1); -+ d->zccd_destructor = callback; -+} -+ -+static inline void zccd_get (zccd_t *d) /* take a reference */ -+{ -+ atomic_inc (&d->zccd_count); -+} -+ -+static inline void zccd_put (zccd_t *d) /* release a reference */ -+{ -+ if (atomic_dec_and_test (&d->zccd_count)) -+ (d->zccd_destructor)(d); -+} -+ - /* This data is invariant across clones and lives at - * the end of the header data, ie. at skb->end. - */ -@@ -139,6 +163,13 @@ - unsigned short ufo_size; - unsigned int ip6_frag_id; - struct sk_buff *frag_list; -+ zccd_t *zccd; /* zero copy descriptor */ -+ zccd_t *zccd2; /* 2nd zero copy descriptor */ -+ /* NB we expect zero-copy data to be at least 1 packet, so -+ * having 2 zccds means we don't unneccessarily split the packet -+ * where consecutive zero-copy sends abutt. 
-+ */ -+ - skb_frag_t frags[MAX_SKB_FRAGS]; - }; - -Index: linux-2.6.16.i686/include/net/tcp.h -=================================================================== ---- linux-2.6.16.i686.orig/include/net/tcp.h 2006-05-30 15:47:11.000000000 +0800 -+++ linux-2.6.16.i686/include/net/tcp.h 2006-05-30 21:24:07.000000000 +0800 -@@ -272,6 +272,9 @@ - extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size); - extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); -+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd); -+ - - extern int tcp_ioctl(struct sock *sk, - int cmd, -@@ -354,6 +357,9 @@ - struct msghdr *msg, - size_t len, int nonblock, - int flags, int *addr_len); -+extern int tcp_recvpackets(struct sock *sk, -+ struct sk_buff_head *packets, -+ int len, int nonblock); - - extern void tcp_parse_options(struct sk_buff *skb, - struct tcp_options_received *opt_rx, diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch deleted file mode 100644 index cb33b04..0000000 --- a/lustre/kernel_patches/patches/tcp-zero-copy-2.6.18-vanilla.patch +++ /dev/null @@ -1,450 +0,0 @@ -Index: linux-2.6/net/core/skbuff.c -=================================================================== ---- linux-2.6.orig/net/core/skbuff.c 2006-07-15 21:08:45.000000000 +0800 -+++ linux-2.6/net/core/skbuff.c 2006-07-15 21:12:21.000000000 +0800 -@@ -183,7 +183,8 @@ struct sk_buff *__alloc_skb(unsigned int - shinfo->gso_type = 0; - shinfo->ip6_frag_id = 0; - shinfo->frag_list = NULL; -- -+ shinfo->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ -+ shinfo->zccd2 = NULL; - if (fclone) { - struct sk_buff *child = skb + 1; - atomic_t *fclone_ref = (atomic_t *) (child + 1); -@@ -283,6 +284,10 @@ static void skb_release_data(struct sk_b - if (!skb->cloned || - !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, - &skb_shinfo(skb)->dataref)) { -+ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ -+ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ - if (skb_shinfo(skb)->nr_frags) { - int i; - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -@@ -618,6 +623,14 @@ struct sk_buff *pskb_copy(struct sk_buff - n->data_len = skb->data_len; - n->len = skb->len; - -+ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; -+ -+ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? 
*/ -+ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ -+ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; -+ - if (skb_shinfo(skb)->nr_frags) { - int i; - -@@ -661,6 +674,9 @@ int pskb_expand_head(struct sk_buff *skb - u8 *data; - int size = nhead + (skb->end - skb->head) + ntail; - long off; -+ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ -+ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ -+ - - if (skb_shared(skb)) - BUG(); -@@ -682,6 +698,11 @@ int pskb_expand_head(struct sk_buff *skb - if (skb_shinfo(skb)->frag_list) - skb_clone_fraglist(skb); - -+ if (zccd != NULL) /* user zero copy descriptor? */ -+ zccd_get (zccd); /* extra ref (pages are shared) */ -+ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ -+ zccd_get (zccd2); /* extra ref (pages are shared) */ -+ - skb_release_data(skb); - - off = (data + nhead) - skb->head; -@@ -696,6 +717,8 @@ int pskb_expand_head(struct sk_buff *skb - skb->cloned = 0; - skb->nohdr = 0; - atomic_set(&skb_shinfo(skb)->dataref, 1); -+ skb_shinfo(skb)->zccd = zccd; -+ skb_shinfo(skb)->zccd2 = zccd2; - return 0; - - nodata: -Index: linux-2.6/net/ipv4/tcp.c -=================================================================== ---- linux-2.6.orig/net/ipv4/tcp.c 2006-07-15 21:08:45.000000000 +0800 -+++ linux-2.6/net/ipv4/tcp.c 2006-07-15 22:32:12.000000000 +0800 -@@ -499,8 +499,10 @@ static inline void tcp_push(struct sock - } - } - -+/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ - static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, -- size_t psize, int flags) -+ size_t psize, int flags, zccd_t *zccd) -+ - { - struct tcp_sock *tp = tcp_sk(sk); - int mss_now, size_goal; -@@ -548,6 +550,17 @@ new_segment: - copy = size; - - i = skb_shinfo(skb)->nr_frags; -+ -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ -+ skb_shinfo(skb)->zccd2 != NULL && -+ skb_shinfo(skb)->zccd != zccd && /* not the same one */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ tcp_mark_push (tp, skb); -+ goto new_segment; -+ } -+ - can_coalesce = skb_can_coalesce(skb, i, page, offset); - if (!can_coalesce && i >= MAX_SKB_FRAGS) { - tcp_mark_push(tp, skb); -@@ -563,6 +576,20 @@ new_segment: - skb_fill_page_desc(skb, i, page, offset, copy); - } - -+ if (zccd != NULL && /* this is a zcc I/O */ -+ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ -+ skb_shinfo(skb)->zccd2 != zccd) -+ { -+ zccd_get (zccd); /* bump ref count */ -+ -+ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); -+ -+ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ -+ skb_shinfo(skb)->zccd = zccd; -+ else -+ skb_shinfo(skb)->zccd2 = zccd; -+ } -+ - skb->len += copy; - skb->data_len += copy; - skb->truesize += copy; -@@ -628,12 +655,37 @@ ssize_t tcp_sendpage(struct socket *sock - - lock_sock(sk); - TCP_CHECK_TIMER(sk); -- res = do_tcp_sendpages(sk, &page, offset, size, flags); -+ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return res; -+} -+ -+ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd) -+{ -+ ssize_t res; -+ struct sock *sk = sock->sk; -+ -+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) -+ -+ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ -+ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on 
double mapping */ -+ BUG (); -+ -+#undef TCP_ZC_CSUM_FLAGS -+ -+ lock_sock(sk); -+ TCP_CHECK_TIMER(sk); -+ -+ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); -+ - TCP_CHECK_TIMER(sk); - release_sock(sk); - return res; - } - -+ - #define TCP_PAGE(sk) (sk->sk_sndmsg_page) - #define TCP_OFF(sk) (sk->sk_sndmsg_off) - -@@ -1477,6 +1529,202 @@ recv_urg: - goto out; - } - -+int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, -+ int len, int nonblock) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ int copied; -+ long timeo; -+ -+ BUG_TRAP (len > 0); -+ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ -+ -+ lock_sock(sk); -+ -+ TCP_CHECK_TIMER(sk); -+ -+ copied = -ENOTCONN; -+ if (sk->sk_state == TCP_LISTEN) -+ goto out; -+ -+ copied = 0; -+ timeo = sock_rcvtimeo(sk, nonblock); -+ -+ do { -+ struct sk_buff * skb; -+ u32 offset; -+ unsigned long used; -+ int exhausted; -+ int eaten; -+ -+ /* Are we at urgent data? Stop if we have read anything. */ -+ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) -+ break; -+ -+ /* We need to check signals first, to get correct SIGURG -+ * handling. FIXME: Need to check this doesnt impact 1003.1g -+ * and move it down to the bottom of the loop -+ */ -+ if (signal_pending(current)) { -+ if (copied) -+ break; -+ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; -+ break; -+ } -+ -+ /* Next get a buffer. */ -+ -+ skb = skb_peek(&sk->sk_receive_queue); -+ -+ if (skb == NULL) /* nothing ready */ -+ { -+ if (copied) { -+ if (sk->sk_err || -+ sk->sk_state == TCP_CLOSE || -+ (sk->sk_shutdown & RCV_SHUTDOWN) || -+ !timeo || -+ (0)) -+ break; -+ } else { -+ if (sock_flag(sk, SOCK_DONE)) -+ break; -+ -+ if (sk->sk_err) { -+ copied = sock_error(sk); -+ break; -+ } -+ -+ if (sk->sk_shutdown & RCV_SHUTDOWN) -+ break; -+ -+ if (sk->sk_state == TCP_CLOSE) { -+ if (!(sock_flag(sk, SOCK_DONE))) { -+ /* This occurs when user tries to read -+ * from never connected socket. 
-+ */ -+ copied = -ENOTCONN; -+ break; -+ } -+ break; -+ } -+ -+ if (!timeo) { -+ copied = -EAGAIN; -+ break; -+ } -+ } -+ -+ tcp_cleanup_rbuf(sk, copied); -+ sk_wait_data(sk, &timeo); -+ continue; -+ } -+ -+ BUG_TRAP (atomic_read (&skb->users) == 1); -+ -+ exhausted = eaten = 0; -+ -+ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; -+ if (skb->h.th->syn) -+ offset--; -+ -+ used = skb->len - offset; -+ -+ if (tp->urg_data) { -+ u32 urg_offset = tp->urg_seq - tp->copied_seq; -+ if (urg_offset < used) { -+ if (!urg_offset) { /* at urgent date */ -+ if (!(sock_flag(sk, SOCK_URGINLINE))) { -+ tp->copied_seq++; /* discard the single byte of urgent data */ -+ offset++; -+ used--; -+ } -+ } else /* truncate read */ -+ used = urg_offset; -+ } -+ } -+ -+ BUG_TRAP (used >= 0); -+ if (len < used) -+ used = len; -+ -+ if (used == 0) -+ exhausted = 1; -+ else -+ { -+ if (skb_is_nonlinear (skb)) -+ { -+ int rc = skb_linearize (skb); -+ -+ printk ("tcp_recvpackets(): linearising: %d\n", rc); -+ -+ if (rc) -+ { -+ if (!copied) -+ copied = rc; -+ break; -+ } -+ } -+ -+ if ((offset + used) == skb->len) /* consuming the whole packet */ -+ { -+ __skb_unlink (skb, &sk->sk_receive_queue); -+ dst_release (skb->dst); -+ skb_orphan (skb); -+ __skb_pull (skb, offset); -+ __skb_queue_tail (packets, skb); -+ exhausted = eaten = 1; -+ } -+ else /* consuming only part of the packet */ -+ { -+ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); -+ -+ if (skb2 == NULL) -+ { -+ if (!copied) -+ copied = -ENOMEM; -+ break; -+ } -+ -+ dst_release (skb2->dst); -+ __skb_pull (skb2, offset); -+ __skb_trim (skb2, used); -+ __skb_queue_tail (packets, skb2); -+ } -+ -+ tp->copied_seq += used; -+ copied += used; -+ len -= used; -+ } -+ -+ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { -+ tp->urg_data = 0; -+ tcp_fast_path_check(sk, tp); -+ } -+ -+ if (!exhausted) -+ continue; -+ -+ if (skb->h.th->fin) -+ { -+ tp->copied_seq++; -+ if (!eaten) -+ sk_eat_skb (sk, skb, 0); -+ break; -+ } -+ -+ if (!eaten) -+ sk_eat_skb (sk, skb, 0); -+ -+ } while (len > 0); -+ -+ out: -+ /* Clean up data we have read: This will do ACK frames. */ -+ tcp_cleanup_rbuf(sk, copied); -+ TCP_CHECK_TIMER(sk); -+ release_sock(sk); -+ return copied; -+} -+ - /* - * State processing on a close. This implements the state shift for - * sending our FIN frame. 
Note that we only send a FIN for some -@@ -2345,6 +2593,8 @@ EXPORT_SYMBOL(tcp_read_sock); - EXPORT_SYMBOL(tcp_recvmsg); - EXPORT_SYMBOL(tcp_sendmsg); - EXPORT_SYMBOL(tcp_sendpage); -+EXPORT_SYMBOL(tcp_sendpage_zccd); -+EXPORT_SYMBOL(tcp_recvpackets); - EXPORT_SYMBOL(tcp_setsockopt); - EXPORT_SYMBOL(tcp_shutdown); - EXPORT_SYMBOL(tcp_statistics); -Index: linux-2.6/include/linux/skbuff.h -=================================================================== ---- linux-2.6.orig/include/linux/skbuff.h 2006-07-15 21:08:45.000000000 +0800 -+++ linux-2.6/include/linux/skbuff.h 2006-07-15 21:12:21.000000000 +0800 -@@ -128,6 +128,30 @@ struct skb_frag_struct { - __u16 size; - }; - -+/* Support for callback when skb data has been released */ -+typedef struct zccd /* Zero Copy Callback Descriptor */ -+{ /* (embed as first member of custom struct) */ -+ atomic_t zccd_count; /* reference count */ -+ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ -+} zccd_t; -+ -+static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) -+{ -+ atomic_set (&d->zccd_count, 1); -+ d->zccd_destructor = callback; -+} -+ -+static inline void zccd_get (zccd_t *d) /* take a reference */ -+{ -+ atomic_inc (&d->zccd_count); -+} -+ -+static inline void zccd_put (zccd_t *d) /* release a reference */ -+{ -+ if (atomic_dec_and_test (&d->zccd_count)) -+ (d->zccd_destructor)(d); -+} -+ - /* This data is invariant across clones and lives at - * the end of the header data, ie. at skb->end. - */ -@@ -140,6 +164,13 @@ struct skb_shared_info { - unsigned short gso_type; - unsigned int ip6_frag_id; - struct sk_buff *frag_list; -+ zccd_t *zccd; /* zero copy descriptor */ -+ zccd_t *zccd2; /* 2nd zero copy descriptor */ -+ /* NB we expect zero-copy data to be at least 1 packet, so -+ * having 2 zccds means we don't unneccessarily split the packet -+ * where consecutive zero-copy sends abutt. -+ */ -+ - skb_frag_t frags[MAX_SKB_FRAGS]; - }; - -Index: linux-2.6/include/net/tcp.h -=================================================================== ---- linux-2.6.orig/include/net/tcp.h 2006-07-15 21:08:45.000000000 +0800 -+++ linux-2.6/include/net/tcp.h 2006-07-15 21:12:21.000000000 +0800 -@@ -278,6 +278,9 @@ extern int tcp_v4_tw_remember_stam - extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, - struct msghdr *msg, size_t size); - extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); -+extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, -+ int flags, zccd_t *zccd); -+ - - extern int tcp_ioctl(struct sock *sk, - int cmd, -@@ -368,6 +371,9 @@ extern int tcp_recvmsg(struct kiocb *i - struct msghdr *msg, - size_t len, int nonblock, - int flags, int *addr_len); -+extern int tcp_recvpackets(struct sock *sk, -+ struct sk_buff_head *packets, -+ int len, int nonblock); - - extern void tcp_parse_options(struct sk_buff *skb, - struct tcp_options_received *opt_rx, diff --git a/lustre/kernel_patches/series/2.6-fc5.series b/lustre/kernel_patches/series/2.6-fc5.series index c9abdd0..1835748 100644 --- a/lustre/kernel_patches/series/2.6-fc5.series +++ b/lustre/kernel_patches/series/2.6-fc5.series @@ -16,5 +16,4 @@ remove-suid-2.6-suse.patch export-show_task-2.6-fc5.patch sd_iostats-2.6-rhel4.patch export_symbol_numa-2.6-fc5.patch -tcp-zero-copy-2.6-fc5.patch vfs_intent-2.6-fc5-fix.patch -- 1.8.3.1
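
For reference, the patches deleted above attached a "zero copy callback descriptor" (zccd) to struct skb_shared_info so that a sender handing pinned pages to the stack through tcp_sendpage_zccd() was notified when TCP released its last reference to those pages. The sketch below is reconstructed from the deleted patch text only: zccd_t, zccd_init()/zccd_get()/zccd_put() and tcp_sendpage_zccd() are the interfaces the patch defined, while the my_tx structure, its field names, my_zc_send() and the flags value are illustrative assumptions, not code from the patch or from Lustre.

    #include <linux/slab.h>
    #include <linux/skbuff.h>     /* zccd_t, zccd_init/get/put (added by the deleted patch) */
    #include <net/tcp.h>          /* tcp_sendpage_zccd() (added by the deleted patch) */

    /* Illustrative completion record; the zccd must be the first member so the
     * callback can recover the containing structure, as the patch comment notes
     * ("embed as first member of custom struct"). */
    struct my_tx {
            zccd_t        tx_zccd;
            struct page  *tx_page;
    };

    /* Runs when the stack drops its last reference to the pages queued under
     * this descriptor, i.e. once the data has really left the socket and the
     * pages may be unpinned or reused.  May run from skb-free context. */
    static void my_tx_done(zccd_t *zccd)
    {
            struct my_tx *tx = (struct my_tx *)zccd;  /* valid: zccd is the first member */

            /* unpin/put tx->tx_page, wake the waiter, update stats, ... */
            kfree(tx);
    }

    /* Queue one pinned page for zero-copy transmission on a connected TCP socket.
     * The patch BUG()s if the route lacks NETIF_F_SG plus hardware checksumming,
     * so a real caller was expected to check sk->sk_route_caps before using this. */
    static int my_zc_send(struct socket *sock, struct page *page, int off, size_t len)
    {
            struct my_tx *tx;
            ssize_t rc;

            tx = kmalloc(sizeof(*tx), GFP_KERNEL);
            if (tx == NULL)
                    return -ENOMEM;

            tx->tx_page = page;
            zccd_init(&tx->tx_zccd, my_tx_done);      /* refcount starts at 1 (submitter) */

            /* Each skb that ends up referencing the page takes its own zccd_get()
             * inside do_tcp_sendpages(); the flags value here is illustrative only. */
            rc = tcp_sendpage_zccd(sock, page, off, len, 0, &tx->tx_zccd);

            zccd_put(&tx->tx_zccd);                   /* drop the submitter's reference;
                                                       * my_tx_done() fires when the last
                                                       * in-flight skb is freed */
            return rc < 0 ? (int)rc : 0;
    }

Because do_tcp_sendpages() took a zccd_get() for every skb that referenced the descriptor, the submitter's initial reference from zccd_init() had to be dropped after the send was queued; the destructor then ran only when the last skb carrying the pages was freed. Carrying two zccd pointers per skb was what let consecutive zero-copy sends share a segment without forcing an unnecessary packet boundary, per the comment in the deleted skbuff.h hunk.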