From 3cadbaca4e3fbe4c8f25d9fde8800d51f573f2de Mon Sep 17 00:00:00 2001
From: adilger
Date: Wed, 18 Aug 2004 09:30:43 +0000
Subject: [PATCH] Update b1_4_smallfix from b1_4 (20040817_1834)

- direct IO reads on OST (4048)
- tune the read pipeline (3236)
- provide read-ahead stats and refine rpc in flight stats (3328)
---
 .../patches/tcp-zero-copy-2.4.21-chaos.patch | 477 +++++++++++++++++++++
 1 file changed, 477 insertions(+)
 create mode 100644 lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-chaos.patch

diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-chaos.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-chaos.patch
new file mode 100644
index 0000000..41ef631
--- /dev/null
+++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.4.21-chaos.patch
@@ -0,0 +1,477 @@
+ include/linux/skbuff.h |   30 +++++
+ include/net/tcp.h      |    5 
+ net/core/skbuff.c      |   25 ++++
+ net/ipv4/tcp.c         |  252 ++++++++++++++++++++++++++++++++++++++++++++++-
+ net/netsyms.c          |    2 
+ 5 files changed, 311 insertions(+), 3 deletions(-)
+
+Index: linux-2.4.21-4.EL/include/linux/skbuff.h
+===================================================================
+--- linux-2.4.21-4.EL.orig/include/linux/skbuff.h	2004-06-24 15:52:05.000000000 +0800
++++ linux-2.4.21-4.EL/include/linux/skbuff.h	2004-06-24 16:31:28.000000000 +0800
+@@ -116,6 +116,30 @@
+         __u16 size;
+ };
+ 
++/* Support for callback when skb data has been released */
++typedef struct zccd                     /* Zero Copy Callback Descriptor */
++{                                       /* (embed as first member of custom struct) */
++        atomic_t        zccd_count;     /* reference count */
++        void            (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */
++} zccd_t;
++
++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *))
++{
++        atomic_set (&d->zccd_count, 1);
++        d->zccd_destructor = callback;
++}
++
++static inline void zccd_get (zccd_t *d)         /* take a reference */
++{
++        atomic_inc (&d->zccd_count);
++}
++
++static inline void zccd_put (zccd_t *d)         /* release a reference */
++{
++        if (atomic_dec_and_test (&d->zccd_count))
++                (d->zccd_destructor)(d);
++}
++
+ /* This data is invariant across clones and lives at
+  * the end of the header data, ie. at skb->end.
+  */
+@@ -123,6 +147,12 @@
+         atomic_t        dataref;
+         unsigned int    nr_frags;
+         struct sk_buff  *frag_list;
++        zccd_t          *zccd;          /* zero copy descriptor */
++        zccd_t          *zccd2;         /* 2nd zero copy descriptor */
++        /* NB we expect zero-copy data to be at least 1 packet, so
++         * having 2 zccds means we don't unnecessarily split the packet
++         * where consecutive zero-copy sends abut.
++         */
+         skb_frag_t      frags[MAX_SKB_FRAGS];
+ };
+ 
+Index: linux-2.4.21-4.EL/include/net/tcp.h
+===================================================================
+--- linux-2.4.21-4.EL.orig/include/net/tcp.h	2004-06-24 15:52:05.000000000 +0800
++++ linux-2.4.21-4.EL/include/net/tcp.h	2004-06-24 16:32:30.000000000 +0800
+@@ -636,6 +636,8 @@
+ 
+ extern int                      tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size);
+ extern ssize_t                  tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags);
++extern ssize_t                  tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                                                  int flags, zccd_t *zccd);
+ 
+ extern int                      tcp_ioctl(struct sock *sk,
+                                           int cmd,
+@@ -730,6 +732,10 @@
+                                             struct msghdr *msg,
+                                             int len, int nonblock,
+                                             int flags, int *addr_len);
++extern int                      tcp_recvpackets(struct sock *sk,
++                                                struct sk_buff_head *packets,
++                                                int len, int nonblock);
++
+ extern int                      tcp_kvec_read(struct sock *sk, kvec_cb_t cb, int len);
+ extern int                      tcp_kvec_write(struct sock *sk, kvec_cb_t cb, int len);
+ 
+Index: linux-2.4.21-4.EL/net/core/skbuff.c
+===================================================================
+--- linux-2.4.21-4.EL.orig/net/core/skbuff.c	2003-10-04 05:28:51.000000000 +0800
++++ linux-2.4.21-4.EL/net/core/skbuff.c	2004-06-24 16:31:28.000000000 +0800
+@@ -210,6 +210,8 @@
+         atomic_set(&(skb_shinfo(skb)->dataref), 1);
+         skb_shinfo(skb)->nr_frags = 0;
+         skb_shinfo(skb)->frag_list = NULL;
++        skb_shinfo(skb)->zccd = NULL;           /* skbuffs kick off with NO user zero copy descriptors */
++        skb_shinfo(skb)->zccd2 = NULL;
+         return skb;
+ 
+ nodata:
+@@ -280,6 +282,10 @@
+ {
+         if (!skb->cloned ||
+             atomic_dec_and_test(&(skb_shinfo(skb)->dataref))) {
++                if (skb_shinfo(skb)->zccd != NULL)      /* zero copy callback descriptor? */
++                        zccd_put (skb_shinfo(skb)->zccd); /* release hold */
++                if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd zero copy callback descriptor? */
++                        zccd_put (skb_shinfo(skb)->zccd2); /* release hold */
+                 if (skb_shinfo(skb)->nr_frags) {
+                         int i;
+                         for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
+@@ -550,6 +556,8 @@
+         atomic_set(&(skb_shinfo(skb)->dataref), 1);
+         skb_shinfo(skb)->nr_frags = 0;
+         skb_shinfo(skb)->frag_list = NULL;
++        skb_shinfo(skb)->zccd = NULL;           /* copied data => no user zero copy descriptor */
++        skb_shinfo(skb)->zccd2 = NULL;
+ 
+         /* We are no longer a clone, even if we were. */
+         skb->cloned = 0;
+@@ -596,6 +604,14 @@
+         n->data_len = skb->data_len;
+         n->len = skb->len;
+ 
++        if (skb_shinfo(skb)->zccd != NULL)      /* user zero copy descriptor? */
++                zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */
++        skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd;
++
++        if (skb_shinfo(skb)->zccd2 != NULL)     /* 2nd user zero copy descriptor? */
++                zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */
++        skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2;
++
+         if (skb_shinfo(skb)->nr_frags) {
+                 int i;
+ 
+@@ -638,6 +654,8 @@
+         u8 *data;
+         int size = nhead + (skb->end - skb->head) + ntail;
+         long off;
++        zccd_t *zccd = skb_shinfo(skb)->zccd;   /* stash user zero copy descriptor */
++        zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */
+ 
+         if (skb_shared(skb))
+                 BUG();
+@@ -659,6 +677,11 @@
+         if (skb_shinfo(skb)->frag_list)
+                 skb_clone_fraglist(skb);
+ 
++        if (zccd != NULL)                       /* user zero copy descriptor? */
++                zccd_get (zccd);                /* extra ref (pages are shared) */
++        if (zccd2 != NULL)                      /* 2nd user zero copy descriptor? */
++                zccd_get (zccd2);               /* extra ref (pages are shared) */
++
+         skb_release_data(skb);
+ 
+         off = (data+nhead) - skb->head;
+@@ -673,6 +696,8 @@
+         skb->nh.raw += off;
+         skb->cloned = 0;
+         atomic_set(&skb_shinfo(skb)->dataref, 1);
++        skb_shinfo(skb)->zccd = zccd;
++        skb_shinfo(skb)->zccd2 = zccd2;
+         return 0;
+ 
+ nodata:
+Index: linux-2.4.21-4.EL/net/ipv4/tcp.c
+===================================================================
+--- linux-2.4.21-4.EL.orig/net/ipv4/tcp.c	2003-10-04 05:28:43.000000000 +0800
++++ linux-2.4.21-4.EL/net/ipv4/tcp.c	2004-06-24 16:31:28.000000000 +0800
+@@ -1015,7 +1015,7 @@
+         goto out;
+ }
+ 
+-ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags);
++ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags, zccd_t *zccd);
+ 
+ static inline int
+ can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
+@@ -1094,7 +1094,7 @@
+         return err;
+ }
+ 
+-ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags)
++ssize_t do_tcp_sendpages(struct sock *sk, struct kveclet *let, int poffset, size_t psize, int flags, zccd_t *zccd)
+ {
+         struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
+         int mss_now;
+@@ -1147,6 +1147,17 @@
+                         copy = size;
+ 
+                         i = skb_shinfo(skb)->nr_frags;
++
++                        if (zccd != NULL &&                     /* this is a zcc I/O */
++                            skb_shinfo(skb)->zccd != NULL &&    /* skb is part of a zcc I/O */
++                            skb_shinfo(skb)->zccd2 != NULL &&
++                            skb_shinfo(skb)->zccd != zccd &&    /* not the same one */
++                            skb_shinfo(skb)->zccd2 != zccd)
++                        {
++                                tcp_mark_push (tp, skb);
++                                goto new_segment;
++                        }
++
+                         if (can_coalesce(skb, i, page, offset)) {
+                                 skb_shinfo(skb)->frags[i-1].size += copy;
+                         } else if (i < MAX_SKB_FRAGS) {
+@@ -1157,6 +1168,20 @@
+                                 goto new_segment;
+                         }
+ 
++                        if (zccd != NULL &&                     /* this is a zcc I/O */
++                            skb_shinfo(skb)->zccd != zccd &&    /* not already referencing this zccd */
++                            skb_shinfo(skb)->zccd2 != zccd)
++                        {
++                                zccd_get (zccd);                /* bump ref count */
++
++                                BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL);
++
++                                if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */
++                                        skb_shinfo(skb)->zccd = zccd;
++                                else
++                                        skb_shinfo(skb)->zccd2 = zccd;
++                        }
++
+                         skb->len += copy;
+                         skb->data_len += copy;
+                         skb->ip_summed = CHECKSUM_HW;
+@@ -1224,12 +1249,32 @@
+ 
+         lock_sock(sk);
+         TCP_CHECK_TIMER(sk);
+-        res = do_tcp_sendpages(sk, &let, 0, size, flags);
++        res = do_tcp_sendpages(sk, &let, 0, size, flags, NULL);
+         TCP_CHECK_TIMER(sk);
+         release_sock(sk);
+         return res;
+ }
+ 
++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size,
++                          int flags, zccd_t *zccd)
++{
++        struct kveclet let = { page, offset, size };
++        ssize_t res;
++        struct sock *sk = sock->sk;
++
++        if (!(sk->route_caps & NETIF_F_SG) ||   /* caller shouldn't waste her time */
++            !(sk->route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */
++                BUG ();
++
++        lock_sock(sk);
++        TCP_CHECK_TIMER(sk);
++
++        res = do_tcp_sendpages(sk, &let, 0, size, flags, zccd);
++        TCP_CHECK_TIMER(sk);
++        release_sock(sk);
++        return res;
++}
++
+ static void tcp_kvec_write_worker(struct tcp_write_async_info *info)
+ {
+         struct sock *sk = info->sk;
+@@ -1238,7 +1283,7 @@
+             !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
+                 BUG();
+ 
+-        res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT);
++        res = do_tcp_sendpages(sk, info->cur_let, info->offset, info->len - info->done, MSG_DONTWAIT, NULL);
+         if (res > 0)
+                 info->done += res;
+ 
+@@ -2102,6 +2147,202 @@
+         goto out;
+ }
+ 
++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets,
++                     int len, int nonblock)
++{
++        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
++        int copied;
++        long timeo;
++
++        BUG_TRAP (len > 0);
++        /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/
++
++        lock_sock(sk);
++
++        TCP_CHECK_TIMER(sk);
++
++        copied = -ENOTCONN;
++        if (sk->state == TCP_LISTEN)
++                goto out;
++
++        copied = 0;
++        timeo = sock_rcvtimeo(sk, nonblock);
++
++        do {
++                struct sk_buff * skb;
++                u32 offset;
++                unsigned long used;
++                int exhausted;
++                int eaten;
++
++                /* Are we at urgent data? Stop if we have read anything. */
++                if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq)
++                        break;
++
++                /* We need to check signals first, to get correct SIGURG
++                 * handling. FIXME: Need to check this doesn't impact 1003.1g
++                 * and move it down to the bottom of the loop
++                 */
++                if (signal_pending(current)) {
++                        if (copied)
++                                break;
++                        copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
++                        break;
++                }
++
++                /* Next get a buffer. */
++
++                skb = skb_peek(&sk->receive_queue);
++
++                if (skb == NULL)                /* nothing ready */
++                {
++                        if (copied) {
++                                if (sk->err ||
++                                    sk->state == TCP_CLOSE ||
++                                    (sk->shutdown & RCV_SHUTDOWN) ||
++                                    !timeo ||
++                                    (0))
++                                        break;
++                        } else {
++                                if (sk->done)
++                                        break;
++
++                                if (sk->err) {
++                                        copied = sock_error(sk);
++                                        break;
++                                }
++
++                                if (sk->shutdown & RCV_SHUTDOWN)
++                                        break;
++
++                                if (sk->state == TCP_CLOSE) {
++                                        if (!sk->done) {
++                                                /* This occurs when user tries to read
++                                                 * from never connected socket.
++                                                 */
++                                                copied = -ENOTCONN;
++                                                break;
++                                        }
++                                        break;
++                                }
++
++                                if (!timeo) {
++                                        copied = -EAGAIN;
++                                        break;
++                                }
++                        }
++
++                        cleanup_rbuf(sk, copied);
++                        timeo = tcp_data_wait(sk, timeo);
++                        continue;
++                }
++
++                BUG_TRAP (atomic_read (&skb->users) == 1);
++
++                exhausted = eaten = 0;
++
++                offset = tp->copied_seq - TCP_SKB_CB(skb)->seq;
++                if (skb->h.th->syn)
++                        offset--;
++
++                used = skb->len - offset;
++
++                if (tp->urg_data) {
++                        u32 urg_offset = tp->urg_seq - tp->copied_seq;
++                        if (urg_offset < used) {
++                                if (!urg_offset) {              /* at urgent data */
++                                        if (!sk->urginline) {
++                                                tp->copied_seq++; /* discard the single byte of urgent data */
++                                                offset++;
++                                                used--;
++                                        }
++                                } else                          /* truncate read */
++                                        used = urg_offset;
++                        }
++                }
++
++                BUG_TRAP (used >= 0);
++                if (len < used)
++                        used = len;
++
++                if (used == 0)
++                        exhausted = 1;
++                else
++                {
++                        if (skb_is_nonlinear (skb))
++                        {
++                                int rc = skb_linearize (skb, GFP_KERNEL);
++
++                                printk ("tcp_recvpackets(): linearising: %d\n", rc);
++
++                                if (rc)
++                                {
++                                        if (!copied)
++                                                copied = rc;
++                                        break;
++                                }
++                        }
++
++                        if ((offset + used) == skb->len)        /* consuming the whole packet */
++                        {
++                                __skb_unlink (skb, &sk->receive_queue);
++                                dst_release (skb->dst);
++                                skb_orphan (skb);
++                                __skb_pull (skb, offset);
++                                __skb_queue_tail (packets, skb);
++                                exhausted = eaten = 1;
++                        }
++                        else                                    /* consuming only part of the packet */
++                        {
++                                struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL);
++
++                                if (skb2 == NULL)
++                                {
++                                        if (!copied)
++                                                copied = -ENOMEM;
++                                        break;
++                                }
++
++                                dst_release (skb2->dst);
++                                __skb_pull (skb2, offset);
++                                __skb_trim (skb2, used);
++                                __skb_queue_tail (packets, skb2);
++                        }
++
++                        tp->copied_seq += used;
++                        copied += used;
++                        len -= used;
++                }
++
++                if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
++                        tp->urg_data = 0;
++                        tcp_fast_path_check(sk, tp);
++                }
++
++                if (!exhausted)
++                        continue;
++
++                if (skb->h.th->fin)
++                {
++                        tp->copied_seq++;
++                        if (!eaten)
++                                tcp_eat_skb (sk, skb);
++                        break;
++                }
++
++                if (!eaten)
++                        tcp_eat_skb (sk, skb);
++
++        } while (len > 0);
++
++ out:
++        /* Clean up data we have read: This will do ACK frames. */
++        cleanup_rbuf(sk, copied);
++        TCP_CHECK_TIMER(sk);
++        release_sock(sk);
++        return copied;
++}
++
+ /*
+  *      State processing on a close. This implements the state shift for
+  *      sending our FIN frame. Note that we only send a FIN for some
+Index: linux-2.4.21-4.EL/net/netsyms.c
+===================================================================
+--- linux-2.4.21-4.EL.orig/net/netsyms.c	2003-10-04 05:28:43.000000000 +0800
++++ linux-2.4.21-4.EL/net/netsyms.c	2004-06-24 16:31:28.000000000 +0800
+@@ -424,6 +424,8 @@
+ EXPORT_SYMBOL(ip_generic_getfrag);
+ 
+ #endif
++EXPORT_SYMBOL(tcp_sendpage_zccd);
++EXPORT_SYMBOL(tcp_recvpackets);
+ EXPORT_SYMBOL(tcp_read_sock);
+ 
+ EXPORT_SYMBOL(netlink_set_err);
-- 
1.8.3.1
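
Editor's note: a minimal sketch of how a caller might drive the send side of the
zero-copy API above. Only zccd_t, zccd_init(), zccd_put() and tcp_sendpage_zccd()
come from the patch; the names my_zc_request, my_zc_done, my_send_page and
zc_complete are hypothetical. The zccd_t is embedded as the first member of a
caller-private struct, as the skbuff.h comment requires, so the destructor can
cast back to the enclosing request. do_tcp_sendpages() takes an extra reference
for every skb that ends up pointing at the page, so after dropping its own
initial reference the caller's callback runs only once no skb references the
page any longer (the data has been ACKed or the socket torn down) and the page
is safe to reuse. Note also that tcp_sendpage_zccd() BUG()s unless the route
supports scatter-gather and hardware checksumming.

#include <linux/skbuff.h>
#include <net/tcp.h>

struct my_zc_request {
        zccd_t            zc_zccd;      /* MUST be first: callback casts back */
        struct page      *zc_page;      /* page being sent zero-copy */
        void            (*zc_complete)(struct my_zc_request *);
};

/* Runs when the refcount hits zero, i.e. when no skb references the
 * page any more and it may be overwritten or freed. */
static void my_zc_done (zccd_t *zccd)
{
        struct my_zc_request *req = (struct my_zc_request *)zccd;

        req->zc_complete (req);
}

static int my_send_page (struct socket *sock, struct my_zc_request *req,
                         int offset, size_t size)
{
        ssize_t rc;

        zccd_init (&req->zc_zccd, my_zc_done);  /* refcount starts at 1 */

        /* every skb built from the page bumps the refcount once */
        rc = tcp_sendpage_zccd (sock, req->zc_page, offset, size, 0,
                                &req->zc_zccd);

        zccd_put (&req->zc_zccd);               /* drop the initial ref */
        return (rc < 0) ? (int)rc : 0;
}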
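The receive side can be sketched the same way (again hypothetical; only
tcp_recvpackets() itself comes from the patch, and my_recv_zero_copy and
my_consume_payload are made-up names). Instead of copying payload into an
iovec, the call moves linearised, payload-only skbs from the socket's receive
queue onto a caller-owned queue; the caller walks that queue and releases each
skb with kfree_skb() when done.

#include <linux/skbuff.h>
#include <net/tcp.h>

extern void my_consume_payload (unsigned char *data, unsigned int len); /* hypothetical sink */

static int my_recv_zero_copy (struct socket *sock, int len)
{
        struct sk_buff_head packets;
        struct sk_buff *skb;
        int rc;

        skb_queue_head_init (&packets);

        /* nonblock == 0: wait (subject to the socket's rcvtimeo) for data */
        rc = tcp_recvpackets (sock->sk, &packets, len, 0);

        while ((skb = skb_dequeue (&packets)) != NULL) {
                /* each queued skb holds unread payload only: the consumed
                 * head was pulled and any unread tail trimmed already */
                my_consume_payload (skb->data, skb->len);
                kfree_skb (skb);
        }

        return rc;      /* bytes queued, or negative errno */
}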