X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fkernel_patches%2Fpatches%2Ftcp-zero-copy-2.6-sles10.patch;fp=lustre%2Fkernel_patches%2Fpatches%2Ftcp-zero-copy-2.6-sles10.patch;h=6e017e3b9891ff10d9e27ffa822ce91796b6a9dc;hb=113303973ec9f8484eb2355a1a6ef3c4c7fd6a56;hp=0000000000000000000000000000000000000000;hpb=272294dfa235dac803ed2d2b2ee8e0bd402622bb;p=fs%2Flustre-release.git diff --git a/lustre/kernel_patches/patches/tcp-zero-copy-2.6-sles10.patch b/lustre/kernel_patches/patches/tcp-zero-copy-2.6-sles10.patch new file mode 100644 index 0000000..6e017e3 --- /dev/null +++ b/lustre/kernel_patches/patches/tcp-zero-copy-2.6-sles10.patch @@ -0,0 +1,450 @@ +Index: linux-2.6.16.21-0.8/net/core/skbuff.c +=================================================================== +--- linux-2.6.16.21-0.8.orig/net/core/skbuff.c 2006-08-03 21:11:25.000000000 -0600 ++++ linux-2.6.16.21-0.8/net/core/skbuff.c 2006-08-03 21:11:30.000000000 -0600 +@@ -170,7 +170,8 @@ + shinfo->ufo_size = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; +- ++ shinfo->zccd = NULL; /* skbuffs kick off with NO user zero copy descriptors */ ++ shinfo->zccd2 = NULL; + if (fclone) { + struct sk_buff *child = skb + 1; + atomic_t *fclone_ref = (atomic_t *) (child + 1); +@@ -287,6 +288,10 @@ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { ++ if (skb_shinfo(skb)->zccd != NULL) /* zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd); /* release hold */ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd zero copy callback descriptor? */ ++ zccd_put (skb_shinfo(skb)->zccd2); /* release hold */ + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) +@@ -606,6 +611,14 @@ + n->data_len = skb->data_len; + n->len = skb->len; + ++ if (skb_shinfo(skb)->zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd = skb_shinfo(skb)->zccd; ++ ++ if (skb_shinfo(skb)->zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (skb_shinfo(skb)->zccd2); /* 1 more ref (pages are shared) */ ++ skb_shinfo(n)->zccd2 = skb_shinfo(skb)->zccd2; ++ + if (skb_shinfo(skb)->nr_frags) { + int i; + +@@ -649,6 +662,9 @@ + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; ++ zccd_t *zccd = skb_shinfo(skb)->zccd; /* stash user zero copy descriptor */ ++ zccd_t *zccd2 = skb_shinfo(skb)->zccd2; /* stash 2nd user zero copy descriptor */ ++ + + if (skb_shared(skb)) + BUG(); +@@ -670,6 +686,11 @@ + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + ++ if (zccd != NULL) /* user zero copy descriptor? */ ++ zccd_get (zccd); /* extra ref (pages are shared) */ ++ if (zccd2 != NULL) /* 2nd user zero copy descriptor? */ ++ zccd_get (zccd2); /* extra ref (pages are shared) */ ++ + skb_release_data(skb); + + off = (data + nhead) - skb->head; +@@ -684,6 +705,8 @@ + skb->cloned = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); ++ skb_shinfo(skb)->zccd = zccd; ++ skb_shinfo(skb)->zccd2 = zccd2; + return 0; + + nodata: +Index: linux-2.6.16.21-0.8/net/ipv4/tcp.c +=================================================================== +--- linux-2.6.16.21-0.8.orig/net/ipv4/tcp.c 2006-08-03 21:11:25.000000000 -0600 ++++ linux-2.6.16.21-0.8/net/ipv4/tcp.c 2006-08-03 21:11:30.000000000 -0600 +@@ -498,8 +498,10 @@ + } + } + ++/* Extra parameter: user zero copy descriptor (or NULL if not doing that) */ + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, +- size_t psize, int flags) ++ size_t psize, int flags, zccd_t *zccd) ++ + { + struct tcp_sock *tp = tcp_sk(sk); + int mss_now, size_goal; +@@ -547,6 +549,17 @@ + copy = size; + + i = skb_shinfo(skb)->nr_frags; ++ ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != NULL && /* skb is part of a zcc I/O */ ++ skb_shinfo(skb)->zccd2 != NULL && ++ skb_shinfo(skb)->zccd != zccd && /* not the same one */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ tcp_mark_push (tp, skb); ++ goto new_segment; ++ } ++ + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); +@@ -562,6 +575,20 @@ + skb_fill_page_desc(skb, i, page, offset, copy); + } + ++ if (zccd != NULL && /* this is a zcc I/O */ ++ skb_shinfo(skb)->zccd != zccd && /* not already referencing this zccd */ ++ skb_shinfo(skb)->zccd2 != zccd) ++ { ++ zccd_get (zccd); /* bump ref count */ ++ ++ BUG_TRAP (skb_shinfo(skb)->zccd2 == NULL); ++ ++ if (skb_shinfo(skb)->zccd == NULL) /* reference this zccd */ ++ skb_shinfo(skb)->zccd = zccd; ++ else ++ skb_shinfo(skb)->zccd2 = zccd; ++ } ++ + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; +@@ -631,12 +658,37 @@ + + lock_sock(sk); + TCP_CHECK_TIMER(sk); +- res = do_tcp_sendpages(sk, &page, offset, size, flags); ++ res = do_tcp_sendpages(sk, &page, offset, size, flags,NULL); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return res; ++} ++ ++ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd) ++{ ++ ssize_t res; ++ struct sock *sk = sock->sk; ++ ++#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) ++ ++ if (!(sk->sk_route_caps & NETIF_F_SG) || /* caller shouldn't waste her time */ ++ !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) /* on double mapping */ ++ BUG (); ++ ++#undef TCP_ZC_CSUM_FLAGS ++ ++ lock_sock(sk); ++ TCP_CHECK_TIMER(sk); ++ ++ res = do_tcp_sendpages(sk, &page, offset, size, flags, zccd); ++ + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; + } + ++ + #define TCP_PAGE(sk) (sk->sk_sndmsg_page) + #define TCP_OFF(sk) (sk->sk_sndmsg_off) + +@@ -1482,6 +1534,202 @@ + goto out; + } + ++int tcp_recvpackets (struct sock *sk, struct sk_buff_head *packets, ++ int len, int nonblock) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ int copied; ++ long timeo; ++ ++ BUG_TRAP (len > 0); ++ /*BUG_TRAP ((flags & (MSG_OOB | MSG_PEEK | MSG_TRUNC)) == 0);*/ ++ ++ lock_sock(sk); ++ ++ TCP_CHECK_TIMER(sk); ++ ++ copied = -ENOTCONN; ++ if (sk->sk_state == TCP_LISTEN) ++ goto out; ++ ++ copied = 0; ++ timeo = sock_rcvtimeo(sk, nonblock); ++ ++ do { ++ struct sk_buff * skb; ++ u32 offset; ++ unsigned long used; ++ int exhausted; ++ int eaten; ++ ++ /* Are we at urgent data? Stop if we have read anything. */ ++ if (copied && tp->urg_data && tp->urg_seq == tp->copied_seq) ++ break; ++ ++ /* We need to check signals first, to get correct SIGURG ++ * handling. FIXME: Need to check this doesnt impact 1003.1g ++ * and move it down to the bottom of the loop ++ */ ++ if (signal_pending(current)) { ++ if (copied) ++ break; ++ copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; ++ break; ++ } ++ ++ /* Next get a buffer. */ ++ ++ skb = skb_peek(&sk->sk_receive_queue); ++ ++ if (skb == NULL) /* nothing ready */ ++ { ++ if (copied) { ++ if (sk->sk_err || ++ sk->sk_state == TCP_CLOSE || ++ (sk->sk_shutdown & RCV_SHUTDOWN) || ++ !timeo || ++ (0)) ++ break; ++ } else { ++ if (sock_flag(sk, SOCK_DONE)) ++ break; ++ ++ if (sk->sk_err) { ++ copied = sock_error(sk); ++ break; ++ } ++ ++ if (sk->sk_shutdown & RCV_SHUTDOWN) ++ break; ++ ++ if (sk->sk_state == TCP_CLOSE) { ++ if (!(sock_flag(sk, SOCK_DONE))) { ++ /* This occurs when user tries to read ++ * from never connected socket. ++ */ ++ copied = -ENOTCONN; ++ break; ++ } ++ break; ++ } ++ ++ if (!timeo) { ++ copied = -EAGAIN; ++ break; ++ } ++ } ++ ++ tcp_cleanup_rbuf(sk, copied); ++ sk_wait_data(sk, &timeo); ++ continue; ++ } ++ ++ BUG_TRAP (atomic_read (&skb->users) == 1); ++ ++ exhausted = eaten = 0; ++ ++ offset = tp->copied_seq - TCP_SKB_CB(skb)->seq; ++ if (skb->h.th->syn) ++ offset--; ++ ++ used = skb->len - offset; ++ ++ if (tp->urg_data) { ++ u32 urg_offset = tp->urg_seq - tp->copied_seq; ++ if (urg_offset < used) { ++ if (!urg_offset) { /* at urgent date */ ++ if (!(sock_flag(sk, SOCK_URGINLINE))) { ++ tp->copied_seq++; /* discard the single byte of urgent data */ ++ offset++; ++ used--; ++ } ++ } else /* truncate read */ ++ used = urg_offset; ++ } ++ } ++ ++ BUG_TRAP (used >= 0); ++ if (len < used) ++ used = len; ++ ++ if (used == 0) ++ exhausted = 1; ++ else ++ { ++ if (skb_is_nonlinear (skb)) ++ { ++ int rc = skb_linearize (skb, GFP_KERNEL); ++ ++ printk ("tcp_recvpackets(): linearising: %d\n", rc); ++ ++ if (rc) ++ { ++ if (!copied) ++ copied = rc; ++ break; ++ } ++ } ++ ++ if ((offset + used) == skb->len) /* consuming the whole packet */ ++ { ++ __skb_unlink (skb, &sk->sk_receive_queue); ++ dst_release (skb->dst); ++ skb_orphan (skb); ++ __skb_pull (skb, offset); ++ __skb_queue_tail (packets, skb); ++ exhausted = eaten = 1; ++ } ++ else /* consuming only part of the packet */ ++ { ++ struct sk_buff *skb2 = skb_clone (skb, GFP_KERNEL); ++ ++ if (skb2 == NULL) ++ { ++ if (!copied) ++ copied = -ENOMEM; ++ break; ++ } ++ ++ dst_release (skb2->dst); ++ __skb_pull (skb2, offset); ++ __skb_trim (skb2, used); ++ __skb_queue_tail (packets, skb2); ++ } ++ ++ tp->copied_seq += used; ++ copied += used; ++ len -= used; ++ } ++ ++ if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) { ++ tp->urg_data = 0; ++ tcp_fast_path_check(sk, tp); ++ } ++ ++ if (!exhausted) ++ continue; ++ ++ if (skb->h.th->fin) ++ { ++ tp->copied_seq++; ++ if (!eaten) ++ sk_eat_skb (sk, skb, 0); ++ break; ++ } ++ ++ if (!eaten) ++ sk_eat_skb (sk, skb, 0); ++ ++ } while (len > 0); ++ ++ out: ++ /* Clean up data we have read: This will do ACK frames. */ ++ tcp_cleanup_rbuf(sk, copied); ++ TCP_CHECK_TIMER(sk); ++ release_sock(sk); ++ return copied; ++} ++ + /* + * State processing on a close. This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some +@@ -2218,6 +2466,8 @@ + EXPORT_SYMBOL(tcp_recvmsg); + EXPORT_SYMBOL(tcp_sendmsg); + EXPORT_SYMBOL(tcp_sendpage); ++EXPORT_SYMBOL(tcp_sendpage_zccd); ++EXPORT_SYMBOL(tcp_recvpackets); + EXPORT_SYMBOL(tcp_setsockopt); + EXPORT_SYMBOL(tcp_shutdown); + EXPORT_SYMBOL(tcp_statistics); +Index: linux-2.6.16.21-0.8/include/linux/skbuff.h +=================================================================== +--- linux-2.6.16.21-0.8.orig/include/linux/skbuff.h 2006-08-03 21:11:25.000000000 -0600 ++++ linux-2.6.16.21-0.8/include/linux/skbuff.h 2006-08-03 21:11:30.000000000 -0600 +@@ -129,6 +129,30 @@ + __u16 size; + }; + ++/* Support for callback when skb data has been released */ ++typedef struct zccd /* Zero Copy Callback Descriptor */ ++{ /* (embed as first member of custom struct) */ ++ atomic_t zccd_count; /* reference count */ ++ void (*zccd_destructor)(struct zccd *); /* callback when refcount reaches zero */ ++} zccd_t; ++ ++static inline void zccd_init (zccd_t *d, void (*callback)(zccd_t *)) ++{ ++ atomic_set (&d->zccd_count, 1); ++ d->zccd_destructor = callback; ++} ++ ++static inline void zccd_get (zccd_t *d) /* take a reference */ ++{ ++ atomic_inc (&d->zccd_count); ++} ++ ++static inline void zccd_put (zccd_t *d) /* release a reference */ ++{ ++ if (atomic_dec_and_test (&d->zccd_count)) ++ (d->zccd_destructor)(d); ++} ++ + /* This data is invariant across clones and lives at + * the end of the header data, ie. at skb->end. + */ +@@ -140,6 +164,13 @@ + unsigned short ufo_size; + unsigned int ip6_frag_id; + struct sk_buff *frag_list; ++ zccd_t *zccd; /* zero copy descriptor */ ++ zccd_t *zccd2; /* 2nd zero copy descriptor */ ++ /* NB we expect zero-copy data to be at least 1 packet, so ++ * having 2 zccds means we don't unneccessarily split the packet ++ * where consecutive zero-copy sends abutt. ++ */ ++ + skb_frag_t frags[MAX_SKB_FRAGS]; + }; + +Index: linux-2.6.16.21-0.8/include/net/tcp.h +=================================================================== +--- linux-2.6.16.21-0.8.orig/include/net/tcp.h 2006-08-03 21:11:25.000000000 -0600 ++++ linux-2.6.16.21-0.8/include/net/tcp.h 2006-08-03 21:11:30.000000000 -0600 +@@ -272,6 +272,9 @@ + extern int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size); + extern ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ++extern ssize_t tcp_sendpage_zccd(struct socket *sock, struct page *page, int offset, size_t size, ++ int flags, zccd_t *zccd); ++ + + extern int tcp_ioctl(struct sock *sk, + int cmd, +@@ -356,6 +359,9 @@ + struct msghdr *msg, + size_t len, int nonblock, + int flags, int *addr_len); ++extern int tcp_recvpackets(struct sock *sk, ++ struct sk_buff_head *packets, ++ int len, int nonblock); + + extern void tcp_parse_options(struct sk_buff *skb, + struct tcp_options_received *opt_rx,