From f9d9a20e1f0de7e7099537ec5a47cef5130e283a Mon Sep 17 00:00:00 2001 From: liangzhen Date: Fri, 7 Nov 2008 14:25:53 +0000 Subject: [PATCH] Branch HEAD Support zero-copy receive of Chelsio driver b=15093 i=Isaac i=Maxim --- lnet/ChangeLog | 6 ++ lnet/klnds/socklnd/socklnd.c | 1 + lnet/klnds/socklnd/socklnd.h | 16 +++-- lnet/klnds/socklnd/socklnd_lib-darwin.c | 16 ++--- lnet/klnds/socklnd/socklnd_lib-linux.c | 114 +++++++++++++++++++++++++++----- lnet/klnds/socklnd/socklnd_modparams.c | 57 +++++++++------- 6 files changed, 157 insertions(+), 53 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index b276ad9..e5bfa57 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -17,6 +17,12 @@ Bugzilla : Description: Details : +Severity : major +Bugzilla : 15093 +Description: Support Zerocopy receive of Chelsio device +Details : Chelsio driver can support zerocopy for iov[1] if it's + contiguous and large enough. + Severity : normal Bugzilla : 13490 Description: fix credit flow deadlock in uptllnd diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 2fad395..8190d3d 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1051,6 +1051,7 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, } memset (conn, 0, sizeof (*conn)); + conn->ksnc_peer = NULL; conn->ksnc_route = NULL; conn->ksnc_sock = sock; diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index d1985c9..ff7bd8f 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -74,6 +74,13 @@ typedef struct /* per scheduler state */ struct list_head kss_zombie_noop_txs; /* zombie noop tx list */ cfs_waitq_t kss_waitq; /* where scheduler sleeps */ int kss_nconns; /* # connections assigned to this scheduler */ +#if !SOCKNAL_SINGLE_FRAG_RX + struct page *kss_rx_scratch_pgs[LNET_MAX_IOV]; +#endif +#if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX + struct iovec kss_scratch_iov[LNET_MAX_IOV]; +#endif + } ksock_sched_t; typedef struct @@ -112,6 +119,8 @@ typedef struct int *ksnd_enable_csum; /* enable check sum */ int *ksnd_inject_csum_error; /* set non-zero to inject checksum error */ unsigned int *ksnd_zc_min_frag; /* minimum zero copy frag size */ + int *ksnd_zc_recv; /* enable ZC receive (for Chelsio TOE) */ + int *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */ #ifdef CPU_AFFINITY int *ksnd_irq_affinity; /* enable IRQ affinity? */ #endif @@ -293,13 +302,6 @@ typedef struct ksock_conn atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ - -#if !SOCKNAL_SINGLE_FRAG_RX - struct iovec ksnc_rx_scratch_iov[LNET_MAX_IOV]; -#endif -#if !SOCKNAL_SINGLE_FRAG_TX - struct iovec ksnc_tx_scratch_iov[LNET_MAX_IOV]; -#endif } ksock_conn_t; typedef struct ksock_route diff --git a/lnet/klnds/socklnd/socklnd_lib-darwin.c b/lnet/klnds/socklnd/socklnd_lib-darwin.c index fbb2a5b..70e4294 100644 --- a/lnet/klnds/socklnd/socklnd_lib-darwin.c +++ b/lnet/klnds/socklnd/socklnd_lib-darwin.c @@ -215,7 +215,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { @@ -260,7 +260,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { @@ -302,7 +302,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; @@ -342,7 +342,7 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; @@ -544,7 +544,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct socket *sock = conn->ksnc_sock; @@ -600,7 +600,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct socket *sock = conn->ksnc_sock; @@ -738,7 +738,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; @@ -792,7 +792,7 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 5b851c4..c4dc1e1 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -37,7 +37,7 @@ #include "socklnd.h" # if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM -static cfs_sysctl_table_t ksocknal_ctl_table[21]; +static cfs_sysctl_table_t ksocknal_ctl_table[23]; cfs_sysctl_table_t ksocknal_top_ctl_table[] = { { @@ -57,6 +57,12 @@ ksocknal_lib_tunables_init () int i = 0; int j = 1; + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2; + + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV; + ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { .ctl_name = j++, .procname = "timeout", @@ -123,6 +129,22 @@ ksocknal_lib_tunables_init () }; ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { .ctl_name = j++, + .procname = "zero_copy_recv", + .data = ksocknal_tunables.ksnd_zc_recv, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec + }; + ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { + .ctl_name = j++, + .procname = "zero_copy_recv_min_nfrags", + .data = ksocknal_tunables.ksnd_zc_recv_min_nfrags, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec + }; + ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) { + .ctl_name = j++, .procname = "typed", .data = ksocknal_tunables.ksnd_typed_conns, .maxlen = sizeof (int), @@ -387,7 +409,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { @@ -460,7 +482,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { @@ -521,7 +543,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; @@ -581,26 +603,72 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) return rc; } +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct iovec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; +} + int ksocknal_lib_recv_kiov (ksock_conn_t *conn) { #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct iovec scratch; - struct iovec *scratchiov = &scratch; - unsigned int niov = 1; + struct iovec scratch; + struct iovec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; #else #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - unsigned int niov = conn->ksnc_rx_nkiov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_iov = scratchiov, - .msg_iovlen = niov, .msg_control = NULL, .msg_controllen = 0, .msg_flags = 0 @@ -610,15 +678,26 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) int i; int rc; void *base; + void *addr; int sum; int fragnob; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ - for (nob = i = 0; i < niov; i++) { - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; - nob += scratchiov[i].iov_len = kiov[i].kiov_len; + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + msg.msg_iovlen = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + msg.msg_iovlen = niov; } + + LASSERT (nob <= conn->ksnc_rx_nob_wanted); set_fs (KERNEL_DS); @@ -645,8 +724,13 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) kunmap(kiov[i].kiov_page); } } - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); + + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } return (rc); } diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c index 2530dcf..79d9dbd 100644 --- a/lnet/klnds/socklnd/socklnd_modparams.c +++ b/lnet/klnds/socklnd/socklnd_modparams.c @@ -117,6 +117,14 @@ static unsigned int zc_min_frag = (2<<10); CFS_MODULE_PARM(zc_min_frag, "i", int, 0644, "minimum fragment to zero copy"); +static unsigned int zc_recv = 0; +CFS_MODULE_PARM(zc_recv, "i", int, 0444, + "enable ZC recv for Chelsio driver"); + +static unsigned int zc_recv_min_nfrags = 16; +CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0444, + "minimum # of fragments to enable ZC recv"); + #ifdef SOCKNAL_BACKOFF static int backoff_init = 3; CFS_MODULE_PARM(backoff_init, "i", int, 0644, @@ -139,40 +147,43 @@ int ksocknal_tunables_init(void) { /* initialize ksocknal_tunables structure */ - ksocknal_tunables.ksnd_timeout = &sock_timeout; - ksocknal_tunables.ksnd_nconnds = &nconnds; - ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; - ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; - ksocknal_tunables.ksnd_eager_ack = &eager_ack; - ksocknal_tunables.ksnd_typed_conns = &typed_conns; - ksocknal_tunables.ksnd_min_bulk = &min_bulk; - ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; - ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; - ksocknal_tunables.ksnd_nagle = &nagle; - ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; - ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; - ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; - ksocknal_tunables.ksnd_credits = &credits; - ksocknal_tunables.ksnd_peercredits = &peer_credits; - ksocknal_tunables.ksnd_enable_csum = &enable_csum; - ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; - ksocknal_tunables.ksnd_zc_min_frag = &zc_min_frag; + ksocknal_tunables.ksnd_timeout = &sock_timeout; + ksocknal_tunables.ksnd_nconnds = &nconnds; + ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms; + ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms; + ksocknal_tunables.ksnd_eager_ack = &eager_ack; + ksocknal_tunables.ksnd_typed_conns = &typed_conns; + ksocknal_tunables.ksnd_min_bulk = &min_bulk; + ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size; + ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size; + ksocknal_tunables.ksnd_nagle = &nagle; + ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle; + ksocknal_tunables.ksnd_keepalive_count = &keepalive_count; + ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl; + ksocknal_tunables.ksnd_credits = &credits; + ksocknal_tunables.ksnd_peercredits = &peer_credits; + ksocknal_tunables.ksnd_enable_csum = &enable_csum; + ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error; + ksocknal_tunables.ksnd_zc_min_frag = &zc_min_frag; + ksocknal_tunables.ksnd_zc_recv = &zc_recv; + ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags; + #ifdef CPU_AFFINITY - ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity; + ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity; #endif #ifdef SOCKNAL_BACKOFF - ksocknal_tunables.ksnd_backoff_init = &backoff_init; - ksocknal_tunables.ksnd_backoff_max = &backoff_max; + ksocknal_tunables.ksnd_backoff_init = &backoff_init; + ksocknal_tunables.ksnd_backoff_max = &backoff_max; #endif #if SOCKNAL_VERSION_DEBUG - ksocknal_tunables.ksnd_protocol = &protocol; + ksocknal_tunables.ksnd_protocol = &protocol; #endif #if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM - ksocknal_tunables.ksnd_sysctl = NULL; + ksocknal_tunables.ksnd_sysctl = NULL; #endif /* initialize platform-sepcific tunables */ -- 1.8.3.1