Whamcloud - gitweb
Branch HEAD
author liangzhen <liangzhen>
Fri, 7 Nov 2008 14:25:53 +0000 (14:25 +0000)
committer liangzhen <liangzhen>
Fri, 7 Nov 2008 14:25:53 +0000 (14:25 +0000)
Support zero-copy receive of Chelsio driver

b=15093
i=Isaac
i=Maxim

lnet/ChangeLog
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_lib-darwin.c
lnet/klnds/socklnd/socklnd_lib-linux.c
lnet/klnds/socklnd/socklnd_modparams.c

index b276ad9..e5bfa57 100644 (file)
@@ -17,6 +17,12 @@ Bugzilla   :
 Description: 
 Details    : 
 
+Severity   : major
+Bugzilla   : 15093
+Description: Support Zerocopy receive of Chelsio device
+Details    : Chelsio driver can support zerocopy for iov[1] if it's
+             contiguous and large enough.
+
 Severity   : normal
 Bugzilla   : 13490
 Description: fix credit flow deadlock in uptllnd
index 2fad395..8190d3d 100644 (file)
@@ -1051,6 +1051,7 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         }
 
         memset (conn, 0, sizeof (*conn));
+
         conn->ksnc_peer = NULL;
         conn->ksnc_route = NULL;
         conn->ksnc_sock = sock;
index d1985c9..ff7bd8f 100644 (file)
@@ -74,6 +74,13 @@ typedef struct                                  /* per scheduler state */
         struct list_head  kss_zombie_noop_txs;  /* zombie noop tx list */
         cfs_waitq_t       kss_waitq;            /* where scheduler sleeps */
         int               kss_nconns;           /* # connections assigned to this scheduler */
+#if !SOCKNAL_SINGLE_FRAG_RX
+        struct page      *kss_rx_scratch_pgs[LNET_MAX_IOV];
+#endif
+#if !SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_SINGLE_FRAG_RX
+        struct iovec      kss_scratch_iov[LNET_MAX_IOV];
+#endif
+
 } ksock_sched_t;
 
 typedef struct
@@ -112,6 +119,8 @@ typedef struct
         int              *ksnd_enable_csum;     /* enable check sum */
         int              *ksnd_inject_csum_error; /* set non-zero to inject checksum error */
         unsigned int     *ksnd_zc_min_frag;     /* minimum zero copy frag size */
+        int              *ksnd_zc_recv;         /* enable ZC receive (for Chelsio TOE) */
+        int              *ksnd_zc_recv_min_nfrags; /* minimum # of fragments to enable ZC receive */
 #ifdef CPU_AFFINITY
         int              *ksnd_irq_affinity;    /* enable IRQ affinity? */
 #endif
@@ -293,13 +302,6 @@ typedef struct ksock_conn
         atomic_t            ksnc_tx_nob;        /* # bytes queued */
         int                 ksnc_tx_ready;      /* write space */
         int                 ksnc_tx_scheduled;  /* being progressed */
-
-#if !SOCKNAL_SINGLE_FRAG_RX
-        struct iovec        ksnc_rx_scratch_iov[LNET_MAX_IOV];
-#endif
-#if !SOCKNAL_SINGLE_FRAG_TX
-        struct iovec        ksnc_tx_scratch_iov[LNET_MAX_IOV];
-#endif
 } ksock_conn_t;
 
 typedef struct ksock_route
index fbb2a5b..70e4294 100644 (file)
@@ -215,7 +215,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
         struct iovec   *scratchiov = &scratch;
         unsigned int    niov = 1;
 #else
-        struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov;
+        struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int    niov = tx->tx_niov;
 #endif
         struct msghdr msg = {
@@ -260,7 +260,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
         struct iovec *scratchiov = &scratch;
         unsigned int  niov = 1;
 #else
-        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = tx->tx_nkiov;
 #endif
         struct msghdr msg = {
@@ -302,7 +302,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
         struct iovec *scratchiov = &scratch;
         unsigned int  niov = 1;
 #else
-        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = conn->ksnc_rx_niov;
 #endif
         struct iovec *iov = conn->ksnc_rx_iov;
@@ -342,7 +342,7 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
         struct iovec *scratchiov = &scratch;
         unsigned int  niov = 1;
 #else
-        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = conn->ksnc_rx_nkiov;
 #endif
         lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
@@ -544,7 +544,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
         struct iovec   *scratchiov = &scratch; 
         unsigned int    niov = 1;
 #else 
-        struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov; 
+        struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; 
         unsigned int    niov = tx->tx_niov;
 #endif
         struct socket *sock = conn->ksnc_sock;
@@ -600,7 +600,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
         struct iovec *scratchiov = &scratch; 
         unsigned int  niov = 1;
 #else
-        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; 
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = tx->tx_nkiov;
 #endif
         struct socket *sock = conn->ksnc_sock;
@@ -738,7 +738,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
         struct iovec *scratchiov = &scratch; 
         unsigned int  niov = 1;
 #else 
-        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; 
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = conn->ksnc_rx_niov;
 #endif
         struct iovec *iov = conn->ksnc_rx_iov;
@@ -792,7 +792,7 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
         struct iovec *scratchiov = &scratch; 
         unsigned int  niov = 1;
 #else 
-        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; 
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = conn->ksnc_rx_nkiov;
 #endif
         lnet_kiov_t    *kiov = conn->ksnc_rx_kiov;
index 5b851c4..c4dc1e1 100644 (file)
@@ -37,7 +37,7 @@
 #include "socklnd.h"
 
 # if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-static cfs_sysctl_table_t ksocknal_ctl_table[21];
+static cfs_sysctl_table_t ksocknal_ctl_table[23];
 
 cfs_sysctl_table_t ksocknal_top_ctl_table[] = {
         {
@@ -57,6 +57,12 @@ ksocknal_lib_tunables_init ()
         int    i = 0;
         int    j = 1;
 
+        if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2)
+                *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2;
+
+        if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV)
+                *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV;
+
         ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) {
                 .ctl_name = j++,
                 .procname = "timeout",
@@ -123,6 +129,22 @@ ksocknal_lib_tunables_init ()
         };
         ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) {
                 .ctl_name = j++,
+                .procname = "zero_copy_recv",
+                .data     = ksocknal_tunables.ksnd_zc_recv,
+                .maxlen   = sizeof (int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec
+        };
+        ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) {
+                .ctl_name = j++,
+                .procname = "zero_copy_recv_min_nfrags",
+                .data     = ksocknal_tunables.ksnd_zc_recv_min_nfrags,
+                .maxlen   = sizeof (int),
+                .mode     = 0644,
+                .proc_handler = &proc_dointvec
+        };
+        ksocknal_ctl_table[i++] = (cfs_sysctl_table_t) {
+                .ctl_name = j++,
                 .procname = "typed",
                 .data     = ksocknal_tunables.ksnd_typed_conns,
                 .maxlen   = sizeof (int),
@@ -387,7 +409,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
                 struct iovec   *scratchiov = &scratch;
                 unsigned int    niov = 1;
 #else
-                struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov;
+                struct iovec   *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
                 unsigned int    niov = tx->tx_niov;
 #endif
                 struct msghdr msg = {
@@ -460,7 +482,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 #ifdef CONFIG_HIGHMEM
 #warning "XXX risk of kmap deadlock on multiple frags..."
 #endif
-                struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+                struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
                 unsigned int  niov = tx->tx_nkiov;
 #endif
                 struct msghdr msg = {
@@ -521,7 +543,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
         struct iovec *scratchiov = &scratch;
         unsigned int  niov = 1;
 #else
-        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
         unsigned int  niov = conn->ksnc_rx_niov;
 #endif
         struct iovec *iov = conn->ksnc_rx_iov;
@@ -581,26 +603,72 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn)
         return rc;
 }
 
+static void
+ksocknal_lib_kiov_vunmap(void *addr)   /* release a mapping made by ksocknal_lib_kiov_vmap(); addr is the value it returned */
+{
+        if (addr == NULL)               /* NULL is what ksocknal_lib_kiov_vmap() returns when it did not map: nothing to undo */
+                return;
+
+        vunmap(addr);
+}
+
+static void *  /* returns the vmap() base address (hand it to ksocknal_lib_kiov_vunmap()), or NULL if nothing was mapped */
+ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov,    /* kiov[0..niov-1]: page fragments to present as one buffer */
+                       struct iovec *iov, struct page **pages) /* iov: single entry filled in on success; pages: scratch array, may be NULL */
+{
+        void             *addr;         /* base address returned by vmap() */
+        int               nob;          /* sum of kiov_len over all fragments */
+        int               i;
+
+        if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) /* tunable is off, or the caller supplied no page array */
+                return NULL;
+
+        LASSERT (niov <= LNET_MAX_IOV);
+
+        if (niov < 2 ||                 /* fewer than two fragments, or ... */
+            niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) /* ... fewer than the configured minimum: do not map */
+                return NULL;
+
+        for (nob = i = 0; i < niov; i++) {
+                if ((kiov[i].kiov_offset != 0 && i > 0) ||     /* every fragment after the first must start at offset 0 */
+                    (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) /* every fragment before the last must end at CFS_PAGE_SIZE */
+                        return NULL;    /* otherwise give up; nothing has been mapped yet */
+
+                pages[i] = kiov[i].kiov_page;   /* collect the pages in fragment order for vmap() */
+                nob += kiov[i].kiov_len;
+        }
+
+        addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL);
+        if (addr == NULL)               /* mapping failed: report "not mapped" to the caller */
+                return NULL;
+
+        iov->iov_base = addr + kiov[0].kiov_offset;    /* data begins at the first fragment's offset within the first page */
+        iov->iov_len = nob;             /* one iovec entry spans all fragments */
+
+        return addr;
+}
+
 int
 ksocknal_lib_recv_kiov (ksock_conn_t *conn)
 {
 #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
-        struct iovec  scratch;
-        struct iovec *scratchiov = &scratch;
-        unsigned int  niov = 1;
+        struct iovec   scratch;
+        struct iovec  *scratchiov = &scratch;
+        struct page  **pages      = NULL;
+        unsigned int   niov       = 1;
 #else
 #ifdef CONFIG_HIGHMEM
 #warning "XXX risk of kmap deadlock on multiple frags..."
 #endif
-        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
-        unsigned int  niov = conn->ksnc_rx_nkiov;
+        struct iovec  *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+        struct page  **pages      = conn->ksnc_scheduler->kss_rx_scratch_pgs;
+        unsigned int   niov       = conn->ksnc_rx_nkiov;
 #endif
         lnet_kiov_t   *kiov = conn->ksnc_rx_kiov;
         struct msghdr msg = {
                 .msg_name       = NULL,
                 .msg_namelen    = 0,
                 .msg_iov        = scratchiov,
-                .msg_iovlen     = niov,
                 .msg_control    = NULL,
                 .msg_controllen = 0,
                 .msg_flags      = 0
@@ -610,15 +678,26 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
         int          i;
         int          rc;
         void        *base;
+        void        *addr;
         int          sum;
         int          fragnob;
 
         /* NB we can't trust socket ops to either consume our iovs
          * or leave them alone. */
-        for (nob = i = 0; i < niov; i++) {
-                scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
-                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) {
+                nob = scratchiov[0].iov_len;
+                msg.msg_iovlen = 1;
+
+        } else {
+                for (nob = i = 0; i < niov; i++) {
+                        nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+                        scratchiov[i].iov_base = kmap(kiov[i].kiov_page) +
+                                                 kiov[i].kiov_offset;
+                }
+                msg.msg_iovlen = niov;
         }
+
+
         LASSERT (nob <= conn->ksnc_rx_nob_wanted);
 
         set_fs (KERNEL_DS);
@@ -645,8 +724,13 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn)
                         kunmap(kiov[i].kiov_page);
                 }
         }
-        for (i = 0; i < niov; i++)
-                kunmap(kiov[i].kiov_page);
+
+        if (addr != NULL) {
+                ksocknal_lib_kiov_vunmap(addr);
+        } else {
+                for (i = 0; i < niov; i++)
+                        kunmap(kiov[i].kiov_page);
+        }
 
         return (rc);
 }
index 2530dcf..79d9dbd 100644 (file)
@@ -117,6 +117,14 @@ static unsigned int zc_min_frag = (2<<10);
 CFS_MODULE_PARM(zc_min_frag, "i", int, 0644,
                 "minimum fragment to zero copy");
 
+static int zc_recv = 0;         /* signed int: registered as "int" below and referenced through the int *ksnd_zc_recv tunable */
+CFS_MODULE_PARM(zc_recv, "i", int, 0444,
+                "enable ZC recv for Chelsio driver");
+
+static int zc_recv_min_nfrags = 16;     /* signed int to match int *ksnd_zc_recv_min_nfrags; ksocknal_lib_tunables_init() clamps it to [2, LNET_MAX_IOV] */
+CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0444,
+                "minimum # of fragments to enable ZC recv");
+
 #ifdef SOCKNAL_BACKOFF
 static int backoff_init = 3;
 CFS_MODULE_PARM(backoff_init, "i", int, 0644,
@@ -139,40 +147,43 @@ int ksocknal_tunables_init(void)
 {
 
         /* initialize ksocknal_tunables structure */
-        ksocknal_tunables.ksnd_timeout         = &sock_timeout;
-        ksocknal_tunables.ksnd_nconnds         = &nconnds;
-        ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
-        ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
-        ksocknal_tunables.ksnd_eager_ack       = &eager_ack;
-        ksocknal_tunables.ksnd_typed_conns     = &typed_conns;
-        ksocknal_tunables.ksnd_min_bulk        = &min_bulk;
-        ksocknal_tunables.ksnd_tx_buffer_size  = &tx_buffer_size;
-        ksocknal_tunables.ksnd_rx_buffer_size  = &rx_buffer_size;
-        ksocknal_tunables.ksnd_nagle           = &nagle;
-        ksocknal_tunables.ksnd_keepalive_idle  = &keepalive_idle;
-        ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
-        ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
-        ksocknal_tunables.ksnd_credits         = &credits;
-        ksocknal_tunables.ksnd_peercredits     = &peer_credits;
-        ksocknal_tunables.ksnd_enable_csum     = &enable_csum;
-        ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
-        ksocknal_tunables.ksnd_zc_min_frag     = &zc_min_frag;
+        ksocknal_tunables.ksnd_timeout            = &sock_timeout;
+        ksocknal_tunables.ksnd_nconnds            = &nconnds;
+        ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+        ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+        ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
+        ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
+        ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
+        ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+        ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+        ksocknal_tunables.ksnd_nagle              = &nagle;
+        ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+        ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+        ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+        ksocknal_tunables.ksnd_credits            = &credits;
+        ksocknal_tunables.ksnd_peercredits        = &peer_credits;
+        ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
+        ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+        ksocknal_tunables.ksnd_zc_min_frag        = &zc_min_frag;
+        ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
+        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+
 
 #ifdef CPU_AFFINITY
-        ksocknal_tunables.ksnd_irq_affinity    = &enable_irq_affinity;
+        ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
 #endif
 
 #ifdef SOCKNAL_BACKOFF
-        ksocknal_tunables.ksnd_backoff_init     = &backoff_init;
-        ksocknal_tunables.ksnd_backoff_max      = &backoff_max;
+        ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
+        ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
-        ksocknal_tunables.ksnd_protocol         = &protocol;
+        ksocknal_tunables.ksnd_protocol           = &protocol;
 #endif
 
 #if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-        ksocknal_tunables.ksnd_sysctl           =  NULL;
+        ksocknal_tunables.ksnd_sysctl             =  NULL;
 #endif
 
         /* initialize platform-sepcific tunables */