X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_lib-linux.c;h=54513addcf9a900eb9863d104647067d3bef1551;hp=955849cead2ede29e9c156d2d8aa09767cd99d52;hb=2120ef466f9748774086177a3119fe641067a53d;hpb=a1027e45b32e21ee1c94832329b1577c6e8ab8bb diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 955849c..54513ad 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -1,92 +1,323 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. */ #include "socklnd.h" -# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM -static ctl_table ksocknal_ctl_table[21]; +# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + +static struct ctl_table ksocknal_ctl_table[] = { + { + INIT_CTL_NAME + .procname = "timeout", + .data = &ksocknal_tunables.ksnd_timeout, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "credits", + .data = &ksocknal_tunables.ksnd_credits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "peer_credits", + .data = &ksocknal_tunables.ksnd_peertxcredits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "peer_buffer_credits", + .data = &ksocknal_tunables.ksnd_peerrtrcredits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "peer_timeout", + .data = &ksocknal_tunables.ksnd_peertimeout, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "nconnds", + .data = &ksocknal_tunables.ksnd_nconnds, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "min_reconnectms", + .data = &ksocknal_tunables.ksnd_min_reconnectms, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "max_reconnectms", + .data = &ksocknal_tunables.ksnd_max_reconnectms, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "eager_ack", + .data = &ksocknal_tunables.ksnd_eager_ack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "zero_copy", + .data = &ksocknal_tunables.ksnd_zc_min_payload, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "zero_copy_recv", + .data = &ksocknal_tunables.ksnd_zc_recv, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "zero_copy_recv", + .data = &ksocknal_tunables.ksnd_zc_recv_min_nfrags, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "typed", + .data = &ksocknal_tunables.ksnd_typed_conns, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "min_bulk", + .data = &ksocknal_tunables.ksnd_min_bulk, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "rx_buffer_size", + .data = &ksocknal_tunables.ksnd_rx_buffer_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "tx_buffer_size", + .data = &ksocknal_tunables.ksnd_tx_buffer_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "nagle", + .data = &ksocknal_tunables.ksnd_nagle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, +#ifdef CPU_AFFINITY + { + INIT_CTL_NAME + .procname = "irq_affinity", + .data = &ksocknal_tunables.ksnd_irq_affinity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, +#endif + { + INIT_CTL_NAME + .procname = "round_robin", + .data = &ksocknal_tunables.ksnd_round_robin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "keepalive", + .data = &ksocknal_tunables.ksnd_keepalive, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "keepalive_idle", + .data = &ksocknal_tunables.ksnd_keepalive_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "keepalive_count", + .data = &ksocknal_tunables.ksnd_keepalive_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "keepalive_intvl", + .data = &ksocknal_tunables.ksnd_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, +#ifdef SOCKNAL_BACKOFF + { + INIT_CTL_NAME + .procname = "backoff_init", + .data = &ksocknal_tunables.ksnd_backoff_init, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, + { + INIT_CTL_NAME + .procname = "backoff_max", + .data = &ksocknal_tunables.ksnd_backoff_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, +#endif +#if SOCKNAL_VERSION_DEBUG + { + INIT_CTL_NAME + .procname = "protocol", + .data = &ksocknal_tunables.ksnd_protocol, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + INIT_STRATEGY + }, +#endif + { 0 } +}; -ctl_table ksocknal_top_ctl_table[] = { - {200, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, - { 0 } +struct ctl_table ksocknal_top_ctl_table[] = { + { + INIT_CTL_NAME + .procname = "socknal", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ksocknal_ctl_table + }, + { 0 } }; int ksocknal_lib_tunables_init () { - int i = 0; - int j = 1; - - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "timeout", ksocknal_tunables.ksnd_timeout, - sizeof (int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "credits", ksocknal_tunables.ksnd_credits, - sizeof (int), 0444, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "peer_credits", ksocknal_tunables.ksnd_peercredits, - sizeof (int), 0444, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "nconnds", ksocknal_tunables.ksnd_nconnds, - sizeof (int), 0444, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "min_reconnectms", ksocknal_tunables.ksnd_min_reconnectms, - sizeof (int), 0444, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "max_reconnectms", ksocknal_tunables.ksnd_max_reconnectms, - sizeof (int), 0444, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "eager_ack", ksocknal_tunables.ksnd_eager_ack, - sizeof (int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "zero_copy", ksocknal_tunables.ksnd_zc_min_frag, - sizeof (int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "typed", ksocknal_tunables.ksnd_typed_conns, - sizeof (int), 0444, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "min_bulk", ksocknal_tunables.ksnd_min_bulk, - sizeof (int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "rx_buffer_size", ksocknal_tunables.ksnd_rx_buffer_size, - sizeof(int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "tx_buffer_size", ksocknal_tunables.ksnd_tx_buffer_size, - sizeof(int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "nagle", ksocknal_tunables.ksnd_nagle, - sizeof(int), 0644, NULL, &proc_dointvec}; -#if CPU_AFFINITY - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "irq_affinity", ksocknal_tunables.ksnd_irq_affinity, - sizeof(int), 0644, NULL, &proc_dointvec}; -#endif - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "keepalive_idle", ksocknal_tunables.ksnd_keepalive_idle, - sizeof(int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "keepalive_count", ksocknal_tunables.ksnd_keepalive_count, - sizeof(int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "keepalive_intvl", ksocknal_tunables.ksnd_keepalive_intvl, - sizeof(int), 0644, NULL, &proc_dointvec}; -#ifdef SOCKNAL_BACKOFF - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "backoff_init", ksocknal_tunables.ksnd_backoff_init, - sizeof(int), 0644, NULL, &proc_dointvec}; - ksocknal_ctl_table[i++] = (ctl_table) - {j++, "backoff_max", ksocknal_tunables.ksnd_backoff_max, - sizeof(int), 0644, NULL, &proc_dointvec}; + if (!*ksocknal_tunables.ksnd_typed_conns) { + int rc = -EINVAL; +#if SOCKNAL_VERSION_DEBUG + if (*ksocknal_tunables.ksnd_protocol < 3) + rc = 0; #endif + if (rc != 0) { + CERROR("Protocol V3.x MUST have typed connections\n"); + return rc; + } + } - LASSERT (j == i+1); - LASSERT (i < sizeof(ksocknal_ctl_table)/sizeof(ksocknal_ctl_table[0])); + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2; + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV; - ksocknal_tunables.ksnd_sysctl = - cfs_register_sysctl_table(ksocknal_top_ctl_table, 0); + ksocknal_tunables.ksnd_sysctl = + register_sysctl_table(ksocknal_top_ctl_table); - if (ksocknal_tunables.ksnd_sysctl == NULL) + if (ksocknal_tunables.ksnd_sysctl == NULL) CWARN("Can't setup /proc tunables\n"); return 0; @@ -95,14 +326,14 @@ ksocknal_lib_tunables_init () void ksocknal_lib_tunables_fini () { - if (ksocknal_tunables.ksnd_sysctl != NULL) - cfs_unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl); + if (ksocknal_tunables.ksnd_sysctl != NULL) + unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl); } #else int ksocknal_lib_tunables_init () { - return 0; + return 0; } void @@ -111,59 +342,12 @@ ksocknal_lib_tunables_fini () } #endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */ -void -ksocknal_lib_bind_irq (unsigned int irq) -{ -#if (defined(CONFIG_SMP) && CPU_AFFINITY) - int bind; - int cpu; - char cmdline[64]; - ksock_irqinfo_t *info; - char *argv[] = {"/bin/sh", - "-c", - cmdline, - NULL}; - char *envp[] = {"HOME=/", - "PATH=/sbin:/bin:/usr/sbin:/usr/bin", - NULL}; - - LASSERT (irq < NR_IRQS); - if (irq == 0) /* software NIC or affinity disabled */ - return; - - info = &ksocknal_data.ksnd_irqinfo[irq]; - - write_lock_bh (&ksocknal_data.ksnd_global_lock); - - LASSERT (info->ksni_valid); - bind = !info->ksni_bound; - info->ksni_bound = 1; - - write_unlock_bh (&ksocknal_data.ksnd_global_lock); - - if (!bind) /* bound already */ - return; - - cpu = ksocknal_irqsched2cpu(info->ksni_sched); - snprintf (cmdline, sizeof (cmdline), - "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); - - LCONSOLE_INFO("Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); - - /* FIXME: Find a better method of setting IRQ affinity... - */ - - USERMODEHELPER(argv[0], argv, envp); -#endif -} - int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) { int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, - &conn->ksnc_ipaddr, - &conn->ksnc_port); + &conn->ksnc_ipaddr, + &conn->ksnc_port); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ LASSERT (!conn->ksnc_closing); @@ -174,7 +358,7 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) } rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, - &conn->ksnc_myipaddr, NULL); + &conn->ksnc_myipaddr, NULL); if (rc != 0) { CERROR ("Error %d getting sock local IP\n", rc); return rc; @@ -183,41 +367,17 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) return 0; } -unsigned int -ksocknal_lib_sock_irq (struct socket *sock) +int +ksocknal_lib_zc_capable(ksock_conn_t *conn) { - int irq = 0; -#if CPU_AFFINITY - struct dst_entry *dst; - - if (!*ksocknal_tunables.ksnd_irq_affinity) - return 0; - - dst = sk_dst_get (sock->sk); - if (dst != NULL) { - if (dst->dev != NULL) { - irq = dst->dev->irq; - if (irq >= NR_IRQS) { - CERROR ("Unexpected IRQ %x\n", irq); - irq = 0; - } - } - dst_release (dst); - } + int caps = conn->ksnc_sock->sk->sk_route_caps; -#endif - return irq; -} + if (conn->ksnc_proto == &ksocknal_protocol_v1x) + return 0; -int -ksocknal_lib_zc_capable(struct socket *sock) -{ - int caps = sock->sk->sk_route_caps; - - /* ZC if the socket supports scatter/gather and doesn't need software - * checksums */ - return ((caps & NETIF_F_SG) != 0 && - (caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) != 0); + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && (caps & NETIF_F_ALL_CSUM) != 0); } int @@ -242,7 +402,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { @@ -262,7 +422,7 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) nob += scratchiov[i].iov_len; } - if (!list_empty(&conn->ksnc_tx_queue) || + if (!cfs_list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) msg.msg_flags |= MSG_MORE; @@ -270,23 +430,25 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) rc = sock_sendmsg(sock, &msg, nob); set_fs (oldmm); } - return rc; + return rc; } int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { struct socket *sock = conn->ksnc_sock; - lnet_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int rc; int nob; + /* Not NOOP message */ + LASSERT (tx->tx_lnetmsg != NULL); + /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ - - if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag && - tx->tx_msg.ksm_zc_req_cookie != 0) { + if (tx->tx_msg.ksm_zc_cookies[0] != 0) { /* Zero copy is enabled */ + struct sock *sk = sock->sk; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; int fragsize = kiov->kiov_len; @@ -295,11 +457,17 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) CDEBUG(D_NET, "page %p + offset %x for %d\n", page, offset, kiov->kiov_len); - if (!list_empty(&conn->ksnc_tx_queue) || + if (!cfs_list_empty(&conn->ksnc_tx_queue) || fragsize < tx->tx_resid) msgflg |= MSG_MORE; - rc = tcp_sendpage(sock, page, offset, fragsize, msgflg); + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = cfs_tcp_sendpage(sk, page, offset, fragsize, + msgflg); + } } else { #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; @@ -309,7 +477,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { @@ -330,7 +498,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) nob += scratchiov[i].iov_len = kiov[i].kiov_len; } - if (!list_empty(&conn->ksnc_tx_queue) || + if (!cfs_list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) msg.msg_flags |= MSG_MORE; @@ -341,7 +509,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) for (i = 0; i < niov; i++) kunmap(kiov[i].kiov_page); } - return rc; + return rc; } void @@ -370,7 +538,7 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) struct iovec *scratchiov = &scratch; unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; @@ -420,36 +588,83 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) fragnob = iov[i].iov_len; if (fragnob > sum) fragnob = sum; - - conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, iov[i].iov_base, fragnob); } conn->ksnc_msg.ksm_csum = saved_csum; } - return rc; + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct iovec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != + PAGE_CACHE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; } int ksocknal_lib_recv_kiov (ksock_conn_t *conn) { #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct iovec scratch; - struct iovec *scratchiov = &scratch; - unsigned int niov = 1; + struct iovec scratch; + struct iovec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; #else #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." #endif - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - unsigned int niov = conn->ksnc_rx_nkiov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; #endif lnet_kiov_t *kiov = conn->ksnc_rx_kiov; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_iov = scratchiov, - .msg_iovlen = niov, .msg_control = NULL, .msg_controllen = 0, .msg_flags = 0 @@ -459,15 +674,25 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) int i; int rc; void *base; + void *addr; int sum; int fragnob; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ - for (nob = i = 0; i < niov; i++) { - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; - nob += scratchiov[i].iov_len = kiov[i].kiov_len; + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + msg.msg_iovlen = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + msg.msg_iovlen = niov; } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); set_fs (KERNEL_DS); @@ -487,20 +712,26 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) fragnob = kiov[i].kiov_len; if (fragnob > sum) fragnob = sum; - + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, base, fragnob); kunmap(kiov[i].kiov_page); } } - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); - return (rc); + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); + } + + return (rc); } -void ksocknal_lib_csum_tx(ksock_tx_t *tx) +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) { int i; __u32 csum; @@ -553,13 +784,13 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int return (-ESHUTDOWN); } - rc = libcfs_sock_getbuf(sock, txmem, rxmem); + rc = libcfs_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { len = sizeof(*nagle); - set_fs(KERNEL_DS); + set_fs(KERNEL_DS); rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)nagle, &len); - set_fs(oldmm); + set_fs(oldmm); } ksocknal_connsock_decref(conn); @@ -624,20 +855,23 @@ ksocknal_lib_setup_sock (struct socket *sock) } } - rc = libcfs_sock_setbuf(sock, + rc = libcfs_sock_setbuf(sock, *ksocknal_tunables.ksnd_tx_buffer_size, *ksocknal_tunables.ksnd_rx_buffer_size); - if (rc != 0) { - CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", *ksocknal_tunables.ksnd_tx_buffer_size, *ksocknal_tunables.ksnd_rx_buffer_size, rc); - return (rc); - } + return (rc); + } /* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ #ifdef SOCKNAL_BACKOFF if (*ksocknal_tunables.ksnd_backoff_init > 0) { option = *ksocknal_tunables.ksnd_backoff_init; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif set_fs (KERNEL_DS); rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_INIT, @@ -652,6 +886,9 @@ ksocknal_lib_setup_sock (struct socket *sock) if (*ksocknal_tunables.ksnd_backoff_max > 0) { option = *ksocknal_tunables.ksnd_backoff_max; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif set_fs (KERNEL_DS); rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_MAX, @@ -715,30 +952,11 @@ ksocknal_lib_setup_sock (struct socket *sock) return (0); } -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -struct tcp_opt *sock2tcp_opt(struct sock *sk) -{ - return &(sk->tp_pinfo.af_tcp); -} -#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)) -#define sock2tcp_opt(sk) tcp_sk(sk) -#else -struct tcp_opt *sock2tcp_opt(struct sock *sk) -{ - struct tcp_sock *s = (struct tcp_sock *)sk; - return &s->tcp; -} -#endif - void ksocknal_lib_push_conn (ksock_conn_t *conn) { struct sock *sk; -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)) - struct tcp_opt *tp; -#else struct tcp_sock *tp; -#endif int nonagle; int val = 1; int rc; @@ -748,8 +966,8 @@ ksocknal_lib_push_conn (ksock_conn_t *conn) if (rc != 0) /* being shut down */ return; - sk = conn->ksnc_sock->sk; - tp = sock2tcp_opt(sk); + sk = conn->ksnc_sock->sk; + tp = tcp_sk(sk); lock_sock (sk); nonagle = tp->nonagle; @@ -785,16 +1003,16 @@ ksocknal_data_ready (struct sock *sk, int n) /* interleave correctly with closing sockets... */ LASSERT(!in_irq()); - read_lock (&ksocknal_data.ksnd_global_lock); + read_lock(&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; if (conn == NULL) { /* raced with ksocknal_terminate_conn */ LASSERT (sk->sk_data_ready != &ksocknal_data_ready); sk->sk_data_ready (sk, n); } else - ksocknal_read_callback(conn); + ksocknal_read_callback(conn); - read_unlock (&ksocknal_data.ksnd_global_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); EXIT; } @@ -808,7 +1026,7 @@ ksocknal_write_space (struct sock *sk) /* interleave correctly with closing sockets... */ LASSERT(!in_irq()); - read_lock (&ksocknal_data.ksnd_global_lock); + read_lock(&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; wspace = SOCKNAL_WSPACE(sk); @@ -820,60 +1038,86 @@ ksocknal_write_space (struct sock *sk) " ready" : " blocked"), (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? " scheduled" : " idle"), - (conn == NULL) ? "" : (list_empty (&conn->ksnc_tx_queue) ? + (conn == NULL) ? "" : (cfs_list_empty (&conn->ksnc_tx_queue) ? " empty" : " queued")); if (conn == NULL) { /* raced with ksocknal_terminate_conn */ LASSERT (sk->sk_write_space != &ksocknal_write_space); sk->sk_write_space (sk); - read_unlock (&ksocknal_data.ksnd_global_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); return; } if (wspace >= min_wpace) { /* got enough space */ - ksocknal_write_callback(conn); + ksocknal_write_callback(conn); - /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the - * ENOMEM check in ksocknal_transmit is race-free (think about - * it). */ + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). */ clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); } - read_unlock (&ksocknal_data.ksnd_global_lock); + read_unlock(&ksocknal_data.ksnd_global_lock); } void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) { - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; } void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) { - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = ksocknal_data_ready; - sock->sk->sk_write_space = ksocknal_write_space; - return; + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; } void ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) { - /* Remove conn's network callbacks. - * NB I _have_ to restore the callback, rather than storing a noop, - * since the socket could survive past this module being unloaded!! */ - sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - sock->sk->sk_write_space = conn->ksnc_saved_write_space; - - /* A callback could be in progress already; they hold a read lock - * on ksnd_global_lock (to serialise with me) and NOOP if - * sk_user_data is NULL. */ - sock->sk->sk_user_data = NULL; - - return ; + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. */ + sock->sk->sk_user_data = NULL; + + return ; } +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + spin_lock_bh(&sched->kss_lock); + + if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + spin_unlock_bh(&sched->kss_lock); + + return rc; +}