X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_lib-linux.c;h=5b0a9e9083928429ad26897291cb5f68c39b266c;hp=48a813eb5e8b1527ff1cd731470a99c818490577;hb=f160d81f0adcb46cc0bda256e703aea37c253323;hpb=8dc6467987487a6e656bdcf05e9d4c7b9fcc63ad diff --git a/lnet/klnds/socklnd/socklnd_lib-linux.c b/lnet/klnds/socklnd/socklnd_lib-linux.c index 48a813e..5b0a9e9 100644 --- a/lnet/klnds/socklnd/socklnd_lib-linux.c +++ b/lnet/klnds/socklnd/socklnd_lib-linux.c @@ -1,74 +1,359 @@ -#include "socknal.h" - -#ifdef CONFIG_SYSCTL -#define SOCKNAL_SYSCTL 200 - -#define SOCKNAL_SYSCTL_TIMEOUT 1 -#define SOCKNAL_SYSCTL_EAGER_ACK 2 -#define SOCKNAL_SYSCTL_ZERO_COPY 3 -#define SOCKNAL_SYSCTL_TYPED 4 -#define SOCKNAL_SYSCTL_MIN_BULK 5 -#define SOCKNAL_SYSCTL_BUFFER_SIZE 6 -#define SOCKNAL_SYSCTL_NAGLE 7 -#define SOCKNAL_SYSCTL_IRQ_AFFINITY 8 -#define SOCKNAL_SYSCTL_KEEPALIVE_IDLE 9 -#define SOCKNAL_SYSCTL_KEEPALIVE_COUNT 10 -#define SOCKNAL_SYSCTL_KEEPALIVE_INTVL 11 - -static ctl_table ksocknal_ctl_table[] = { - {SOCKNAL_SYSCTL_TIMEOUT, "timeout", - &ksocknal_tunables.ksnd_io_timeout, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", - &ksocknal_tunables.ksnd_eager_ack, sizeof (int), - 0644, NULL, &proc_dointvec}, -#if SOCKNAL_ZC - {SOCKNAL_SYSCTL_ZERO_COPY, "zero_copy", - &ksocknal_tunables.ksnd_zc_min_frag, sizeof (int), - 0644, NULL, &proc_dointvec}, +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#include "socklnd.h" + +# if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM + +#ifndef HAVE_SYSCTL_UNNUMBERED + +enum { + SOCKLND_TIMEOUT = 1, + SOCKLND_CREDITS, + SOCKLND_PEER_CREDITS, + SOCKLND_NCONNDS, + SOCKLND_RECONNECTS_MIN, + SOCKLND_RECONNECTS_MAX, + SOCKLND_EAGER_ACK, + SOCKLND_ZERO_COPY, + SOCKLND_TYPED, + SOCKLND_BULK_MIN, + SOCKLND_RX_BUFFER_SIZE, + SOCKLND_TX_BUFFER_SIZE, + SOCKLND_NAGLE, + SOCKLND_IRQ_AFFINITY, + SOCKLND_KEEPALIVE_IDLE, + SOCKLND_KEEPALIVE_COUNT, + SOCKLND_KEEPALIVE_INTVL, + SOCKLND_BACKOFF_INIT, + SOCKLND_BACKOFF_MAX, + SOCKLND_PROTOCOL, + SOCKLND_ZERO_COPY_RECV, + SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS +}; +#else + +#define SOCKLND_TIMEOUT CTL_UNNUMBERED +#define SOCKLND_CREDITS CTL_UNNUMBERED +#define SOCKLND_PEER_CREDITS CTL_UNNUMBERED +#define SOCKLND_NCONNDS CTL_UNNUMBERED +#define SOCKLND_RECONNECTS_MIN CTL_UNNUMBERED +#define SOCKLND_RECONNECTS_MAX CTL_UNNUMBERED +#define SOCKLND_EAGER_ACK CTL_UNNUMBERED +#define SOCKLND_ZERO_COPY CTL_UNNUMBERED +#define SOCKLND_TYPED CTL_UNNUMBERED 
+#define SOCKLND_BULK_MIN CTL_UNNUMBERED +#define SOCKLND_RX_BUFFER_SIZE CTL_UNNUMBERED +#define SOCKLND_TX_BUFFER_SIZE CTL_UNNUMBERED +#define SOCKLND_NAGLE CTL_UNNUMBERED +#define SOCKLND_IRQ_AFFINITY CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE_IDLE CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE_COUNT CTL_UNNUMBERED +#define SOCKLND_KEEPALIVE_INTVL CTL_UNNUMBERED +#define SOCKLND_BACKOFF_INIT CTL_UNNUMBERED +#define SOCKLND_BACKOFF_MAX CTL_UNNUMBERED +#define SOCKLND_PROTOCOL CTL_UNNUMBERED +#define SOCKLND_ZERO_COPY_RECV CTL_UNNUMBERED +#define SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS CTL_UNNUMBERED #endif - {SOCKNAL_SYSCTL_TYPED, "typed", - &ksocknal_tunables.ksnd_typed_conns, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_MIN_BULK, "min_bulk", - &ksocknal_tunables.ksnd_min_bulk, sizeof (int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_BUFFER_SIZE, "buffer_size", - &ksocknal_tunables.ksnd_buffer_size, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_NAGLE, "nagle", - &ksocknal_tunables.ksnd_nagle, sizeof(int), - 0644, NULL, &proc_dointvec}, -#if CPU_AFFINITY - {SOCKNAL_SYSCTL_IRQ_AFFINITY, "irq_affinity", - &ksocknal_tunables.ksnd_irq_affinity, sizeof(int), - 0644, NULL, &proc_dointvec}, + +static cfs_sysctl_table_t ksocknal_ctl_table[] = { + { + .ctl_name = SOCKLND_TIMEOUT, + .procname = "timeout", + .data = &ksocknal_tunables.ksnd_timeout, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_CREDITS, + .procname = "credits", + .data = &ksocknal_tunables.ksnd_credits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_PEER_CREDITS, + .procname = "peer_credits", + .data = &ksocknal_tunables.ksnd_peercredits, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_NCONNDS, + .procname = 
"nconnds", + .data = &ksocknal_tunables.ksnd_nconnds, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RECONNECTS_MIN, + .procname = "min_reconnectms", + .data = &ksocknal_tunables.ksnd_min_reconnectms, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_RECONNECTS_MAX, + .procname = "max_reconnectms", + .data = &ksocknal_tunables.ksnd_max_reconnectms, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_EAGER_ACK, + .procname = "eager_ack", + .data = &ksocknal_tunables.ksnd_eager_ack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ZERO_COPY, + .procname = "zero_copy", + .data = &ksocknal_tunables.ksnd_zc_min_frag, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_ZERO_COPY_RECV, + .procname = "zero_copy_recv", + .data = &ksocknal_tunables.ksnd_zc_recv, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + + { + .ctl_name = SOCKLND_ZERO_COPY_RECV_MIN_NFRAGS, + .procname = "zero_copy_recv_min_nfrags", + .data = &ksocknal_tunables.ksnd_zc_recv_min_nfrags, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_TYPED, + .procname = "typed", + .data = &ksocknal_tunables.ksnd_typed_conns, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_BULK_MIN, + .procname = "min_bulk", + .data = &ksocknal_tunables.ksnd_min_bulk, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + 
.ctl_name = SOCKLND_RX_BUFFER_SIZE, + .procname = "rx_buffer_size", + .data = &ksocknal_tunables.ksnd_rx_buffer_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_TX_BUFFER_SIZE, + .procname = "tx_buffer_size", + .data = &ksocknal_tunables.ksnd_tx_buffer_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_NAGLE, + .procname = "nagle", + .data = &ksocknal_tunables.ksnd_nagle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#ifdef CPU_AFFINITY + { + .ctl_name = SOCKLND_IRQ_AFFINITY, + .procname = "irq_affinity", + .data = &ksocknal_tunables.ksnd_irq_affinity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, #endif - {SOCKNAL_SYSCTL_KEEPALIVE_IDLE, "keepalive_idle", - &ksocknal_tunables.ksnd_keepalive_idle, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_COUNT, "keepalive_count", - &ksocknal_tunables.ksnd_keepalive_count, sizeof(int), - 0644, NULL, &proc_dointvec}, - {SOCKNAL_SYSCTL_KEEPALIVE_INTVL, "keepalive_intvl", - &ksocknal_tunables.ksnd_keepalive_intvl, sizeof(int), - 0644, NULL, &proc_dointvec}, - { 0 } + { + .ctl_name = SOCKLND_KEEPALIVE_IDLE, + .procname = "keepalive_idle", + .data = &ksocknal_tunables.ksnd_keepalive_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_COUNT, + .procname = "keepalive_count", + .data = &ksocknal_tunables.ksnd_keepalive_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_KEEPALIVE_INTVL, + .procname = "keepalive_intvl", + .data = &ksocknal_tunables.ksnd_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#ifdef SOCKNAL_BACKOFF + { + .ctl_name = SOCKLND_BACKOFF_INIT, + .procname = "backoff_init", + .data = &ksocknal_tunables.ksnd_backoff_init, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + { + .ctl_name = SOCKLND_BACKOFF_MAX, + .procname = "backoff_max", + .data = &ksocknal_tunables.ksnd_backoff_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#endif +#if SOCKNAL_VERSION_DEBUG + { + .ctl_name = SOCKLND_PROTOCOL, + .procname = "protocol", + .data = &ksocknal_tunables.ksnd_protocol, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, +#endif + {0} }; -ctl_table ksocknal_top_ctl_table[] = { - {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + +cfs_sysctl_table_t ksocknal_top_ctl_table[] = { + { + .ctl_name = CTL_SOCKLND, + .procname = "socknal", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = ksocknal_ctl_table + }, { 0 } }; -#endif + +int +ksocknal_lib_tunables_init () +{ + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags < 2) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = 2; + if (*ksocknal_tunables.ksnd_zc_recv_min_nfrags > LNET_MAX_IOV) + *ksocknal_tunables.ksnd_zc_recv_min_nfrags = LNET_MAX_IOV; + + ksocknal_tunables.ksnd_sysctl = + cfs_register_sysctl_table(ksocknal_top_ctl_table, 0); + + if (ksocknal_tunables.ksnd_sysctl == NULL) + CWARN("Can't setup /proc tunables\n"); + + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ + if (ksocknal_tunables.ksnd_sysctl != NULL) + cfs_unregister_sysctl_table(ksocknal_tunables.ksnd_sysctl); +} +#else +int +ksocknal_lib_tunables_init () +{ + return 0; +} + +void +ksocknal_lib_tunables_fini () +{ +} +#endif /* # if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM */ void ksocknal_lib_bind_irq (unsigned int irq) { -#if (defined(CONFIG_SMP) && CPU_AFFINITY) 
+#if (defined(CONFIG_SMP) && defined(CPU_AFFINITY)) int bind; int cpu; - unsigned long flags; char cmdline[64]; ksock_irqinfo_t *info; char *argv[] = {"/bin/sh", @@ -85,13 +370,13 @@ ksocknal_lib_bind_irq (unsigned int irq) info = &ksocknal_data.ksnd_irqinfo[irq]; - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_lock_bh (&ksocknal_data.ksnd_global_lock); LASSERT (info->ksni_valid); bind = !info->ksni_bound; info->ksni_bound = 1; - write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + cfs_write_unlock_bh (&ksocknal_data.ksnd_global_lock); if (!bind) /* bound already */ return; @@ -100,8 +385,8 @@ ksocknal_lib_bind_irq (unsigned int irq) snprintf (cmdline, sizeof (cmdline), "echo %d > /proc/irq/%u/smp_affinity", 1 << cpu, irq); - printk (KERN_INFO "Lustre: Binding irq %u to CPU %d with cmd: %s\n", - irq, cpu, cmdline); + LCONSOLE_INFO("Binding irq %u to CPU %d with cmd: %s\n", + irq, cpu, cmdline); /* FIXME: Find a better method of setting IRQ affinity... */ @@ -113,12 +398,10 @@ ksocknal_lib_bind_irq (unsigned int irq) int ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) { - struct sockaddr_in sin; - int len = sizeof (sin); - int rc; + int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1, + &conn->ksnc_ipaddr, + &conn->ksnc_port); - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 2); /* Didn't need the {get,put}connsock dance to deref ksnc_sock... 
*/ LASSERT (!conn->ksnc_closing); @@ -127,18 +410,13 @@ ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) return rc; } - conn->ksnc_ipaddr = ntohl (sin.sin_addr.s_addr); - conn->ksnc_port = ntohs (sin.sin_port); - - rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, - (struct sockaddr *)&sin, &len, 0); + rc = libcfs_sock_getaddr(conn->ksnc_sock, 0, + &conn->ksnc_myipaddr, NULL); if (rc != 0) { CERROR ("Error %d getting sock local IP\n", rc); return rc; } - conn->ksnc_myipaddr = ntohl (sin.sin_addr.s_addr); - return 0; } @@ -146,9 +424,10 @@ unsigned int ksocknal_lib_sock_irq (struct socket *sock) { int irq = 0; +#ifdef CPU_AFFINITY struct dst_entry *dst; - if (!ksocknal_tunables.ksnd_irq_affinity) + if (!*ksocknal_tunables.ksnd_irq_affinity) return 0; dst = sk_dst_get (sock->sk); @@ -163,76 +442,45 @@ ksocknal_lib_sock_irq (struct socket *sock) dst_release (dst); } - return (irq); +#endif + return irq; } -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) -static struct page * -ksocknal_kvaddr_to_page (unsigned long vaddr) +int +ksocknal_lib_zc_capable(struct socket *sock) { - struct page *page; - - if (vaddr >= VMALLOC_START && - vaddr < VMALLOC_END) - page = vmalloc_to_page ((void *)vaddr); -#if CONFIG_HIGHMEM - else if (vaddr >= PKMAP_BASE && - vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) - page = vmalloc_to_page ((void *)vaddr); - /* in 2.4 ^ just walks the page tables */ -#endif - else - page = virt_to_page (vaddr); + int caps = sock->sk->sk_route_caps; - if (page == NULL || - !VALID_PAGE (page)) - return (NULL); - - return (page); + /* ZC if the socket supports scatter/gather and doesn't need software + * checksums */ + return ((caps & NETIF_F_SG) != 0 && + (caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) != 0); } -#endif int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) { struct socket *sock = conn->ksnc_sock; -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) - unsigned long vaddr = (unsigned long)iov->iov_base - int offset = vaddr & (PAGE_SIZE - 1); - 
int zcsize = MIN (iov->iov_len, PAGE_SIZE - offset); - struct page *page; -#endif int nob; int rc; + if (*ksocknal_tunables.ksnd_enable_csum && /* checksum enabled */ + conn->ksnc_proto == &ksocknal_protocol_v2x && /* V2.x connection */ + tx->tx_nob == tx->tx_resid && /* first sending */ + tx->tx_msg.ksm_csum == 0) /* not checksummed */ + ksocknal_lib_csum_tx(tx); + /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ -#if (SOCKNAL_ZC && SOCKNAL_VADDR_ZC) - if (zcsize >= ksocknal_data.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { - int msgflg = MSG_DONTWAIT; - - CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", - (void *)vaddr, page, page_address(page), offset, zcsize); - - if (!list_empty (&conn->ksnc_tx_queue) || - zcsize < tx->tx_resid) - msgflg |= MSG_MORE; - - rc = tcp_sendpage_zccd(sock, page, offset, zcsize, msgflg, &tx->tx_zccd); - } else -#endif { #if SOCKNAL_SINGLE_FRAG_TX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_niov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_niov; #endif struct msghdr msg = { .msg_name = NULL, @@ -259,24 +507,24 @@ ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) rc = sock_sendmsg(sock, &msg, nob); set_fs (oldmm); } - return rc; + return rc; } int ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) { struct socket *sock = conn->ksnc_sock; - ptl_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int rc; int nob; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. 
*/ -#if SOCKNAL_ZC - if (kiov->kiov_len >= ksocknal_tunables.ksnd_zc_min_frag && - (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { + if (kiov->kiov_len >= *ksocknal_tunables.ksnd_zc_min_frag && + tx->tx_msg.ksm_zc_req_cookie != 0) { + /* Zero copy is enabled */ + struct sock *sk = sock->sk; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; int fragsize = kiov->kiov_len; @@ -289,21 +537,23 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) fragsize < tx->tx_resid) msgflg |= MSG_MORE; - rc = tcp_sendpage_zccd(sock, page, offset, fragsize, msgflg, - &tx->tx_zccd); - } else -#endif - { + if (sk->sk_prot->sendpage != NULL) { + rc = sk->sk_prot->sendpage(sk, page, + offset, fragsize, msgflg); + } else { + rc = tcp_sendpage(sock, page, offset, fragsize, msgflg); + } + } else { #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." 
#endif - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_nkiov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; #endif struct msghdr msg = { .msg_name = NULL, @@ -325,7 +575,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) if (!list_empty(&conn->ksnc_tx_queue) || nob < tx->tx_resid) - msg.msg_flags |= MSG_DONTWAIT; + msg.msg_flags |= MSG_MORE; set_fs (KERNEL_DS); rc = sock_sendmsg(sock, &msg, nob); @@ -334,7 +584,7 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) for (i = 0; i < niov; i++) kunmap(kiov[i].kiov_page); } - return rc; + return rc; } void @@ -361,10 +611,10 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_niov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; struct msghdr msg = { @@ -380,6 +630,9 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) int nob; int i; int rc; + int fragnob; + int sum; + __u32 saved_csum; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. 
*/ @@ -396,29 +649,96 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) /* NB this is just a boolean..........................^ */ set_fs (oldmm); - return rc; + saved_csum = 0; + if (conn->ksnc_proto == &ksocknal_protocol_v2x) { + saved_csum = conn->ksnc_msg.ksm_csum; + conn->ksnc_msg.ksm_csum = 0; + } + + if (saved_csum != 0) { + /* accumulate checksum */ + for (i = 0, sum = rc; sum > 0; i++, sum -= fragnob) { + LASSERT (i < niov); + + fragnob = iov[i].iov_len; + if (fragnob > sum) + fragnob = sum; + + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + iov[i].iov_base, fragnob); + } + conn->ksnc_msg.ksm_csum = saved_csum; + } + + return rc; +} + +static void +ksocknal_lib_kiov_vunmap(void *addr) +{ + if (addr == NULL) + return; + + vunmap(addr); +} + +static void * +ksocknal_lib_kiov_vmap(lnet_kiov_t *kiov, int niov, + struct iovec *iov, struct page **pages) +{ + void *addr; + int nob; + int i; + + if (!*ksocknal_tunables.ksnd_zc_recv || pages == NULL) + return NULL; + + LASSERT (niov <= LNET_MAX_IOV); + + if (niov < 2 || + niov < *ksocknal_tunables.ksnd_zc_recv_min_nfrags) + return NULL; + + for (nob = i = 0; i < niov; i++) { + if ((kiov[i].kiov_offset != 0 && i > 0) || + (kiov[i].kiov_offset + kiov[i].kiov_len != CFS_PAGE_SIZE && i < niov - 1)) + return NULL; + + pages[i] = kiov[i].kiov_page; + nob += kiov[i].kiov_len; + } + + addr = vmap(pages, niov, VM_MAP, PAGE_KERNEL); + if (addr == NULL) + return NULL; + + iov->iov_base = addr + kiov[0].kiov_offset; + iov->iov_len = nob; + + return addr; } int ksocknal_lib_recv_kiov (ksock_conn_t *conn) { #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK - struct iovec scratch; - struct iovec *scratchiov = &scratch; - int niov = 1; + struct iovec scratch; + struct iovec *scratchiov = &scratch; + struct page **pages = NULL; + unsigned int niov = 1; #else #ifdef CONFIG_HIGHMEM #warning "XXX risk of kmap deadlock on multiple frags..." 
#endif - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_nkiov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + struct page **pages = conn->ksnc_scheduler->kss_rx_scratch_pgs; + unsigned int niov = conn->ksnc_rx_nkiov; #endif - ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; struct msghdr msg = { .msg_name = NULL, .msg_namelen = 0, .msg_iov = scratchiov, - .msg_iovlen = niov, .msg_control = NULL, .msg_controllen = 0, .msg_flags = 0 @@ -427,13 +747,26 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) int nob; int i; int rc; + void *base; + void *addr; + int sum; + int fragnob; /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ - for (nob = i = 0; i < niov; i++) { - scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; - nob += scratchiov[i].iov_len = kiov[i].kiov_len; + if ((addr = ksocknal_lib_kiov_vmap(kiov, niov, scratchiov, pages)) != NULL) { + nob = scratchiov[0].iov_len; + msg.msg_iovlen = 1; + + } else { + for (nob = i = 0; i < niov; i++) { + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + scratchiov[i].iov_base = kmap(kiov[i].kiov_page) + + kiov[i].kiov_offset; + } + msg.msg_iovlen = niov; } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); set_fs (KERNEL_DS); @@ -441,88 +774,73 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) /* NB this is just a boolean.......................^ */ set_fs (oldmm); - for (i = 0; i < niov; i++) - kunmap(kiov[i].kiov_page); - - return (rc); -} - -int -ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - mm_segment_t oldmm = get_fs(); - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; + if (conn->ksnc_msg.ksm_csum != 0) { + for (i = 0, sum = rc; sum > 0; i++, sum 
-= fragnob) { + LASSERT (i < niov); - set_fs (KERNEL_DS); - rc = sock_sendmsg (sock, &msg, iov.iov_len); - set_fs (oldmm); + /* Dang! have to kmap again because I have nowhere to stash the + * mapped address. But by doing it while the page is still + * mapped, the kernel just bumps the map count and returns me + * the address it stashed. */ + base = kmap(kiov[i].kiov_page) + kiov[i].kiov_offset; + fragnob = kiov[i].kiov_len; + if (fragnob > sum) + fragnob = sum; - if (rc < 0) - return (rc); + conn->ksnc_rx_csum = ksocknal_csum(conn->ksnc_rx_csum, + base, fragnob); - if (rc == 0) { - CERROR ("Unexpected zero rc\n"); - return (-ECONNABORTED); + kunmap(kiov[i].kiov_page); } + } - buffer = ((char *)buffer) + rc; - nob -= rc; + if (addr != NULL) { + ksocknal_lib_kiov_vunmap(addr); + } else { + for (i = 0; i < niov; i++) + kunmap(kiov[i].kiov_page); } - return (0); + return (rc); } -int -ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob) +void +ksocknal_lib_csum_tx(ksock_tx_t *tx) { - int rc; - mm_segment_t oldmm = get_fs(); + int i; + __u32 csum; + void *base; - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct msghdr msg = { - .msg_name = NULL, - .msg_namelen = 0, - .msg_iov = &iov, - .msg_iovlen = 1, - .msg_control = NULL, - .msg_controllen = 0, - .msg_flags = 0 - }; + LASSERT(tx->tx_iov[0].iov_base == (void *)&tx->tx_msg); + LASSERT(tx->tx_conn != NULL); + LASSERT(tx->tx_conn->ksnc_proto == &ksocknal_protocol_v2x); - set_fs (KERNEL_DS); - rc = sock_recvmsg (sock, &msg, iov.iov_len, 0); - set_fs (oldmm); + tx->tx_msg.ksm_csum = 0; - if (rc < 0) - return (rc); + csum = ksocknal_csum(~0, (void *)tx->tx_iov[0].iov_base, + tx->tx_iov[0].iov_len); + + if (tx->tx_kiov != NULL) { + for (i = 0; i < tx->tx_nkiov; i++) { + base = kmap(tx->tx_kiov[i].kiov_page) + + tx->tx_kiov[i].kiov_offset; + + csum = ksocknal_csum(csum, base, tx->tx_kiov[i].kiov_len); - if (rc == 0) - return (-ECONNABORTED); + 
kunmap(tx->tx_kiov[i].kiov_page); + } + } else { + for (i = 1; i < tx->tx_niov; i++) + csum = ksocknal_csum(csum, tx->tx_iov[i].iov_base, + tx->tx_iov[i].iov_len); + } - buffer = ((char *)buffer) + rc; - nob -= rc; + if (*ksocknal_tunables.ksnd_inject_csum_error) { + csum++; + *ksocknal_tunables.ksnd_inject_csum_error = 0; } - return (0); + tx->tx_msg.ksm_csum = csum; } int @@ -533,31 +851,23 @@ ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int int len; int rc; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) { LASSERT (conn->ksnc_closing); *txmem = *rxmem = *nagle = 0; return (-ESHUTDOWN); } - set_fs (KERNEL_DS); - - len = sizeof(*txmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)txmem, &len); - if (rc == 0) { - len = sizeof(*rxmem); - rc = sock_getsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)rxmem, &len); - } + rc = libcfs_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { len = sizeof(*nagle); + set_fs(KERNEL_DS); rc = sock->ops->getsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)nagle, &len); + set_fs(oldmm); } - set_fs (oldmm); - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); if (rc == 0) *nagle = !*nagle; @@ -606,7 +916,7 @@ ksocknal_lib_setup_sock (struct socket *sock) return (rc); } - if (!ksocknal_tunables.ksnd_nagle) { + if (!*ksocknal_tunables.ksnd_nagle) { option = 1; set_fs (KERNEL_DS); @@ -619,34 +929,57 @@ ksocknal_lib_setup_sock (struct socket *sock) } } - if (ksocknal_tunables.ksnd_buffer_size > 0) { - option = ksocknal_tunables.ksnd_buffer_size; + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + +/* TCP_BACKOFF_* sockopt tunables unsupported in stock kernels */ +#ifdef SOCKNAL_BACKOFF + if 
(*ksocknal_tunables.ksnd_backoff_init > 0) { + option = *ksocknal_tunables.ksnd_backoff_init; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_INIT, + (char *)&option, sizeof (option)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", + CERROR ("Can't set initial tcp backoff %d: %d\n", option, rc); return (rc); } + } + + if (*ksocknal_tunables.ksnd_backoff_max > 0) { + option = *ksocknal_tunables.ksnd_backoff_max; +#ifdef SOCKNAL_BACKOFF_MS + option *= 1000; +#endif set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof (option)); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_BACKOFF_MAX, + (char *)&option, sizeof (option)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", + CERROR ("Can't set maximum tcp backoff %d: %d\n", option, rc); return (rc); } } +#endif /* snapshot tunables */ - keep_idle = ksocknal_tunables.ksnd_keepalive_idle; - keep_count = ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl; + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); @@ -693,135 +1026,13 @@ ksocknal_lib_setup_sock (struct socket *sock) return (0); } -int -ksocknal_lib_connect_sock(struct socket **sockp, int *may_retry, - ksock_route_t *route, int local_port) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct socket *sock; - int rc; - int option; - mm_segment_t oldmm = get_fs(); - struct timeval tv; - - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(local_port); - locaddr.sin_addr.s_addr = - (route->ksnr_myipaddr != 0) ? 
htonl(route->ksnr_myipaddr) - : INADDR_ANY; - - memset (&srvaddr, 0, sizeof (srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (route->ksnr_port); - srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - *may_retry = 0; - - rc = sock_create (PF_INET, SOCK_STREAM, 0, &sock); - *sockp = sock; - if (rc != 0) { - CERROR ("Can't create autoconnect socket: %d\n", rc); - return (rc); - } - - /* Ugh; have to map_fd for compatibility with sockets passed in - * from userspace. And we actually need the sock->file refcounting - * that this gives you :) */ - - rc = sock_map_fd (sock); - if (rc < 0) { - sock_release (sock); - CERROR ("sock_map_fd error %d\n", rc); - return (rc); - } - - /* NB the file descriptor (rc) now owns the ref on sock->file */ - LASSERT (sock->file != NULL); - LASSERT (file_count(sock->file) == 1); - - get_file(sock->file); /* extra ref makes sock->file */ - sys_close(rc); /* survive this close */ - - /* Still got a single ref on sock->file */ - LASSERT (file_count(sock->file) == 1); - - /* Set the socket timeouts, so our connection attempt completes in - * finite time */ - tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; - tv.tv_usec = 0; - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set send timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto failed; - } - - set_fs (KERNEL_DS); - rc = sock_setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof (tv)); - set_fs (oldmm); - if (rc != 0) { - CERROR ("Can't set receive timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto failed; - } - - set_fs (KERNEL_DS); - option = 1; - rc = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof (option)); - set_fs (oldmm); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; - } - - rc = sock->ops->bind(sock, - (struct sockaddr *)&locaddr, 
sizeof(locaddr)); - if (rc == -EADDRINUSE) { - CDEBUG(D_NET, "Port %d already in use\n", local_port); - *may_retry = 1; - goto failed; - } - if (rc != 0) { - CERROR("Error trying to bind to reserved port %d: %d\n", - local_port, rc); - goto failed; - } - - rc = sock->ops->connect(sock, - (struct sockaddr *)&srvaddr, sizeof(srvaddr), - sock->file->f_flags); - if (rc == 0) - return 0; - - /* EADDRNOTAVAIL probably means we're already connected to the same - * peer/port on the same local port on a differently typed - * connection. Let our caller retry with a different local - * port... */ - *may_retry = (rc == -EADDRNOTAVAIL); - - CDEBUG(*may_retry ? D_NET : D_ERROR, - "Error %d connecting %u.%u.%u.%u/%d -> %u.%u.%u.%u/%d\n", rc, - HIPQUAD(route->ksnr_myipaddr), local_port, - HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); - - failed: - fput(sock->file); - return rc; -} - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct tcp_opt *sock2tcp_opt(struct sock *sk) { return &(sk->tp_pinfo.af_tcp); } +#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10)) +#define sock2tcp_opt(sk) tcp_sk(sk) #else struct tcp_opt *sock2tcp_opt(struct sock *sk) { @@ -834,13 +1045,17 @@ void ksocknal_lib_push_conn (ksock_conn_t *conn) { struct sock *sk; +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11)) struct tcp_opt *tp; +#else + struct tcp_sock *tp; +#endif int nonagle; int val = 1; int rc; mm_segment_t oldmm; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ return; @@ -865,7 +1080,7 @@ ksocknal_lib_push_conn (ksock_conn_t *conn) tp->nonagle = nonagle; release_sock (sk); - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); } extern void ksocknal_read_callback (ksock_conn_t *conn); @@ -880,36 +1095,38 @@ ksocknal_data_ready (struct sock *sk, int n) ENTRY; /* interleave correctly with closing sockets... 
*/ - read_lock (&ksocknal_data.ksnd_global_lock); + LASSERT(!in_irq()); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; if (conn == NULL) { /* raced with ksocknal_terminate_conn */ LASSERT (sk->sk_data_ready != &ksocknal_data_ready); sk->sk_data_ready (sk, n); } else - ksocknal_read_callback(conn); + ksocknal_read_callback(conn); - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; } -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,6,7)) -#define tcp_wspace(sk) sk_stream_wspace(sk) -#endif - static void ksocknal_write_space (struct sock *sk) { ksock_conn_t *conn; + int wspace; + int min_wpace; /* interleave correctly with closing sockets... */ - read_lock (&ksocknal_data.ksnd_global_lock); + LASSERT(!in_irq()); + cfs_read_lock (&ksocknal_data.ksnd_global_lock); conn = sk->sk_user_data; + wspace = SOCKNAL_WSPACE(sk); + min_wpace = SOCKNAL_MIN_WSPACE(sk); CDEBUG(D_NET, "sk %p wspace %d low water %d conn %p%s%s%s\n", - sk, tcp_wspace(sk), SOCKNAL_TX_LOW_WATER(sk), conn, + sk, wspace, min_wpace, conn, (conn == NULL) ? "" : (conn->ksnc_tx_ready ? " ready" : " blocked"), (conn == NULL) ? "" : (conn->ksnc_tx_scheduled ? @@ -921,61 +1138,113 @@ ksocknal_write_space (struct sock *sk) LASSERT (sk->sk_write_space != &ksocknal_write_space); sk->sk_write_space (sk); - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); return; } - if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ - ksocknal_write_callback(conn); + if (wspace >= min_wpace) { /* got enough space */ + ksocknal_write_callback(conn); - /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the - * ENOMEM check in ksocknal_transmit is race-free (think about - * it). */ + /* Clear SOCK_NOSPACE _after_ ksocknal_write_callback so the + * ENOMEM check in ksocknal_transmit is race-free (think about + * it). 
*/ clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); } - read_unlock (&ksocknal_data.ksnd_global_lock); + cfs_read_unlock (&ksocknal_data.ksnd_global_lock); } void ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn) { - conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; - conn->ksnc_saved_write_space = sock->sk->sk_write_space; + conn->ksnc_saved_data_ready = sock->sk->sk_data_ready; + conn->ksnc_saved_write_space = sock->sk->sk_write_space; } void ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) { - sock->sk->sk_user_data = conn; - sock->sk->sk_data_ready = ksocknal_data_ready; - sock->sk->sk_write_space = ksocknal_write_space; - return; + sock->sk->sk_user_data = conn; + sock->sk->sk_data_ready = ksocknal_data_ready; + sock->sk->sk_write_space = ksocknal_write_space; + return; } void -ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn) +ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) { - ksocknal_data_ready (sock->sk, 0); - ksocknal_write_space (sock->sk); - return; + /* Remove conn's network callbacks. + * NB I _have_ to restore the callback, rather than storing a noop, + * since the socket could survive past this module being unloaded!! */ + sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; + sock->sk->sk_write_space = conn->ksnc_saved_write_space; + + /* A callback could be in progress already; they hold a read lock + * on ksnd_global_lock (to serialise with me) and NOOP if + * sk_user_data is NULL. 
*/ + sock->sk->sk_user_data = NULL; + + return ; } -void -ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) +int +ksocknal_lib_memory_pressure(ksock_conn_t *conn) +{ + int rc = 0; + ksock_sched_t *sched; + + sched = conn->ksnc_scheduler; + cfs_spin_lock_bh (&sched->kss_lock); + + if (!SOCK_TEST_NOSPACE(conn->ksnc_sock) && + !conn->ksnc_tx_ready) { + /* SOCK_NOSPACE is set when the socket fills + * and cleared in the write_space callback + * (which also sets ksnc_tx_ready). If + * SOCK_NOSPACE and ksnc_tx_ready are BOTH + * zero, I didn't fill the socket and + * write_space won't reschedule me, so I + * return -ENOMEM to get my caller to retry + * after a timeout */ + rc = -ENOMEM; + } + + cfs_spin_unlock_bh (&sched->kss_lock); + + return rc; +} + +__u64 +ksocknal_lib_new_incarnation(void) { - /* Remove conn's network callbacks. - * NB I _have_ to restore the callback, rather than storing a noop, - * since the socket could survive past this module being unloaded!! */ - sock->sk->sk_data_ready = conn->ksnc_saved_data_ready; - sock->sk->sk_write_space = conn->ksnc_saved_write_space; - - /* A callback could be in progress already; they hold a read lock - * on ksnd_global_lock (to serialise with me) and NOOP if - * sk_user_data is NULL. */ - sock->sk->sk_user_data = NULL; - - return ; + struct timeval tv; + + /* The incarnation number is the time this module loaded and it + * identifies this particular instance of the socknal. Hopefully + * we won't be able to reboot more frequently than 1MHz for the + * forseeable future :) */ + + do_gettimeofday(&tv); + + return (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; } +int +ksocknal_lib_bind_thread_to_cpu(int id) +{ +#if defined(CONFIG_SMP) && defined(CPU_AFFINITY) + id = ksocknal_sched2cpu(id); + if (cpu_online(id)) { + cpumask_t m = CPU_MASK_NONE; + cpu_set(id, m); + set_cpus_allowed(current, m); + return 0; + } + + return -1; + +#else + return 0; +#endif +}