X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_lib-darwin.c;h=9148413701e09dbe7ada9619ef23b093ab585cea;hb=98060d83459ba10409f295898f0ec917f938b4d3;hp=ada5b640a1ce8ef559bb681bb64c5a2fb3fedb96;hpb=00f255b8c00dff66481a6ab22391869217b5d8af;p=fs%2Flustre-release.git

diff --git a/lnet/klnds/socklnd/socklnd_lib-darwin.c b/lnet/klnds/socklnd/socklnd_lib-darwin.c
index ada5b64..9148413 100644
--- a/lnet/klnds/socklnd/socklnd_lib-darwin.c
+++ b/lnet/klnds/socklnd/socklnd_lib-darwin.c
@@ -1,26 +1,44 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Phil Schwan
+/*
+ * GPL HEADER START
  *
- * This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- * Lustre is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- * Lustre is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
  * You should have received a copy of the GNU General Public License
- * along with Lustre; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd_lib-darwin.c
  *
  * Darwin porting library
  * Make things easy to port
+ *
+ * Author: Phil Schwan
  */
 #include
 #include
@@ -28,174 +46,494 @@
 #include
 #include
-#include "socknal.h"
+#include "socklnd.h"
-#if 0
-#undef SOCKNAL_SINGLE_FRAG_TX
-#define SOCKNAL_SINGLE_FRAG_TX 1
-#undef SOCKNAL_SINGLE_FRAG_RX
-#define SOCKNAL_SINGLE_FRAG_RX 1
-#endif
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
-SYSCTL_DECL(_portals);
+SYSCTL_DECL(_lnet);
-SYSCTL_NODE (_portals, OID_AUTO, ksocknal, CTLFLAG_RW,
-            0, "ksocknal_sysctl");
+SYSCTL_NODE (_lnet, OID_AUTO, ksocknal, CTLFLAG_RW,
+            0, "ksocknal_sysctl");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, timeout,
-           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_io_timeout,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, timeout,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_timeout,
            0, "timeout");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, eager_ack,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, credits,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_credits,
+           0, "credits");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, peer_credits,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_peertxcredits,
+           0, "peer_credits");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, nconnds,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nconnds,
+           0, "nconnds");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, min_reconnectms,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_reconnectms,
+           0, "min_reconnectms");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, max_reconnectms,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_max_reconnectms,
+           0, "max_reconnectms");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, eager_ack,
            CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_eager_ack,
            0, "eager_ack");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, typed,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, typed,
            CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_typed_conns,
            0, "typed");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, min_bulk,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, min_bulk,
            CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_bulk,
            0, "min_bulk");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, buffer_size,
-           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_buffer_size,
-           0, "buffer_size");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, nagle,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, rx_buffer_size,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_rx_buffer_size,
+           0, "rx_buffer_size");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, tx_buffer_size,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_tx_buffer_size,
+           0, "tx_buffer_size");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, nagle,
            CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nagle,
            0, "nagle");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_idle,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_idle,
+           0, "keepalive_idle");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_count,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_count,
+           0, "keepalive_count");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_intvl,
+           CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_intvl,
+           0, "keepalive_intvl");
 cfs_sysctl_table_t      ksocknal_top_ctl_table [] = {
-       &sysctl__portals_ksocknal,
-       &sysctl__portals_ksocknal_timeout,
-       &sysctl__portals_ksocknal_eager_ack,
-       &sysctl__portals_ksocknal_typed,
-       &sysctl__portals_ksocknal_min_bulk,
-       &sysctl__portals_ksocknal_buffer_size,
-       &sysctl__portals_ksocknal_nagle,
+       &sysctl__lnet_ksocknal,
+       &sysctl__lnet_ksocknal_timeout,
+       &sysctl__lnet_ksocknal_credits,
+       &sysctl__lnet_ksocknal_peer_credits,
+       &sysctl__lnet_ksocknal_nconnds,
+       &sysctl__lnet_ksocknal_min_reconnectms,
+       &sysctl__lnet_ksocknal_max_reconnectms,
+       &sysctl__lnet_ksocknal_eager_ack,
+       &sysctl__lnet_ksocknal_typed,
+       &sysctl__lnet_ksocknal_min_bulk,
+       &sysctl__lnet_ksocknal_rx_buffer_size,
+       &sysctl__lnet_ksocknal_tx_buffer_size,
+       &sysctl__lnet_ksocknal_nagle,
+       &sysctl__lnet_ksocknal_keepalive_idle,
+       &sysctl__lnet_ksocknal_keepalive_count,
+       &sysctl__lnet_ksocknal_keepalive_intvl,
        NULL
 };
-static unsigned long ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
+int
+ksocknal_lib_tunables_init ()
+{
+       ksocknal_tunables.ksnd_sysctl =
+               cfs_register_sysctl_table (ksocknal_top_ctl_table, 0);
+
+       if (ksocknal_tunables.ksnd_sysctl == NULL)
+               return -ENOMEM;
-struct socket *
-sockfd_lookup(int fd, void *foo)
+       return 0;
+}
+
+void
+ksocknal_lib_tunables_fini ()
 {
-       struct socket *so;
-       struct file *fp;
-       CFS_DECL_FUNNEL_DATA;
+       if (ksocknal_tunables.ksnd_sysctl != NULL)
+               cfs_unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);
+}
+#else
+int
+ksocknal_lib_tunables_init ()
+{
+       return 0;
+}
-       CFS_NET_IN;
-       getsock(current_proc()->p_fd, fd, &fp);
-       CFS_NET_EX;
-       so = (struct socket *)fp->f_data;
-       so->reserved4 = fp;
-       CFS_CONE_IN;
-       fref(fp);
-       CFS_CONE_EX;
-       return so;
+void
+ksocknal_lib_tunables_fini ()
+{
 }
+#endif
+
+/*
+ * To use a bigger buffer for the socket:
+ * 1. Increase nmbclusters (it cannot be increased by sysctl because it is
+ *    read-only, so the kernel must be patched).
+ * 2. Increase net.inet.tcp.reass.maxsegments
+ * 3. Increase net.inet.tcp.sendspace
+ * 4. Increase net.inet.tcp.recvspace
+ * 5. Increase kern.ipc.maxsockbuf
+ */
+#define KSOCKNAL_MAX_BUFFER        (1152*1024)
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+       int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+                                    &conn->ksnc_ipaddr,
+                                    &conn->ksnc_port);
-extern struct fileops socketops;
+       /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+       LASSERT (!conn->ksnc_closing);
+
+       if (rc != 0) {
+               CERROR ("Error %d getting sock peer IP\n", rc);
+               return rc;
+       }
-static int
-sock_map_fd (struct socket *so)
+       rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+                                &conn->ksnc_myipaddr, NULL);
+       if (rc != 0) {
+               CERROR ("Error %d getting sock local IP\n", rc);
+               return rc;
+       }
+
+       return 0;
+}
+
+#ifdef __DARWIN8__
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
 {
-       struct file *fp;
-       int fd;
-       CFS_DECL_FUNNEL_DATA;
-
-       CFS_CONE_IN;
-       falloc(current_proc(), &fp, &fd);
-       fp->f_flag = FREAD|FWRITE;
-       fp->f_type = DTYPE_SOCKET;
-       fp->f_ops = &socketops;
-       fp->f_data = (caddr_t)so;
-       so->reserved4 = fp;
-       *fdflags(current_proc(), fd) &= ~UF_RESERVED;
-       CFS_CONE_EX;
-
-       return fd;
+       socket_t sock = C2B_SOCK(conn->ksnc_sock);
+       size_t sndlen;
+       int nob;
+       int rc;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+       struct iovec scratch;
+       struct iovec *scratchiov = &scratch;
+       unsigned int niov = 1;
+#else
+       struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       unsigned int niov = tx->tx_niov;
+#endif
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_iovlen     = niov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = MSG_DONTWAIT
+       };
+
+       int i;
+
+       for (nob = i = 0; i < niov; i++) {
+               scratchiov[i] = tx->tx_iov[i];
+               nob += scratchiov[i].iov_len;
+       }
+
+       /*
+        * XXX Liang:
+        * Linux has MSG_MORE, do we have anything to
+        * reduce the number of partial TCP segments sent?
+        */
+       rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+       if (rc == 0)
+               rc = sndlen;
+       return rc;
 }
-static void
-sock_release(struct socket *so)
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
 {
-       struct file *fp;
-       CFS_DECL_FUNNEL_DATA;
-
-       fp = (struct file *)so->reserved4;
-       so->reserved4 = NULL;
-       fp->f_data = NULL;
-       CFS_CONE_IN;
-       frele(fp);
-       CFS_CONE_EX;
-       CFS_NET_IN;
-       soshutdown(so, 0);
-       CFS_NET_EX;
+       socket_t sock = C2B_SOCK(conn->ksnc_sock);
+       lnet_kiov_t *kiov = tx->tx_kiov;
+       int rc;
+       int nob;
+       size_t sndlen;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+       struct iovec scratch;
+       struct iovec *scratchiov = &scratch;
+       unsigned int niov = 1;
+#else
+       struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       unsigned int niov = tx->tx_nkiov;
+#endif
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_iovlen     = niov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = MSG_DONTWAIT
+       };
+
+       int i;
+
+       for (nob = i = 0; i < niov; i++) {
+               scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
+                                        kiov[i].kiov_offset;
+               nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+       }
+
+       /*
+        * XXX Liang:
+        * Linux has MSG_MORE, do we have anything to
+        * reduce the number of partial TCP segments sent?
+        */
+       rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+       for (i = 0; i < niov; i++)
+               cfs_kunmap(kiov[i].kiov_page);
+       if (rc == 0)
+               rc = sndlen;
+       return rc;
 }
-static void
-sock_fdrelse(int fd)
-{
-       CFS_DECL_FUNNEL_DATA;
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+       struct iovec scratch;
+       struct iovec *scratchiov = &scratch;
+       unsigned int niov = 1;
+#else
+       struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       unsigned int niov = conn->ksnc_rx_niov;
+#endif
+       struct iovec *iov = conn->ksnc_rx_iov;
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_iovlen     = niov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = 0
+       };
+       size_t rcvlen;
+       int nob;
+       int i;
+       int rc;
+
+       LASSERT (niov > 0);
+
+       for (nob = i = 0; i < niov; i++) {
+               scratchiov[i] = iov[i];
+               nob += scratchiov[i].iov_len;
+       }
+       LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+       rc = -sock_receive (C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen);
+       if (rc == 0)
+               rc = rcvlen;
+
+       return rc;
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+       struct iovec scratch;
+       struct iovec *scratchiov = &scratch;
+       unsigned int niov = 1;
+#else
+       struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov;
+       unsigned int niov = conn->ksnc_rx_nkiov;
+#endif
+       lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+       struct msghdr msg = {
+               .msg_name       = NULL,
+               .msg_namelen    = 0,
+               .msg_iov        = scratchiov,
+               .msg_iovlen     = niov,
+               .msg_control    = NULL,
+               .msg_controllen = 0,
+               .msg_flags      = 0
+       };
+       int nob;
+       int i;
+       size_t rcvlen;
+       int rc;
-       CFS_CONE_IN;
-       fdrelse(current_proc(), fd);
-       CFS_CONE_EX;
+       /* NB we can't trust socket ops to either consume our iovs
+        * or leave them alone.
*/ + for (nob = i = 0; i < niov; i++) { + scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + \ + kiov[i].kiov_offset; + nob += scratchiov[i].iov_len = kiov[i].kiov_len; + } + LASSERT (nob <= conn->ksnc_rx_nob_wanted); + rc = -sock_receive(C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen); + for (i = 0; i < niov; i++) + cfs_kunmap(kiov[i].kiov_page); + if (rc == 0) + rc = rcvlen; + return (rc); } void -ksocknal_lib_bind_irq (unsigned int irq) +ksocknal_lib_eager_ack (ksock_conn_t *conn) { - return; + /* XXX Liang: */ } -unsigned int -ksocknal_lib_sock_irq (struct socket *sock) +int +ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) { - return 0; + socket_t sock = C2B_SOCK(conn->ksnc_sock); + int len; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) { + LASSERT (conn->ksnc_closing); + *txmem = *rxmem = *nagle = 0; + return (-ESHUTDOWN); + } + rc = libcfs_sock_getbuf(conn->ksnc_sock, txmem, rxmem); + if (rc == 0) { + len = sizeof(*nagle); + rc = -sock_getsockopt(sock, IPPROTO_TCP, TCP_NODELAY, + nagle, &len); + } + ksocknal_connsock_decref(conn); + + if (rc == 0) + *nagle = !*nagle; + else + *txmem = *rxmem = *nagle = 0; + + return (rc); } int -ksocknal_lib_get_conn_addrs (ksock_conn_t *conn) -{ - struct sockaddr_in *sin; - struct sockaddr *sa; - int rc; - CFS_DECL_NET_DATA; +ksocknal_lib_setup_sock (cfs_socket_t *sock) +{ + int rc; + int option; + int keep_idle; + int keep_intvl; + int keep_count; + int do_keepalive; + socket_t so = C2B_SOCK(sock); + struct linger linger; - CFS_NET_IN; - rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa); - LASSERT (!conn->ksnc_closing); - if (rc != 0) { - CFS_NET_EX; - if (sa) FREE(sa, M_SONAME); - CERROR ("Error %d getting sock peer IP\n", rc); - return rc; + /* Ensure this socket aborts active sends immediately when we close + * it. */ + linger.l_onoff = 0; + linger.l_linger = 0; + rc = -sock_setsockopt(so, SOL_SOCKET, SO_LINGER, &linger, sizeof(linger)); + if (rc != 0) { + CERROR ("Can't set SO_LINGER: %d\n", rc); + return (rc); + } + + if (!*ksocknal_tunables.ksnd_nagle) { + option = 1; + rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option)); + if (rc != 0) { + CERROR ("Can't disable nagle: %d\n", rc); + return (rc); + } } - sin = (struct sockaddr_in *)sa; - conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr); - conn->ksnc_port = ntohs (sin->sin_port); - if (sa) FREE(sa, M_SONAME); - rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa); - CFS_NET_EX; + + rc = libcfs_sock_setbuf(sock, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + + /* snapshot tunables */ + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; + + do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); + option = (do_keepalive ? 
1 : 0); + + rc = -sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option)); if (rc != 0) { - if (sa) FREE(sa, M_SONAME); - CERROR ("Error %d getting sock local IP\n", rc); - return rc; - } - conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr); + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } + + if (!do_keepalive) + return (rc); + rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_KEEPALIVE, + &keep_idle, sizeof(keep_idle)); + + return (rc); +} - return 0; +void +ksocknal_lib_push_conn(ksock_conn_t *conn) +{ + socket_t sock; + int val = 1; + int rc; + + rc = ksocknal_connsock_addref(conn); + if (rc != 0) /* being shut down */ + return; + sock = C2B_SOCK(conn->ksnc_sock); + + rc = -sock_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val)); + LASSERT(rc == 0); + + ksocknal_connsock_decref(conn); + return; } +extern void ksocknal_read_callback (ksock_conn_t *conn); +extern void ksocknal_write_callback (ksock_conn_t *conn); + +static void +ksocknal_upcall(socket_t so, void *arg, int waitf) +{ + ksock_conn_t *conn = (ksock_conn_t *)arg; + ENTRY; + + read_lock (&ksocknal_data.ksnd_global_lock); + if (conn == NULL) + goto out; + + ksocknal_read_callback (conn); + /* XXX Liang */ + ksocknal_write_callback (conn); +out: + read_unlock (&ksocknal_data.ksnd_global_lock); + EXIT; +} + +void +ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn) +{ + /* No callback need to save in osx */ + return; +} + +void +ksocknal_lib_set_callback(cfs_socket_t *sock, ksock_conn_t *conn) +{ + libcfs_sock_set_cb(sock, ksocknal_upcall, (void *)conn); + return; +} + +void +ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn) +{ + libcfs_sock_reset_cb(sock); +} + +#else /* !__DARWIN8__ */ + int ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) { #if SOCKNAL_SINGLE_FRAG_TX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_niov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_niov; #endif struct socket *sock = conn->ksnc_sock; int nob; @@ -248,13 +586,13 @@ ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) #if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_tx_scratch_iov; - int niov = tx->tx_nkiov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = tx->tx_nkiov; #endif struct socket *sock = conn->ksnc_sock; - ptl_kiov_t *kiov = tx->tx_kiov; + lnet_kiov_t *kiov = tx->tx_kiov; int nob; int rc; int i; @@ -364,6 +702,10 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn) CFS_NET_IN; s = splnet(); + /* + * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo + * to send immediate ACK. + */ if (tp && tp->t_flags & TF_DELACK){ tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; @@ -371,14 +713,6 @@ ksocknal_lib_eager_ack (ksock_conn_t *conn) } splx(s); - /* - * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo - * to send immediate ACK. It's not the best resolution because - * tcp_fasttimo will send out ACK for all delayed-ack tcp socket. - * Anyway, it's working now. 
- * extern void tcp_fasttimo(); - * tcp_fasttimo(); - */ CFS_NET_EX; return; @@ -390,10 +724,10 @@ ksocknal_lib_recv_iov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_niov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_niov; #endif struct iovec *iov = conn->ksnc_rx_iov; int nob; @@ -444,12 +778,12 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) #if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK struct iovec scratch; struct iovec *scratchiov = &scratch; - int niov = 1; + unsigned int niov = 1; #else - struct iovec *scratchiov = conn->ksnc_rx_scratch_iov; - int niov = conn->ksnc_rx_nkiov; + struct iovec *scratchiov = conn->ksnc_scheduler->kss_scratch_iov; + unsigned int niov = conn->ksnc_rx_nkiov; #endif - ptl_kiov_t *kiov = conn->ksnc_rx_kiov; + lnet_kiov_t *kiov = conn->ksnc_rx_kiov; int nob; int rc; int i; @@ -497,138 +831,43 @@ ksocknal_lib_recv_kiov (ksock_conn_t *conn) } int -ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob) -{ - int rc; - CFS_DECL_NET_DATA; - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct uio suio = { - .uio_iov = &iov, - .uio_iovcnt = 1, - .uio_offset = 0, - .uio_resid = nob, - .uio_segflg = UIO_SYSSPACE, - .uio_rw = UIO_WRITE, - .uio_procp = NULL - }; - - CFS_NET_IN; - rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0); - CFS_NET_EX; - - if (rc != 0) { - if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\ - rc == EWOULDBLOCK)) - rc = 0; - if ( rc != 0 ) - return -rc; - rc = nob - suio.uio_resid; - buffer = ((char *)buffer) + rc; - nob = suio.uio_resid; - continue; - } - break; - } - - return (0); -} - -int -ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob) -{ - int rc; - CFS_DECL_NET_DATA; - - while (nob > 0) { - struct iovec iov = { - .iov_base = buffer, - .iov_len = nob - }; - struct uio ruio = { - .uio_iov = &iov, - .uio_iovcnt = 1, - .uio_offset = 0, - .uio_resid = nob, - .uio_segflg = UIO_SYSSPACE, - .uio_rw = UIO_READ, - .uio_procp = NULL - }; - - CFS_NET_IN; - rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0); - CFS_NET_EX; - - if (rc != 0) { - if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\ - rc == EWOULDBLOCK)) - rc = 0; - if (rc != 0) - return -rc; - rc = nob - ruio.uio_resid; - buffer = ((char *)buffer) + rc; - nob = ruio.uio_resid; - continue; - } - break; - } - - return (0); -} - -int ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) { - struct sockopt sopt; struct socket *sock = conn->ksnc_sock; - int len; int rc; - CFS_DECL_NET_DATA; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) { LASSERT (conn->ksnc_closing); *txmem = *rxmem = *nagle = 0; - rc = -ESHUTDOWN; - goto out; - } - len = sizeof(*txmem); - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_GET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_SNDBUF; - sopt.sopt_val = txmem; - sopt.sopt_valsize = len; - - CFS_NET_IN; - rc = sogetopt(sock, &sopt); - if (rc == 0) { - len = sizeof(*rxmem); - sopt.sopt_name = SO_RCVBUF; - sopt.sopt_val = rxmem; - rc = sogetopt(sock, &sopt); + return -ESHUTDOWN; } + rc = libcfs_sock_getbuf(sock, txmem, rxmem); if (rc == 0) { + struct sockopt sopt; 
+ int len; + CFS_DECL_NET_DATA; + len = sizeof(*nagle); + bzero(&sopt, sizeof sopt); + sopt.sopt_dir = SOPT_GET; sopt.sopt_level = IPPROTO_TCP; sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = nagle; - rc = sogetopt(sock, &sopt); + sopt.sopt_valsize = len; + + CFS_NET_IN; + rc = -sogetopt(sock, &sopt); + CFS_NET_EX; } - CFS_NET_EX; - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); if (rc == 0) *nagle = !*nagle; else *txmem = *rxmem = *nagle = 0; -out: - return (-rc); + return (rc); } int @@ -644,9 +883,18 @@ ksocknal_lib_setup_sock (struct socket *so) struct linger linger; CFS_DECL_NET_DATA; + rc = libcfs_sock_setbuf(so, + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size); + if (rc != 0) { + CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n", + *ksocknal_tunables.ksnd_tx_buffer_size, + *ksocknal_tunables.ksnd_rx_buffer_size, rc); + return (rc); + } + /* Ensure this socket aborts active sends immediately when we close * it. */ - bzero(&sopt, sizeof sopt); linger.l_onoff = 0; @@ -658,14 +906,13 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_valsize = sizeof(linger); CFS_NET_IN; - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't set SO_LINGER: %d\n", rc); goto out; } - - if (!ksocknal_tunables.ksnd_nagle) { + if (!*ksocknal_tunables.ksnd_nagle) { option = 1; bzero(&sopt, sizeof sopt); sopt.sopt_dir = SOPT_SET; @@ -673,41 +920,17 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_name = TCP_NODELAY; sopt.sopt_val = &option; sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't disable nagle: %d\n", rc); goto out; } } - if (ksocknal_tunables.ksnd_buffer_size > 0) { - option = ksocknal_tunables.ksnd_buffer_size; - if (option > ksocknal_mbuf_size) - option = ksocknal_mbuf_size; - - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_SNDBUF; - sopt.sopt_val = &option; - sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); - if (rc != 0) { - CERROR ("Can't set send buffer %d: %d\n", - option, rc); - goto out; - } - - sopt.sopt_name = SO_RCVBUF; - rc = sosetopt(so, &sopt); - if (rc != 0) { - CERROR ("Can't set receive buffer %d: %d\n", - option, rc); - goto out; - } - } + /* snapshot tunables */ - keep_idle = ksocknal_tunables.ksnd_keepalive_idle; - keep_count = ksocknal_tunables.ksnd_keepalive_count; - keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl; + keep_idle = *ksocknal_tunables.ksnd_keepalive_idle; + keep_count = *ksocknal_tunables.ksnd_keepalive_count; + keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl; do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); option = (do_keepalive ? 
1 : 0); @@ -717,7 +940,7 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_name = SO_KEEPALIVE; sopt.sopt_val = &option; sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); goto out; @@ -735,161 +958,14 @@ ksocknal_lib_setup_sock (struct socket *so) sopt.sopt_name = TCP_KEEPALIVE; sopt.sopt_val = &keep_idle; sopt.sopt_valsize = sizeof(keep_idle); - rc = sosetopt(so, &sopt); + rc = -sosetopt(so, &sopt); if (rc != 0) { CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc); goto out; } out: CFS_NET_EX; - return (-rc); -} - -int -ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry, - ksock_route_t *route, int local_port) -{ - struct sockaddr_in locaddr; - struct sockaddr_in srvaddr; - struct timeval tv; - int fd; - struct socket *so; - struct sockopt sopt; - int option; - int rc; - int s; - CFS_DECL_FUNNEL_DATA; - - ENTRY; - bzero (&locaddr, sizeof (locaddr)); - locaddr.sin_len = sizeof(struct sockaddr_in); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons (local_port); - locaddr.sin_addr.s_addr = - (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr) - : INADDR_ANY; - bzero(&srvaddr, sizeof(srvaddr)); - srvaddr.sin_len = sizeof(struct sockaddr_in); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons (route->ksnr_port); - srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr); - - *may_retry = 0; - - CFS_NET_IN; - rc = socreate(PF_INET, &so, SOCK_STREAM, 0); - CFS_NET_EX; - *sockp = so; - if (rc != 0) { - CERROR ("Can't create autoconnect socket: %d\n", rc); - return (-rc); - } - - /* - * XXX - * Liang: what do we need here? - */ - fd = sock_map_fd (so); - if (fd < 0) { - sock_release (so); - CERROR ("sock_map_fd error %d\n", fd); - return (fd); - } - sock_fdrelse(fd); - - /* Set the socket timeouts, so our connection attempt completes in - * finite time */ - tv.tv_sec = ksocknal_tunables.ksnd_io_timeout; - tv.tv_usec = 0; - bzero(&sopt, sizeof sopt); - sopt.sopt_dir = SOPT_SET; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_SNDTIMEO; - sopt.sopt_val = &tv; - sopt.sopt_valsize = sizeof(tv); - - CFS_NET_IN; - rc = sosetopt(so, &sopt); - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't set send timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto out; - } - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_RCVTIMEO; - rc = sosetopt(so, &sopt); - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't set receive timeout %d: %d\n", - ksocknal_tunables.ksnd_io_timeout, rc); - goto out; - } - option = 1; - sopt.sopt_level = SOL_SOCKET; - sopt.sopt_name = SO_REUSEADDR; - sopt.sopt_val = &option; - sopt.sopt_valsize = sizeof(option); - rc = sosetopt(so, &sopt); - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't set sock reuse address: %d\n", rc); - goto out; - } - rc = sobind(so, (struct sockaddr *)&locaddr); - if (rc == EADDRINUSE) { - CFS_NET_EX; - CDEBUG(D_NET, "Port %d already in use\n", local_port); - *may_retry = 1; - goto out; - } - if (rc != 0) { - CFS_NET_EX; - CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n", - HIPQUAD(route->ksnr_myipaddr), rc); - goto out; - } - rc = soconnect(so, (struct sockaddr *)&srvaddr); - *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE); - if (rc != 0) { - CFS_NET_EX; - if (rc != EADDRNOTAVAIL && rc != EADDRINUSE) - CERROR ("Can't connect to nid "LPX64 - " local IP: %u.%u.%u.%u," - " remote IP: %u.%u.%u.%u/%d: %d\n", - route->ksnr_peer->ksnp_nid, - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(route->ksnr_ipaddr), - 
route->ksnr_port, rc); - goto out; - } - - s = splnet(); - while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { - CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n"); - (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz); - } - LASSERT((so->so_state & SS_ISCONNECTED)); - splx(s); - CFS_NET_EX; - - rc = so->so_error; - if (rc != 0) { - CERROR ("Error %d waiting for connection to nid "LPX64 - " local IP: %u.%u.%u.%u," - " remote IP: %u.%u.%u.%u/%d: %d\n", rc, - route->ksnr_peer->ksnp_nid, - HIPQUAD(route->ksnr_myipaddr), - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port, rc); - goto out; - } - return (-rc); - - out: - rele_file(KSN_SOCK2FILE(so)); - - return (-rc); + return (rc); } void @@ -901,7 +977,7 @@ ksocknal_lib_push_conn(ksock_conn_t *conn) int rc; CFS_DECL_NET_DATA; - rc = ksocknal_getconnsock (conn); + rc = ksocknal_connsock_addref(conn); if (rc != 0) /* being shut down */ return; sock = conn->ksnc_sock; @@ -916,47 +992,36 @@ ksocknal_lib_push_conn(ksock_conn_t *conn) sosetopt(sock, &sopt); CFS_NET_EX; - ksocknal_putconnsock (conn); + ksocknal_connsock_decref(conn); return; } + extern void ksocknal_read_callback (ksock_conn_t *conn); extern void ksocknal_write_callback (ksock_conn_t *conn); static void ksocknal_upcall(struct socket *so, caddr_t arg, int waitf) { - ksock_conn_t *conn; - CFS_DECL_NET_DATA; + ksock_conn_t *conn = (ksock_conn_t *)arg; ENTRY; read_lock (&ksocknal_data.ksnd_global_lock); - conn = so->reserved3; - - if (conn == NULL){ - /* More processing is needed? */ + if (conn == NULL) goto out; - } - if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg ) { + + if (so->so_rcv.sb_flags & SB_UPCALL) { extern int soreadable(struct socket *so); - CFS_NET_IN; - if (conn->ksnc_rx_nob_wanted && soreadable(so)){ + if (conn->ksnc_rx_nob_wanted && soreadable(so)) /* To verify whether the upcall is for receive */ - CFS_NET_EX; ksocknal_read_callback (conn); - }else - CFS_NET_EX; } /* go foward? */ - if ((so->so_snd.sb_flags & SB_UPCALL) || !arg){ + if (so->so_snd.sb_flags & SB_UPCALL){ extern int sowriteable(struct socket *so); - CFS_NET_IN; - if (sowriteable(so)){ + if (sowriteable(so)) /* socket is writable */ - CFS_NET_EX; ksocknal_write_callback(conn); - } else - CFS_NET_EX; } out: read_unlock (&ksocknal_data.ksnd_global_lock); @@ -977,22 +1042,24 @@ ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn) CFS_DECL_NET_DATA; CFS_NET_IN; - sock->so_upcallarg = (void *)sock; /* anything not NULL */ + sock->so_upcallarg = (void *)conn; sock->so_upcall = ksocknal_upcall; sock->so_snd.sb_timeo = 0; - sock->so_rcv.sb_timeo = 2 * HZ; + sock->so_rcv.sb_timeo = cfs_time_seconds(2); sock->so_rcv.sb_flags |= SB_UPCALL; sock->so_snd.sb_flags |= SB_UPCALL; - sock->reserved3 = conn; CFS_NET_EX; return; } void -ksocknal_lib_act_callback(struct socket *sock) +ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn) { - /* upcall will take the network funnel */ - ksocknal_upcall (sock, 0, 0); + CFS_DECL_NET_DATA; + + CFS_NET_IN; + ksocknal_upcall (sock, (void *)conn, 0); + CFS_NET_EX; } void @@ -1001,11 +1068,11 @@ ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn) CFS_DECL_NET_DATA; CFS_NET_IN; - sock->so_upcall = NULL; - sock->so_upcallarg = NULL; sock->so_rcv.sb_flags &= ~SB_UPCALL; sock->so_snd.sb_flags &= ~SB_UPCALL; + sock->so_upcall = NULL; + sock->so_upcallarg = NULL; CFS_NET_EX; } - +#endif /* !__DARWIN8__ */
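
Editorial note, not part of the patch: under __DARWIN8__ the new code drives the socket through the XNU kernel socket KPI (sock_send()/sock_receive() on a socket_t) rather than the pre-Tiger sosend()/soreceive() internals kept in the #else branch. The sketch below is a minimal, self-contained illustration of that send convention as used by ksocknal_lib_send_iov() above: gather fragments into a msghdr, send non-blocking, and return either the byte count or a negated kernel errno. It assumes the standard <sys/kpi_socket.h> prototype for sock_send(); demo_send_iov() and its parameters are illustrative names only, not Lustre symbols.

/* Illustrative sketch of the Darwin 8 KPI send pattern used above. */
#include <sys/kpi_socket.h>   /* socket_t, sock_send(), errno_t */
#include <sys/socket.h>       /* struct msghdr, MSG_DONTWAIT */
#include <sys/uio.h>          /* struct iovec */
#include <string.h>           /* memset() */

static int
demo_send_iov(socket_t so, struct iovec *scratchiov, unsigned int niov)
{
        struct msghdr msg;
        size_t        sent = 0;
        errno_t       err;

        memset(&msg, 0, sizeof(msg));
        msg.msg_iov    = scratchiov;    /* caller-filled scratch fragments */
        msg.msg_iovlen = niov;
        msg.msg_flags  = MSG_DONTWAIT;  /* never block the calling thread */

        err = sock_send(so, &msg, MSG_DONTWAIT, &sent);
        if (err != 0)
                return -err;            /* < 0: kernel errno */
        return (int)sent;               /* >= 0: bytes actually sent */
}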