/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
* vim:expandtab:shiftwidth=8:tabstop=8:
*
- * Copyright (C) 2002 Cluster File Systems, Inc.
- * Author: Phil Schwan <phil@clusterfs.com>
+ * GPL HEADER START
*
- * This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
- * Lustre is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
*
- * Lustre is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU General Public License
- * along with Lustre; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/socklnd/socklnd_lib-darwin.c
*
* Darwin porting library
* Make things easy to port
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
*/
#include <mach/mach_types.h>
#include <string.h>
#include <netinet/tcp.h>
#include <sys/file.h>
-#include "socknal.h"
+#include "socklnd.h"
-#if 0
-#undef SOCKNAL_SINGLE_FRAG_TX
-#define SOCKNAL_SINGLE_FRAG_TX 1
-#undef SOCKNAL_SINGLE_FRAG_RX
-#define SOCKNAL_SINGLE_FRAG_RX 1
-#endif
+# if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
-SYSCTL_DECL(_portals);
+SYSCTL_DECL(_lnet);
-SYSCTL_NODE (_portals, OID_AUTO, ksocknal, CTLFLAG_RW,
- 0, "ksocknal_sysctl");
+SYSCTL_NODE (_lnet, OID_AUTO, ksocknal, CTLFLAG_RW,
+ 0, "ksocknal_sysctl");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, timeout,
- CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_io_timeout,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, timeout,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_timeout,
0, "timeout");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, eager_ack,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, credits,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_credits,
+ 0, "credits");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, peer_credits,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_peercredits,
+ 0, "peer_credits");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, nconnds,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nconnds,
+ 0, "nconnds");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, min_reconnectms,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_reconnectms,
+ 0, "min_reconnectms");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, max_reconnectms,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_max_reconnectms,
+ 0, "max_reconnectms");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, eager_ack,
CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_eager_ack,
0, "eager_ack");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, typed,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, typed,
CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_typed_conns,
0, "typed");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, min_bulk,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, min_bulk,
CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_bulk,
0, "min_bulk");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, buffer_size,
- CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_buffer_size,
- 0, "buffer_size");
-SYSCTL_INT(_portals_ksocknal, OID_AUTO, nagle,
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, rx_buffer_size,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_rx_buffer_size,
+ 0, "rx_buffer_size");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, tx_buffer_size,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_tx_buffer_size,
+ 0, "tx_buffer_size");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, nagle,
CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nagle,
0, "nagle");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_idle,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_idle,
+ 0, "keepalive_idle");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_count,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_count,
+ 0, "keepalive_count");
+SYSCTL_INT(_lnet_ksocknal, OID_AUTO, keepalive_intvl,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_keepalive_intvl,
+ 0, "keepalive_intvl");
cfs_sysctl_table_t ksocknal_top_ctl_table [] = {
- &sysctl__portals_ksocknal,
- &sysctl__portals_ksocknal_timeout,
- &sysctl__portals_ksocknal_eager_ack,
- &sysctl__portals_ksocknal_typed,
- &sysctl__portals_ksocknal_min_bulk,
- &sysctl__portals_ksocknal_buffer_size,
- &sysctl__portals_ksocknal_nagle,
+ &sysctl__lnet_ksocknal,
+ &sysctl__lnet_ksocknal_timeout,
+ &sysctl__lnet_ksocknal_credits,
+ &sysctl__lnet_ksocknal_peer_credits,
+ &sysctl__lnet_ksocknal_nconnds,
+ &sysctl__lnet_ksocknal_min_reconnectms,
+ &sysctl__lnet_ksocknal_max_reconnectms,
+ &sysctl__lnet_ksocknal_eager_ack,
+ &sysctl__lnet_ksocknal_typed,
+ &sysctl__lnet_ksocknal_min_bulk,
+ &sysctl__lnet_ksocknal_rx_buffer_size,
+ &sysctl__lnet_ksocknal_tx_buffer_size,
+ &sysctl__lnet_ksocknal_nagle,
+ &sysctl__lnet_ksocknal_keepalive_idle,
+ &sysctl__lnet_ksocknal_keepalive_count,
+ &sysctl__lnet_ksocknal_keepalive_intvl,
NULL
};
-static unsigned long ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
-
-struct socket *
-sockfd_lookup(int fd, void *foo)
+int
+ksocknal_lib_tunables_init ()
{
- struct socket *so;
- struct file *fp;
- CFS_DECL_FUNNEL_DATA;
+ ksocknal_tunables.ksnd_sysctl =
+ cfs_register_sysctl_table (ksocknal_top_ctl_table, 0);
- CFS_NET_IN;
- getsock(current_proc()->p_fd, fd, &fp);
- CFS_NET_EX;
- so = (struct socket *)fp->f_data;
- so->reserved4 = fp;
- CFS_CONE_IN;
- fref(fp);
- CFS_CONE_EX;
- return so;
-}
+ if (ksocknal_tunables.ksnd_sysctl == NULL)
+ return -ENOMEM;
-extern struct fileops socketops;
+ return 0;
+}
-static int
-sock_map_fd (struct socket *so)
+void
+ksocknal_lib_tunables_fini ()
{
- struct file *fp;
- int fd;
- CFS_DECL_FUNNEL_DATA;
-
- CFS_CONE_IN;
- falloc(current_proc(), &fp, &fd);
- fp->f_flag = FREAD|FWRITE;
- fp->f_type = DTYPE_SOCKET;
- fp->f_ops = &socketops;
- fp->f_data = (caddr_t)so;
- so->reserved4 = fp;
- *fdflags(current_proc(), fd) &= ~UF_RESERVED;
- CFS_CONE_EX;
-
- return fd;
+ if (ksocknal_tunables.ksnd_sysctl != NULL)
+ cfs_unregister_sysctl_table (ksocknal_tunables.ksnd_sysctl);
}
-
-static void
-sock_release(struct socket *so)
+#else
+int
+ksocknal_lib_tunables_init ()
{
- struct file *fp;
- CFS_DECL_FUNNEL_DATA;
-
- fp = (struct file *)so->reserved4;
- so->reserved4 = NULL;
- fp->f_data = NULL;
- CFS_CONE_IN;
- frele(fp);
- CFS_CONE_EX;
- CFS_NET_IN;
- soshutdown(so, 0);
- CFS_NET_EX;
+ return 0;
}
-static void
-sock_fdrelse(int fd)
-{
- CFS_DECL_FUNNEL_DATA;
-
- CFS_CONE_IN;
- fdrelse(current_proc(), fd);
- CFS_CONE_EX;
+void
+ksocknal_lib_tunables_fini ()
+{
}
+#endif
+
+/*
+ * To use bigger buffer for socket:
+ * 1. Increase nmbclusters (cannot be increased via sysctl because it is read-only, so
+ * we must patch kernel).
+ * 2. Increase net.inet.tcp.reass.maxsegments
+ * 3. Increase net.inet.tcp.sendspace
+ * 4. Increase net.inet.tcp.recvspace
+ * 5. Increase kern.ipc.maxsockbuf
+ */
+#define KSOCKNAL_MAX_BUFFER (1152*1024)
void
ksocknal_lib_bind_irq (unsigned int irq)
}
unsigned int
-ksocknal_lib_sock_irq (struct socket *sock)
+ksocknal_lib_sock_irq (cfs_socket_t *sock)
{
return 0;
}
int
ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
{
- struct sockaddr_in *sin;
- struct sockaddr *sa;
- int rc;
- CFS_DECL_NET_DATA;
+ int rc = libcfs_sock_getaddr(conn->ksnc_sock, 1,
+ &conn->ksnc_ipaddr,
+ &conn->ksnc_port);
- CFS_NET_IN;
- rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa);
- LASSERT (!conn->ksnc_closing);
- if (rc != 0) {
- CFS_NET_EX;
- if (sa) FREE(sa, M_SONAME);
- CERROR ("Error %d getting sock peer IP\n", rc);
- return rc;
+ /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */
+ LASSERT (!conn->ksnc_closing);
+
+ if (rc != 0) {
+ CERROR ("Error %d getting sock peer IP\n", rc);
+ return rc;
+ }
+
+ rc = libcfs_sock_getaddr(conn->ksnc_sock, 0,
+ &conn->ksnc_myipaddr, NULL);
+ if (rc != 0) {
+ CERROR ("Error %d getting sock local IP\n", rc);
+ return rc;
+ }
+
+ return 0;
+}
+
+#ifdef __DARWIN8__
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+ socket_t sock = C2B_SOCK(conn->ksnc_sock);
+ size_t sndlen;
+ int nob;
+ int rc;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ unsigned int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+ unsigned int niov = tx->tx_niov;
+#endif
+ struct msghdr msg = {
+ .msg_name = NULL,
+ .msg_namelen = 0,
+ .msg_iov = scratchiov,
+ .msg_iovlen = niov,
+ .msg_control = NULL,
+ .msg_controllen = 0,
+ .msg_flags = MSG_DONTWAIT
+ };
+
+ int i;
+
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = tx->tx_iov[i];
+ nob += scratchiov[i].iov_len;
}
- sin = (struct sockaddr_in *)sa;
- conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr);
- conn->ksnc_port = ntohs (sin->sin_port);
- if (sa) FREE(sa, M_SONAME);
- rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa);
- CFS_NET_EX;
- if (rc != 0) {
- if (sa) FREE(sa, M_SONAME);
- CERROR ("Error %d getting sock local IP\n", rc);
- return rc;
+
+ /*
+ * XXX Liang:
+ * Linux has MSG_MORE, do we have anything to
+ * reduce number of partial TCP segments sent?
+ */
+ rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+ if (rc == 0)
+ rc = sndlen;
+ return rc;
+}
+
+/*
+ * Send the paged (kiov) payload fragments of tx on conn's socket in a
+ * single non-blocking sock_send().  Returns the number of bytes sent,
+ * or a negative errno on failure; a short send is left for the caller
+ * to handle.
+ */
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+        socket_t sock = C2B_SOCK(conn->ksnc_sock);
+        lnet_kiov_t *kiov = tx->tx_kiov;
+        int rc;
+        int nob;
+        size_t sndlen;
+
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        unsigned int niov = tx->tx_nkiov;
+#endif
+        struct msghdr msg = {
+                .msg_name = NULL,
+                .msg_namelen = 0,
+                .msg_iov = scratchiov,
+                .msg_iovlen = niov,
+                .msg_control = NULL,
+                .msg_controllen = 0,
+                .msg_flags = MSG_DONTWAIT
+        };
+
+        int i;
+
+        /* Map each page and flatten the kiovs into a plain iovec; nob
+         * accumulates the total byte count being offered to the socket. */
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+
+        /*
+         * XXX Liang:
+         * Linux has MSG_MORE; do we have anything to
+         * reduce number of partial TCP segments sent?
+         */
+        /* The Darwin socket KPI reports failure as a positive errno;
+         * negate so callers see the usual -errno convention. */
+        rc = -sock_send(sock, &msg, MSG_DONTWAIT, &sndlen);
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page);
+        if (rc == 0)
+                rc = sndlen;
+        return rc;
+}
+
+/*
+ * Receive into conn's current plain iovec fragments (ksnc_rx_iov).
+ * Returns the number of bytes received, or a negative errno.
+ */
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        unsigned int niov = conn->ksnc_rx_niov;
+#endif
+        struct iovec *iov = conn->ksnc_rx_iov;
+        struct msghdr msg = {
+                .msg_name = NULL,
+                .msg_namelen = 0,
+                .msg_iov = scratchiov,
+                .msg_iovlen = niov,
+                .msg_control = NULL,
+                .msg_controllen = 0,
+                .msg_flags = 0
+        };
+        size_t rcvlen;
+        int nob;
+        int i;
+        int rc;
+
+        LASSERT (niov > 0);
+
+        /* Work on a scratch copy of the rx iovecs: socket ops may consume
+         * or modify the iovs they are given (see recv_kiov's NB). */
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = iov[i];
+                nob += scratchiov[i].iov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+        /* sock_receive() returns a positive errno on failure; negate it. */
+        rc = -sock_receive (C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen);
+        if (rc == 0)
+                rc = rcvlen;
+
+        return rc;
+}
+
+/*
+ * Receive into conn's current paged (kiov) fragments (ksnc_rx_kiov).
+ * Returns the number of bytes received, or a negative errno.
+ */
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        unsigned int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        unsigned int niov = conn->ksnc_rx_nkiov;
+#endif
+        lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
+        struct msghdr msg = {
+                .msg_name = NULL,
+                .msg_namelen = 0,
+                .msg_iov = scratchiov,
+                .msg_iovlen = niov,
+                .msg_control = NULL,
+                .msg_controllen = 0,
+                .msg_flags = 0
+        };
+        int nob;
+        int i;
+        size_t rcvlen;
+        int rc;
+
+        /* NB we can't trust socket ops to either consume our iovs
+         * or leave them alone. */
+        /* Map each page into the scratch iovec before receiving. */
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + \
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+        /* sock_receive() returns a positive errno on failure; negate it. */
+        rc = -sock_receive(C2B_SOCK(conn->ksnc_sock), &msg, MSG_DONTWAIT, &rcvlen);
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page);
+        if (rc == 0)
+                rc = rcvlen;
+        return (rc);
+}
+
+/*
+ * Ask the stack to ACK immediately instead of delaying.  Stub on
+ * Darwin 8: the pre-Darwin8 code notes BSD has no TCP_QUICKACK
+ * equivalent, and no replacement has been implemented here yet.
+ */
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+        /* XXX Liang: */
+}
+
+/*
+ * Snapshot the connection's socket tunables: send/receive buffer sizes
+ * and whether nagle is enabled (*nagle is the inverse of TCP_NODELAY).
+ * Returns 0 or a negative errno; on failure all three outputs are
+ * zeroed.
+ */
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        socket_t sock = C2B_SOCK(conn->ksnc_sock);
+        int len;
+        int rc;
+
+        /* Hold a ref on the socket so it can't be torn down under us. */
+        rc = ksocknal_connsock_addref(conn);
+        if (rc != 0) {
+                LASSERT (conn->ksnc_closing);
+                *txmem = *rxmem = *nagle = 0;
+                return (-ESHUTDOWN);
+        }
+        rc = libcfs_sock_getbuf(conn->ksnc_sock, txmem, rxmem);
+        if (rc == 0) {
+                len = sizeof(*nagle);
+                rc = -sock_getsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                                      nagle, &len);
+        }
+        ksocknal_connsock_decref(conn);
+
+        /* TCP_NODELAY set means nagle is OFF, hence the inversion. */
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+
+        return (rc);
+}
+
+int
+ksocknal_lib_setup_sock (cfs_socket_t *sock)
+{
+ int rc;
+ int option;
+ int keep_idle;
+ int keep_intvl;
+ int keep_count;
+ int do_keepalive;
+ socket_t so = C2B_SOCK(sock);
+ struct linger linger;
+
+ /* Ensure this socket aborts active sends immediately when we close
+ * it. */
+ linger.l_onoff = 0;
+ linger.l_linger = 0;
+ rc = -sock_setsockopt(so, SOL_SOCKET, SO_LINGER, &linger, sizeof(linger));
+ if (rc != 0) {
+ CERROR ("Can't set SO_LINGER: %d\n", rc);
+ return (rc);
+ }
+
+ if (!*ksocknal_tunables.ksnd_nagle) {
+ option = 1;
+ rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option));
+ if (rc != 0) {
+ CERROR ("Can't disable nagle: %d\n", rc);
+ return (rc);
+ }
}
- conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr);
- return 0;
+ rc = libcfs_sock_setbuf(sock,
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size);
+ if (rc != 0) {
+ CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+ return (rc);
+ }
+
+ /* snapshot tunables */
+ keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
+ keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+ keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
+
+ do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+ option = (do_keepalive ? 1 : 0);
+
+ rc = -sock_setsockopt(so, SOL_SOCKET, SO_KEEPALIVE, &option, sizeof(option));
+ if (rc != 0) {
+ CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+ return (rc);
+ }
+
+ if (!do_keepalive)
+ return (rc);
+ rc = -sock_setsockopt(so, IPPROTO_TCP, TCP_KEEPALIVE,
+ &keep_idle, sizeof(keep_idle));
+
+ return (rc);
}
+/*
+ * Push any queued data out on the connection by (re)setting
+ * TCP_NODELAY on its socket.  No-op if the connection is already
+ * being shut down.
+ */
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{
+        socket_t sock;
+        int val = 1;
+        int rc;
+
+        rc = ksocknal_connsock_addref(conn);
+        if (rc != 0) /* being shut down */
+                return;
+        sock = C2B_SOCK(conn->ksnc_sock);
+
+        rc = -sock_setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &val, sizeof(val));
+        LASSERT(rc == 0);
+
+        ksocknal_connsock_decref(conn);
+        return;
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+
+/*
+ * Socket upcall installed via libcfs_sock_set_cb(); arg is the owning
+ * ksock_conn_t.  The event type is not distinguished here, so both the
+ * read and write callbacks are poked (see XXX below).  Runs under the
+ * global read lock; bails out if no conn is attached.
+ */
+static void
+ksocknal_upcall(socket_t so, void *arg, int waitf)
+{
+        ksock_conn_t *conn = (ksock_conn_t *)arg;
+        ENTRY;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        if (conn == NULL)
+                goto out;
+
+        ksocknal_read_callback (conn);
+        /* XXX Liang */
+        ksocknal_write_callback (conn);
+out:
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+        EXIT;
+}
+
+/* Save the socket's existing callback before overriding it. */
+void
+ksocknal_lib_save_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{
+        /* No callback needs to be saved on OS X. */
+        return;
+}
+
+/* Attach ksocknal_upcall to the socket, with conn as its argument. */
+void
+ksocknal_lib_set_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{
+        libcfs_sock_set_cb(sock, ksocknal_upcall, (void *)conn);
+        return;
+}
+
+/* Detach the upcall installed by ksocknal_lib_set_callback(). */
+void
+ksocknal_lib_reset_callback(cfs_socket_t *sock, ksock_conn_t *conn)
+{
+        libcfs_sock_reset_cb(sock);
+}
+
+#else /* !__DARWIN8__ */
+
int
ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
{
#if SOCKNAL_SINGLE_FRAG_TX
struct iovec scratch;
struct iovec *scratchiov = &scratch;
- int niov = 1;
+ unsigned int niov = 1;
#else
struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
- int niov = tx->tx_niov;
+ unsigned int niov = tx->tx_niov;
#endif
struct socket *sock = conn->ksnc_sock;
int nob;
#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
struct iovec scratch;
struct iovec *scratchiov = &scratch;
- int niov = 1;
+ unsigned int niov = 1;
#else
struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
- int niov = tx->tx_nkiov;
+ unsigned int niov = tx->tx_nkiov;
#endif
struct socket *sock = conn->ksnc_sock;
- ptl_kiov_t *kiov = tx->tx_kiov;
+ lnet_kiov_t *kiov = tx->tx_kiov;
int nob;
int rc;
int i;
CFS_NET_IN;
s = splnet();
+ /*
+ * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
+ * to send immediate ACK.
+ */
if (tp && tp->t_flags & TF_DELACK){
tp->t_flags &= ~TF_DELACK;
tp->t_flags |= TF_ACKNOW;
}
splx(s);
- /*
- * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
- * to send immediate ACK. It's not the best resolution because
- * tcp_fasttimo will send out ACK for all delayed-ack tcp socket.
- * Anyway, it's working now.
- * extern void tcp_fasttimo();
- * tcp_fasttimo();
- */
CFS_NET_EX;
return;
#if SOCKNAL_SINGLE_FRAG_RX
struct iovec scratch;
struct iovec *scratchiov = &scratch;
- int niov = 1;
+ unsigned int niov = 1;
#else
struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
- int niov = conn->ksnc_rx_niov;
+ unsigned int niov = conn->ksnc_rx_niov;
#endif
struct iovec *iov = conn->ksnc_rx_iov;
int nob;
#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
struct iovec scratch;
struct iovec *scratchiov = &scratch;
- int niov = 1;
+ unsigned int niov = 1;
#else
struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
- int niov = conn->ksnc_rx_nkiov;
+ unsigned int niov = conn->ksnc_rx_nkiov;
#endif
- ptl_kiov_t *kiov = conn->ksnc_rx_kiov;
+ lnet_kiov_t *kiov = conn->ksnc_rx_kiov;
int nob;
int rc;
int i;
}
int
-ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
-{
- int rc;
- CFS_DECL_NET_DATA;
-
- while (nob > 0) {
- struct iovec iov = {
- .iov_base = buffer,
- .iov_len = nob
- };
- struct uio suio = {
- .uio_iov = &iov,
- .uio_iovcnt = 1,
- .uio_offset = 0,
- .uio_resid = nob,
- .uio_segflg = UIO_SYSSPACE,
- .uio_rw = UIO_WRITE,
- .uio_procp = NULL
- };
-
- CFS_NET_IN;
- rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
- CFS_NET_EX;
-
- if (rc != 0) {
- if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
- rc == EWOULDBLOCK))
- rc = 0;
- if ( rc != 0 )
- return -rc;
- rc = nob - suio.uio_resid;
- buffer = ((char *)buffer) + rc;
- nob = suio.uio_resid;
- continue;
- }
- break;
- }
-
- return (0);
-}
-
-int
-ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
-{
- int rc;
- CFS_DECL_NET_DATA;
-
- while (nob > 0) {
- struct iovec iov = {
- .iov_base = buffer,
- .iov_len = nob
- };
- struct uio ruio = {
- .uio_iov = &iov,
- .uio_iovcnt = 1,
- .uio_offset = 0,
- .uio_resid = nob,
- .uio_segflg = UIO_SYSSPACE,
- .uio_rw = UIO_READ,
- .uio_procp = NULL
- };
-
- CFS_NET_IN;
- rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
- CFS_NET_EX;
-
- if (rc != 0) {
- if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
- rc == EWOULDBLOCK))
- rc = 0;
- if (rc != 0)
- return -rc;
- rc = nob - ruio.uio_resid;
- buffer = ((char *)buffer) + rc;
- nob = ruio.uio_resid;
- continue;
- }
- break;
- }
-
- return (0);
-}
-
-int
ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
{
- struct sockopt sopt;
struct socket *sock = conn->ksnc_sock;
- int len;
int rc;
- CFS_DECL_NET_DATA;
- rc = ksocknal_getconnsock (conn);
+ rc = ksocknal_connsock_addref(conn);
if (rc != 0) {
LASSERT (conn->ksnc_closing);
*txmem = *rxmem = *nagle = 0;
- rc = -ESHUTDOWN;
- goto out;
- }
- len = sizeof(*txmem);
- bzero(&sopt, sizeof sopt);
- sopt.sopt_dir = SOPT_GET;
- sopt.sopt_level = SOL_SOCKET;
- sopt.sopt_name = SO_SNDBUF;
- sopt.sopt_val = txmem;
- sopt.sopt_valsize = len;
-
- CFS_NET_IN;
- rc = sogetopt(sock, &sopt);
- if (rc == 0) {
- len = sizeof(*rxmem);
- sopt.sopt_name = SO_RCVBUF;
- sopt.sopt_val = rxmem;
- rc = sogetopt(sock, &sopt);
+ return -ESHUTDOWN;
}
+ rc = libcfs_sock_getbuf(sock, txmem, rxmem);
if (rc == 0) {
+ struct sockopt sopt;
+ int len;
+ CFS_DECL_NET_DATA;
+
len = sizeof(*nagle);
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_GET;
sopt.sopt_level = IPPROTO_TCP;
sopt.sopt_name = TCP_NODELAY;
sopt.sopt_val = nagle;
- rc = sogetopt(sock, &sopt);
+ sopt.sopt_valsize = len;
+
+ CFS_NET_IN;
+ rc = -sogetopt(sock, &sopt);
+ CFS_NET_EX;
}
- CFS_NET_EX;
- ksocknal_putconnsock (conn);
+ ksocknal_connsock_decref(conn);
if (rc == 0)
*nagle = !*nagle;
else
*txmem = *rxmem = *nagle = 0;
-out:
- return (-rc);
+ return (rc);
}
int
struct linger linger;
CFS_DECL_NET_DATA;
+ rc = libcfs_sock_setbuf(so,
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size);
+ if (rc != 0) {
+ CERROR ("Can't set buffer tx %d, rx %d buffers: %d\n",
+ *ksocknal_tunables.ksnd_tx_buffer_size,
+ *ksocknal_tunables.ksnd_rx_buffer_size, rc);
+ return (rc);
+ }
+
/* Ensure this socket aborts active sends immediately when we close
* it. */
-
bzero(&sopt, sizeof sopt);
linger.l_onoff = 0;
sopt.sopt_valsize = sizeof(linger);
CFS_NET_IN;
- rc = sosetopt(so, &sopt);
+ rc = -sosetopt(so, &sopt);
if (rc != 0) {
CERROR ("Can't set SO_LINGER: %d\n", rc);
goto out;
}
-
- if (!ksocknal_tunables.ksnd_nagle) {
+ if (!*ksocknal_tunables.ksnd_nagle) {
option = 1;
bzero(&sopt, sizeof sopt);
sopt.sopt_dir = SOPT_SET;
sopt.sopt_name = TCP_NODELAY;
sopt.sopt_val = &option;
sopt.sopt_valsize = sizeof(option);
- rc = sosetopt(so, &sopt);
+ rc = -sosetopt(so, &sopt);
if (rc != 0) {
CERROR ("Can't disable nagle: %d\n", rc);
goto out;
}
}
- if (ksocknal_tunables.ksnd_buffer_size > 0) {
- option = ksocknal_tunables.ksnd_buffer_size;
- if (option > ksocknal_mbuf_size)
- option = ksocknal_mbuf_size;
-
- sopt.sopt_dir = SOPT_SET;
- sopt.sopt_level = SOL_SOCKET;
- sopt.sopt_name = SO_SNDBUF;
- sopt.sopt_val = &option;
- sopt.sopt_valsize = sizeof(option);
- rc = sosetopt(so, &sopt);
- if (rc != 0) {
- CERROR ("Can't set send buffer %d: %d\n",
- option, rc);
- goto out;
- }
-
- sopt.sopt_name = SO_RCVBUF;
- rc = sosetopt(so, &sopt);
- if (rc != 0) {
- CERROR ("Can't set receive buffer %d: %d\n",
- option, rc);
- goto out;
- }
- }
+
/* snapshot tunables */
- keep_idle = ksocknal_tunables.ksnd_keepalive_idle;
- keep_count = ksocknal_tunables.ksnd_keepalive_count;
- keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+ keep_idle = *ksocknal_tunables.ksnd_keepalive_idle;
+ keep_count = *ksocknal_tunables.ksnd_keepalive_count;
+ keep_intvl = *ksocknal_tunables.ksnd_keepalive_intvl;
do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
option = (do_keepalive ? 1 : 0);
sopt.sopt_name = SO_KEEPALIVE;
sopt.sopt_val = &option;
sopt.sopt_valsize = sizeof(option);
- rc = sosetopt(so, &sopt);
+ rc = -sosetopt(so, &sopt);
if (rc != 0) {
CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
goto out;
sopt.sopt_name = TCP_KEEPALIVE;
sopt.sopt_val = &keep_idle;
sopt.sopt_valsize = sizeof(keep_idle);
- rc = sosetopt(so, &sopt);
+ rc = -sosetopt(so, &sopt);
if (rc != 0) {
CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc);
goto out;
}
out:
CFS_NET_EX;
- return (-rc);
-}
-
-int
-ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry,
- ksock_route_t *route, int local_port)
-{
- struct sockaddr_in locaddr;
- struct sockaddr_in srvaddr;
- struct timeval tv;
- int fd;
- struct socket *so;
- struct sockopt sopt;
- int option;
- int rc;
- int s;
- CFS_DECL_FUNNEL_DATA;
-
- ENTRY;
- bzero (&locaddr, sizeof (locaddr));
- locaddr.sin_len = sizeof(struct sockaddr_in);
- locaddr.sin_family = AF_INET;
- locaddr.sin_port = htons (local_port);
- locaddr.sin_addr.s_addr =
- (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
- : INADDR_ANY;
- bzero(&srvaddr, sizeof(srvaddr));
- srvaddr.sin_len = sizeof(struct sockaddr_in);
- srvaddr.sin_family = AF_INET;
- srvaddr.sin_port = htons (route->ksnr_port);
- srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
-
- *may_retry = 0;
-
- CFS_NET_IN;
- rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
- CFS_NET_EX;
- *sockp = so;
- if (rc != 0) {
- CERROR ("Can't create autoconnect socket: %d\n", rc);
- return (-rc);
- }
-
- /*
- * XXX
- * Liang: what do we need here?
- */
- fd = sock_map_fd (so);
- if (fd < 0) {
- sock_release (so);
- CERROR ("sock_map_fd error %d\n", fd);
- return (fd);
- }
- sock_fdrelse(fd);
-
- /* Set the socket timeouts, so our connection attempt completes in
- * finite time */
- tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
- tv.tv_usec = 0;
- bzero(&sopt, sizeof sopt);
- sopt.sopt_dir = SOPT_SET;
- sopt.sopt_level = SOL_SOCKET;
- sopt.sopt_name = SO_SNDTIMEO;
- sopt.sopt_val = &tv;
- sopt.sopt_valsize = sizeof(tv);
-
- CFS_NET_IN;
- rc = sosetopt(so, &sopt);
- if (rc != 0) {
- CFS_NET_EX;
- CERROR ("Can't set send timeout %d: %d\n",
- ksocknal_tunables.ksnd_io_timeout, rc);
- goto out;
- }
- sopt.sopt_level = SOL_SOCKET;
- sopt.sopt_name = SO_RCVTIMEO;
- rc = sosetopt(so, &sopt);
- if (rc != 0) {
- CFS_NET_EX;
- CERROR ("Can't set receive timeout %d: %d\n",
- ksocknal_tunables.ksnd_io_timeout, rc);
- goto out;
- }
- option = 1;
- sopt.sopt_level = SOL_SOCKET;
- sopt.sopt_name = SO_REUSEADDR;
- sopt.sopt_val = &option;
- sopt.sopt_valsize = sizeof(option);
- rc = sosetopt(so, &sopt);
- if (rc != 0) {
- CFS_NET_EX;
- CERROR ("Can't set sock reuse address: %d\n", rc);
- goto out;
- }
- rc = sobind(so, (struct sockaddr *)&locaddr);
- if (rc == EADDRINUSE) {
- CFS_NET_EX;
- CDEBUG(D_NET, "Port %d already in use\n", local_port);
- *may_retry = 1;
- goto out;
- }
- if (rc != 0) {
- CFS_NET_EX;
- CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n",
- HIPQUAD(route->ksnr_myipaddr), rc);
- goto out;
- }
- rc = soconnect(so, (struct sockaddr *)&srvaddr);
- *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE);
- if (rc != 0) {
- CFS_NET_EX;
- if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
- CERROR ("Can't connect to nid "LPX64
- " local IP: %u.%u.%u.%u,"
- " remote IP: %u.%u.%u.%u/%d: %d\n",
- route->ksnr_peer->ksnp_nid,
- HIPQUAD(route->ksnr_myipaddr),
- HIPQUAD(route->ksnr_ipaddr),
- route->ksnr_port, rc);
- goto out;
- }
-
- s = splnet();
- while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
- CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
- (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
- }
- LASSERT((so->so_state & SS_ISCONNECTED));
- splx(s);
- CFS_NET_EX;
-
- rc = so->so_error;
- if (rc != 0) {
- CERROR ("Error %d waiting for connection to nid "LPX64
- " local IP: %u.%u.%u.%u,"
- " remote IP: %u.%u.%u.%u/%d: %d\n", rc,
- route->ksnr_peer->ksnp_nid,
- HIPQUAD(route->ksnr_myipaddr),
- HIPQUAD(route->ksnr_ipaddr),
- route->ksnr_port, rc);
- goto out;
- }
- return (-rc);
-
- out:
- rele_file(KSN_SOCK2FILE(so));
-
- return (-rc);
+ return (rc);
}
void
int rc;
CFS_DECL_NET_DATA;
- rc = ksocknal_getconnsock (conn);
+ rc = ksocknal_connsock_addref(conn);
if (rc != 0) /* being shut down */
return;
sock = conn->ksnc_sock;
sosetopt(sock, &sopt);
CFS_NET_EX;
- ksocknal_putconnsock (conn);
+ ksocknal_connsock_decref(conn);
return;
}
+
extern void ksocknal_read_callback (ksock_conn_t *conn);
extern void ksocknal_write_callback (ksock_conn_t *conn);
static void
ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
{
- ksock_conn_t *conn;
- CFS_DECL_NET_DATA;
+ ksock_conn_t *conn = (ksock_conn_t *)arg;
ENTRY;
read_lock (&ksocknal_data.ksnd_global_lock);
- conn = so->reserved3;
-
- if (conn == NULL){
- /* More processing is needed? */
+ if (conn == NULL)
goto out;
- }
- if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg ) {
+
+ if (so->so_rcv.sb_flags & SB_UPCALL) {
extern int soreadable(struct socket *so);
- CFS_NET_IN;
- if (conn->ksnc_rx_nob_wanted && soreadable(so)){
+ if (conn->ksnc_rx_nob_wanted && soreadable(so))
/* To verify whether the upcall is for receive */
- CFS_NET_EX;
ksocknal_read_callback (conn);
- }else
- CFS_NET_EX;
}
/* go foward? */
- if ((so->so_snd.sb_flags & SB_UPCALL) || !arg){
+ if (so->so_snd.sb_flags & SB_UPCALL){
extern int sowriteable(struct socket *so);
- CFS_NET_IN;
- if (sowriteable(so)){
+ if (sowriteable(so))
/* socket is writable */
- CFS_NET_EX;
ksocknal_write_callback(conn);
- } else
- CFS_NET_EX;
}
out:
read_unlock (&ksocknal_data.ksnd_global_lock);
CFS_DECL_NET_DATA;
CFS_NET_IN;
- sock->so_upcallarg = (void *)sock; /* anything not NULL */
+ sock->so_upcallarg = (void *)conn;
sock->so_upcall = ksocknal_upcall;
sock->so_snd.sb_timeo = 0;
- sock->so_rcv.sb_timeo = 2 * HZ;
+ sock->so_rcv.sb_timeo = cfs_time_seconds(2);
sock->so_rcv.sb_flags |= SB_UPCALL;
sock->so_snd.sb_flags |= SB_UPCALL;
- sock->reserved3 = conn;
CFS_NET_EX;
return;
}
void
-ksocknal_lib_act_callback(struct socket *sock)
+ksocknal_lib_act_callback(struct socket *sock, ksock_conn_t *conn)
{
- /* upcall will take the network funnel */
- ksocknal_upcall (sock, 0, 0);
+ CFS_DECL_NET_DATA;
+
+ CFS_NET_IN;
+ ksocknal_upcall (sock, (void *)conn, 0);
+ CFS_NET_EX;
}
void
CFS_DECL_NET_DATA;
CFS_NET_IN;
- sock->so_upcall = NULL;
- sock->so_upcallarg = NULL;
sock->so_rcv.sb_flags &= ~SB_UPCALL;
sock->so_snd.sb_flags &= ~SB_UPCALL;
+ sock->so_upcall = NULL;
+ sock->so_upcallarg = NULL;
CFS_NET_EX;
}
-
+#endif /* !__DARWIN8__ */