--- /dev/null
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Darwin porting library
+ * Make things easy to port
+ */
+#include <mach/mach_types.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/file.h>
+
+#include "socknal.h"
+
+/* Debug toggle: flip "#if 0" to "#if 1" to force the single-fragment
+ * send/receive code paths below (normally compiled out). */
+#if 0
+#undef SOCKNAL_SINGLE_FRAG_TX
+#define SOCKNAL_SINGLE_FRAG_TX 1
+#undef SOCKNAL_SINGLE_FRAG_RX
+#define SOCKNAL_SINGLE_FRAG_RX 1
+#endif
+
+/* Parent "portals" sysctl node is declared elsewhere; we attach a
+ * "ksocknal" subtree to it and export the tunables below. */
+SYSCTL_DECL(_portals);
+
+SYSCTL_NODE (_portals, OID_AUTO, ksocknal, CTLFLAG_RW,
+ 0, "ksocknal_sysctl");
+
+/* Read/write integer tunables, each backed directly by a field of the
+ * global ksocknal_tunables structure. */
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, timeout,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_io_timeout,
+ 0, "timeout");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, eager_ack,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_eager_ack,
+ 0, "eager_ack");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, typed,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_typed_conns,
+ 0, "typed");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, min_bulk,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_min_bulk,
+ 0, "min_bulk");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, buffer_size,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_buffer_size,
+ 0, "buffer_size");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, nagle,
+ CTLTYPE_INT | CTLFLAG_RW , &ksocknal_tunables.ksnd_nagle,
+ 0, "nagle");
+
+/* NULL-terminated list of the sysctl entries above, handed to the
+ * libcfs sysctl registration helper at module load. */
+cfs_sysctl_table_t ksocknal_top_ctl_table [] = {
+ &sysctl__portals_ksocknal,
+ &sysctl__portals_ksocknal_timeout,
+ &sysctl__portals_ksocknal_eager_ack,
+ &sysctl__portals_ksocknal_typed,
+ &sysctl__portals_ksocknal_min_bulk,
+ &sysctl__portals_ksocknal_buffer_size,
+ &sysctl__portals_ksocknal_nagle,
+ NULL
+};
+
+/* Upper bound for socket buffer sizing, derived from the BSD kernel's
+ * sb_max formula (SB_MAX scaled by mbuf cluster overhead). */
+static unsigned long ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
+
+/* Translate a user file descriptor into its struct socket.
+ * Takes a reference on the backing struct file (fref) and stashes the
+ * file pointer in so->reserved4 so sock_release() can drop it later.
+ * NOTE(review): getsock()'s return value is ignored; an invalid fd
+ * would leave fp undefined -- confirm callers only pass valid fds. */
+struct socket *
+sockfd_lookup(int fd, void *foo)
+{
+ struct socket *so;
+ struct file *fp;
+ CFS_DECL_FUNNEL_DATA;
+
+ CFS_NET_IN;
+ getsock(current_proc()->p_fd, fd, &fp);
+ CFS_NET_EX;
+ so = (struct socket *)fp->f_data;
+ /* cross-link file and socket; sock_release() undoes this */
+ so->reserved4 = fp;
+ CFS_CONE_IN;
+ fref(fp);
+ CFS_CONE_EX;
+ return so;
+}
+
+extern struct fileops socketops;
+
+/* Allocate a file descriptor for an in-kernel socket (inverse of
+ * sockfd_lookup()) and return the new fd.
+ * NOTE(review): falloc()'s return value is ignored; on failure fp and
+ * fd would be undefined -- verify allocation cannot fail here. */
+static int
+sock_map_fd (struct socket *so)
+{
+ struct file *fp;
+ int fd;
+ CFS_DECL_FUNNEL_DATA;
+
+ CFS_CONE_IN;
+ falloc(current_proc(), &fp, &fd);
+ fp->f_flag = FREAD|FWRITE;
+ fp->f_type = DTYPE_SOCKET;
+ fp->f_ops = &socketops;
+ fp->f_data = (caddr_t)so;
+ /* remember the file so sock_release() can frele() it */
+ so->reserved4 = fp;
+ /* clear UF_RESERVED so the descriptor becomes usable */
+ *fdflags(current_proc(), fd) &= ~UF_RESERVED;
+ CFS_CONE_EX;
+
+ return fd;
+}
+
+/* Drop the file reference taken by sockfd_lookup()/sock_map_fd() and
+ * shut the socket down.  The struct file and socket are detached from
+ * each other (both cross-links cleared) before frele(). */
+static void
+sock_release(struct socket *so)
+{
+ struct file *fp;
+ CFS_DECL_FUNNEL_DATA;
+
+ fp = (struct file *)so->reserved4;
+ so->reserved4 = NULL;
+ fp->f_data = NULL;
+ CFS_CONE_IN;
+ frele(fp);
+ CFS_CONE_EX;
+ CFS_NET_IN;
+ /* how == 0: shut down the receive side */
+ soshutdown(so, 0);
+ CFS_NET_EX;
+}
+
+/* Return a descriptor slot to the process fd table.  The underlying
+ * struct file reference is kept (see sock_map_fd()); only the fd
+ * number is released. */
+static void
+sock_fdrelse(int fd)
+{
+ CFS_DECL_FUNNEL_DATA;
+
+ CFS_CONE_IN;
+ fdrelse(current_proc(), fd);
+ CFS_CONE_EX;
+}
+
+/* No-op on Darwin: binding socket interrupts to a CPU is a
+ * Linux-specific optimization with no OSX equivalent. */
+void
+ksocknal_lib_bind_irq (unsigned int irq)
+{
+ return;
+}
+
+/* Always returns 0: per-socket IRQ discovery is not supported on
+ * Darwin, so callers see "no IRQ affinity information". */
+unsigned int
+ksocknal_lib_sock_irq (struct socket *sock)
+{
+ return 0;
+}
+
+/* Fill in the connection's peer IPv4 address/port and local IPv4
+ * address by querying the protocol's peeraddr/sockaddr usrreqs.
+ * Returns 0 on success or the (positive) socket-layer error,
+ * matching the prior return convention.
+ *
+ * Fixes over the previous version:
+ *  - 'sa' starts NULL so an early pru_peeraddr failure can never
+ *    FREE an uninitialized pointer;
+ *  - 'sin' is re-pointed at the sockaddr returned by pru_sockaddr
+ *    before the local address is read (it previously still referenced
+ *    the peer sockaddr that had already been freed -- use-after-free);
+ *  - the local sockaddr is freed on the success path (it leaked). */
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+ struct sockaddr_in *sin;
+ struct sockaddr *sa = NULL;
+ int rc;
+ CFS_DECL_NET_DATA;
+
+ CFS_NET_IN;
+ rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa);
+ LASSERT (!conn->ksnc_closing);
+ if (rc != 0) {
+ CFS_NET_EX;
+ if (sa) FREE(sa, M_SONAME);
+ CERROR ("Error %d getting sock peer IP\n", rc);
+ return rc;
+ }
+ sin = (struct sockaddr_in *)sa;
+ conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr);
+ conn->ksnc_port = ntohs (sin->sin_port);
+ if (sa) {
+ FREE(sa, M_SONAME);
+ sa = NULL;
+ }
+ rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa);
+ CFS_NET_EX;
+ if (rc != 0) {
+ if (sa) FREE(sa, M_SONAME);
+ CERROR ("Error %d getting sock local IP\n", rc);
+ return rc;
+ }
+ /* re-point at the freshly returned local sockaddr before reading */
+ sin = (struct sockaddr_in *)sa;
+ conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr);
+ if (sa) FREE(sa, M_SONAME);
+
+ return 0;
+}
+
+/* Send the tx's plain-iovec fragments on the connection's socket
+ * without blocking (MSG_DONTWAIT).  Returns the number of bytes
+ * actually sent (possibly short), or a negative errno; -EAGAIN means
+ * nothing could be sent without blocking. */
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+#if SOCKNAL_SINGLE_FRAG_TX
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+ int niov = tx->tx_niov;
+#endif
+ struct socket *sock = conn->ksnc_sock;
+ int nob;
+ int rc;
+ int i;
+ struct uio suio = {
+ .uio_iov = scratchiov,
+ .uio_iovcnt = niov,
+ .uio_offset = 0,
+ .uio_resid = 0, /* This will be valued after a while */
+ .uio_segflg = UIO_SYSSPACE,
+ .uio_rw = UIO_WRITE,
+ .uio_procp = NULL
+ };
+ int flags = MSG_DONTWAIT;
+ CFS_DECL_NET_DATA;
+
+ /* copy the fragments into the scratch iovec and total the bytes */
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = tx->tx_iov[i];
+ nob += scratchiov[i].iov_len;
+ }
+ suio.uio_resid = nob;
+
+ CFS_NET_IN;
+ rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, flags);
+ CFS_NET_EX;
+
+ /* NB there is no return value can indicate how many
+ * have been sent and how many resid, we have to get
+ * sent bytes from suio. */
+ if (rc != 0) {
+ if (suio.uio_resid != nob &&\
+ (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+ /* We have sent something */
+ rc = nob - suio.uio_resid;
+ else if ( rc == EWOULDBLOCK )
+ /* Actually, EAGAIN and EWOULDBLOCK have same value in OSX */
+ rc = -EAGAIN;
+ else
+ rc = -rc;
+ } else /* rc == 0 */
+ rc = nob - suio.uio_resid;
+
+ return rc;
+}
+
+/* Send the tx's page-based (kiov) fragments on the connection's
+ * socket without blocking.  Pages are kmapped for the duration of the
+ * sosend() and unmapped afterwards.  Returns bytes sent (possibly
+ * short), or a negative errno (-EAGAIN if the socket would block). */
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+ int niov = tx->tx_nkiov;
+#endif
+ struct socket *sock = conn->ksnc_sock;
+ ptl_kiov_t *kiov = tx->tx_kiov;
+ int nob;
+ int rc;
+ int i;
+ struct uio suio = {
+ .uio_iov = scratchiov,
+ .uio_iovcnt = niov,
+ .uio_offset = 0,
+ .uio_resid = 0, /* It should be valued after a while */
+ .uio_segflg = UIO_SYSSPACE,
+ .uio_rw = UIO_WRITE,
+ .uio_procp = NULL
+ };
+ int flags = MSG_DONTWAIT;
+ CFS_DECL_NET_DATA;
+
+ /* map each page and build a plain iovec over the mapped ranges */
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
+ kiov[i].kiov_offset;
+ nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+ }
+ suio.uio_resid = nob;
+
+ CFS_NET_IN;
+ rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, flags);
+ CFS_NET_EX;
+
+ /* unmap unconditionally, whether the send succeeded or not */
+ for (i = 0; i < niov; i++)
+ cfs_kunmap(kiov[i].kiov_page);
+
+ if (rc != 0) {
+ if (suio.uio_resid != nob &&\
+ (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+ /* We have sent something */
+ rc = nob - suio.uio_resid;
+ else if ( rc == EWOULDBLOCK )
+ /* EAGAIN and EWOULD BLOCK have same value in OSX */
+ rc = -EAGAIN;
+ else
+ rc = -rc;
+ } else /* rc == 0 */
+ rc = nob - suio.uio_resid;
+
+ return rc;
+}
+
+/*
+ * liang: Hack of inpcb and tcpcb.
+ * To get tcpcb of a socket, and call tcp_output
+ * to send quick ack.
+ *
+ * These are partial shadow copies of the kernel's private inpcb/tcpcb
+ * layouts: only the leading fields up to the ones we touch (t_flags,
+ * inp_ppcb) are declared, so the offsets must stay in sync with the
+ * kernel's own definitions.
+ */
+struct ks_tseg_qent{
+ int foo;
+};
+
+struct ks_tcptemp{
+ int foo;
+};
+
+LIST_HEAD(ks_tsegqe_head, ks_tseg_qent);
+
+struct ks_tcpcb {
+ struct ks_tsegqe_head t_segq;
+ int t_dupacks;
+ struct ks_tcptemp *unused;
+ int t_timer[4];
+ struct inpcb *t_inpcb;
+ int t_state;
+ u_int t_flags;
+ /*
+ * There are more fields but we dont need
+ * ......
+ */
+};
+
+/* tcpcb t_flags bits we care about (mirroring the kernel's values) */
+#define TF_ACKNOW 0x00001
+#define TF_DELACK 0x00002
+
+struct ks_inpcb {
+ LIST_ENTRY(ks_inpcb) inp_hash;
+ struct in_addr reserved1;
+ struct in_addr reserved2;
+ u_short inp_fport;
+ u_short inp_lport;
+ LIST_ENTRY(inpcb) inp_list;
+ caddr_t inp_ppcb;
+ /*
+ * There are more fields but we dont need
+ * ......
+ */
+};
+
+/* Navigate socket -> inpcb -> tcpcb through the shadow structs.
+ * Fix: ks_sototcpcb previously expanded to the kernel's private
+ * intotcpcb()/sotoinpcb() macros, which are not visible here -- it
+ * would have failed to compile if ever used. */
+#define ks_sotoinpcb(so) ((struct ks_inpcb *)(so)->so_pcb)
+#define ks_intotcpcb(ip) ((struct ks_tcpcb *)(ip)->inp_ppcb)
+#define ks_sototcpcb(so) (ks_intotcpcb(ks_sotoinpcb(so)))
+
+/* Force an immediate ACK on the connection (OSX has no TCP_QUICKACK):
+ * dig the tcpcb out of the socket via the shadow structs above, and
+ * if a delayed ACK is pending, convert it to ACKNOW and call
+ * tcp_output() directly. */
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+ struct socket *sock = conn->ksnc_sock;
+ struct ks_inpcb *inp = ks_sotoinpcb(sock);
+ struct ks_tcpcb *tp = ks_intotcpcb(inp);
+ int s;
+ CFS_DECL_NET_DATA;
+
+ extern int tcp_output(register struct ks_tcpcb *tp);
+
+ CFS_NET_IN;
+ s = splnet();
+
+ /* only act when a delayed ACK is actually pending */
+ if (tp && tp->t_flags & TF_DELACK){
+ tp->t_flags &= ~TF_DELACK;
+ tp->t_flags |= TF_ACKNOW;
+ (void) tcp_output(tp);
+ }
+ splx(s);
+
+ /*
+ * No TCP_QUICKACK supported in BSD, so I have to call tcp_fasttimo
+ * to send immediate ACK. It's not the best resolution because
+ * tcp_fasttimo will send out ACK for all delayed-ack tcp socket.
+ * Anyway, it's working now.
+ * extern void tcp_fasttimo();
+ * tcp_fasttimo();
+ */
+ CFS_NET_EX;
+
+ return;
+}
+
+/* Receive into the connection's plain-iovec fragments without
+ * blocking.  Returns bytes received (possibly short), or a negative
+ * errno; -EAGAIN means no data was available. */
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+ int niov = conn->ksnc_rx_niov;
+#endif
+ struct iovec *iov = conn->ksnc_rx_iov;
+ int nob;
+ int rc;
+ int i;
+ struct uio ruio = {
+ .uio_iov = scratchiov,
+ .uio_iovcnt = niov,
+ .uio_offset = 0,
+ .uio_resid = 0, /* It should be valued after a while */
+ .uio_segflg = UIO_SYSSPACE,
+ .uio_rw = UIO_READ,
+ .uio_procp = NULL
+ };
+ int flags = MSG_DONTWAIT;
+ CFS_DECL_NET_DATA;
+
+ /* copy the rx fragments into the scratch iovec and total the bytes */
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i] = iov[i];
+ nob += scratchiov[i].iov_len;
+ }
+ LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+ ruio.uio_resid = nob;
+
+ CFS_NET_IN;
+ rc = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, &flags);
+ CFS_NET_EX;
+ if (rc){
+ if (ruio.uio_resid != nob && \
+ (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK || rc == EAGAIN))
+ /* data particially received */
+ rc = nob - ruio.uio_resid;
+ else if (rc == EWOULDBLOCK)
+ /* EAGAIN and EWOULD BLOCK have same value in OSX */
+ rc = -EAGAIN;
+ else
+ rc = -rc;
+ } else
+ rc = nob - ruio.uio_resid;
+
+ return (rc);
+}
+
+/* Receive into the connection's page-based (kiov) fragments without
+ * blocking.  Pages are kmapped around the soreceive() and unmapped
+ * afterwards.  Returns bytes received (possibly short), or a negative
+ * errno (-EAGAIN if no data was available). */
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+ struct iovec scratch;
+ struct iovec *scratchiov = &scratch;
+ int niov = 1;
+#else
+ struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+ int niov = conn->ksnc_rx_nkiov;
+#endif
+ ptl_kiov_t *kiov = conn->ksnc_rx_kiov;
+ int nob;
+ int rc;
+ int i;
+ struct uio ruio = {
+ .uio_iov = scratchiov,
+ .uio_iovcnt = niov,
+ .uio_offset = 0,
+ .uio_resid = 0,
+ .uio_segflg = UIO_SYSSPACE,
+ .uio_rw = UIO_READ,
+ .uio_procp = NULL
+ };
+ int flags = MSG_DONTWAIT;
+ CFS_DECL_NET_DATA;
+
+ /* map each page and build a plain iovec over the mapped ranges */
+ for (nob = i = 0; i < niov; i++) {
+ scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+ nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+ }
+ LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+ ruio.uio_resid = nob;
+
+ CFS_NET_IN;
+ rc = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, NULL, &flags);
+ CFS_NET_EX;
+
+ /* unmap unconditionally, whether the receive succeeded or not */
+ for (i = 0; i < niov; i++)
+ cfs_kunmap(kiov[i].kiov_page);
+
+ if (rc){
+ if (ruio.uio_resid != nob && \
+ (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+ /* data particially received */
+ rc = nob - ruio.uio_resid;
+ else if (rc == EWOULDBLOCK)
+ /* receive blocked, EWOULDBLOCK == EAGAIN */
+ rc = -EAGAIN;
+ else
+ rc = -rc;
+ } else
+ rc = nob - ruio.uio_resid;
+
+ return (rc);
+}
+
+/* Write exactly 'nob' bytes to the socket, looping on partial sends.
+ * A retryable error (ERESTART/EINTR/EWOULDBLOCK) after partial
+ * progress advances the buffer and retries; a retryable error with no
+ * progress, or any other error, returns -errno.  Returns 0 once all
+ * bytes are written (NB: not the byte count). */
+int
+ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
+{
+ int rc;
+ CFS_DECL_NET_DATA;
+
+ while (nob > 0) {
+ struct iovec iov = {
+ .iov_base = buffer,
+ .iov_len = nob
+ };
+ struct uio suio = {
+ .uio_iov = &iov,
+ .uio_iovcnt = 1,
+ .uio_offset = 0,
+ .uio_resid = nob,
+ .uio_segflg = UIO_SYSSPACE,
+ .uio_rw = UIO_WRITE,
+ .uio_procp = NULL
+ };
+
+ CFS_NET_IN;
+ rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
+ CFS_NET_EX;
+
+ if (rc != 0) {
+ /* forgive a retryable error that made some progress */
+ if ( suio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
+ rc == EWOULDBLOCK))
+ rc = 0;
+ if ( rc != 0 )
+ return -rc;
+ /* advance past what was sent and retry the remainder */
+ rc = nob - suio.uio_resid;
+ buffer = ((char *)buffer) + rc;
+ nob = suio.uio_resid;
+ continue;
+ }
+ break;
+ }
+
+ return (0);
+}
+
+/* Read exactly 'nob' bytes from the socket, looping on partial reads.
+ * Mirrors ksocknal_lib_sock_write(): retryable errors with progress
+ * advance and retry; anything else returns -errno.  Returns 0 once
+ * all bytes are read. */
+int
+ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
+{
+ int rc;
+ CFS_DECL_NET_DATA;
+
+ while (nob > 0) {
+ struct iovec iov = {
+ .iov_base = buffer,
+ .iov_len = nob
+ };
+ struct uio ruio = {
+ .uio_iov = &iov,
+ .uio_iovcnt = 1,
+ .uio_offset = 0,
+ .uio_resid = nob,
+ .uio_segflg = UIO_SYSSPACE,
+ .uio_rw = UIO_READ,
+ .uio_procp = NULL
+ };
+
+ CFS_NET_IN;
+ rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
+ CFS_NET_EX;
+
+ if (rc != 0) {
+ /* forgive a retryable error that made some progress */
+ if ( ruio.uio_resid != nob && ( rc == ERESTART || rc == EINTR ||\
+ rc == EWOULDBLOCK))
+ rc = 0;
+ if (rc != 0)
+ return -rc;
+ /* advance past what was read and retry the remainder */
+ rc = nob - ruio.uio_resid;
+ buffer = ((char *)buffer) + rc;
+ nob = ruio.uio_resid;
+ continue;
+ }
+ break;
+ }
+
+ return (0);
+}
+
+/* Read back the connection's socket parameters: send/receive buffer
+ * sizes and whether Nagle is enabled (*nagle is the inverse of
+ * TCP_NODELAY).  Returns 0 on success or a negative errno; all
+ * outputs are zeroed on failure.
+ * Fix: the connection-closing path set rc = -ESHUTDOWN and then hit
+ * "return (-rc)", yielding +ESHUTDOWN and breaking the negative-errno
+ * convention of every other return path; rc is now kept positive so
+ * the common negation applies uniformly.  sopt_valsize is also set
+ * for each query instead of only the first. */
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+ struct sockopt sopt;
+ struct socket *sock = conn->ksnc_sock;
+ int len;
+ int rc;
+ CFS_DECL_NET_DATA;
+
+ rc = ksocknal_getconnsock (conn);
+ if (rc != 0) {
+ LASSERT (conn->ksnc_closing);
+ *txmem = *rxmem = *nagle = 0;
+ rc = ESHUTDOWN; /* negated to -ESHUTDOWN at "out" */
+ goto out;
+ }
+ len = sizeof(*txmem);
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_GET;
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_SNDBUF;
+ sopt.sopt_val = txmem;
+ sopt.sopt_valsize = len;
+
+ CFS_NET_IN;
+ rc = sogetopt(sock, &sopt);
+ if (rc == 0) {
+ len = sizeof(*rxmem);
+ sopt.sopt_name = SO_RCVBUF;
+ sopt.sopt_val = rxmem;
+ sopt.sopt_valsize = len;
+ rc = sogetopt(sock, &sopt);
+ }
+ if (rc == 0) {
+ len = sizeof(*nagle);
+ sopt.sopt_level = IPPROTO_TCP;
+ sopt.sopt_name = TCP_NODELAY;
+ sopt.sopt_val = nagle;
+ sopt.sopt_valsize = len;
+ rc = sogetopt(sock, &sopt);
+ }
+ CFS_NET_EX;
+
+ ksocknal_putconnsock (conn);
+
+ if (rc == 0)
+ /* report "nagle enabled", not the TCP_NODELAY flag itself */
+ *nagle = !*nagle;
+ else
+ *txmem = *rxmem = *nagle = 0;
+out:
+ return (-rc);
+}
+
+/* Configure a freshly created/accepted socket according to the module
+ * tunables: disable lingering on close, optionally disable Nagle, cap
+ * the buffer sizes, and enable TCP keepalive when all keepalive
+ * tunables are positive.  Returns 0 or a negative errno.
+ * NB: the network funnel is held across all sosetopt() calls and
+ * released once at "out". */
+int
+ksocknal_lib_setup_sock (struct socket *so)
+{
+ struct sockopt sopt;
+ int rc;
+ int option;
+ int keep_idle;
+ int keep_intvl;
+ int keep_count;
+ int do_keepalive;
+ struct linger linger;
+ CFS_DECL_NET_DATA;
+
+ /* Ensure this socket aborts active sends immediately when we close
+ * it. */
+
+ bzero(&sopt, sizeof sopt);
+
+ linger.l_onoff = 0;
+ linger.l_linger = 0;
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_LINGER;
+ sopt.sopt_val = &linger;
+ sopt.sopt_valsize = sizeof(linger);
+
+ CFS_NET_IN;
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CERROR ("Can't set SO_LINGER: %d\n", rc);
+ goto out;
+ }
+
+
+ /* TCP_NODELAY: turn Nagle off unless the tunable asks for it */
+ if (!ksocknal_tunables.ksnd_nagle) {
+ option = 1;
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = IPPROTO_TCP;
+ sopt.sopt_name = TCP_NODELAY;
+ sopt.sopt_val = &option;
+ sopt.sopt_valsize = sizeof(option);
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CERROR ("Can't disable nagle: %d\n", rc);
+ goto out;
+ }
+ }
+ /* buffer sizes: clamp the tunable to the mbuf-derived maximum */
+ if (ksocknal_tunables.ksnd_buffer_size > 0) {
+ option = ksocknal_tunables.ksnd_buffer_size;
+ if (option > ksocknal_mbuf_size)
+ option = ksocknal_mbuf_size;
+
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_SNDBUF;
+ sopt.sopt_val = &option;
+ sopt.sopt_valsize = sizeof(option);
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CERROR ("Can't set send buffer %d: %d\n",
+ option, rc);
+ goto out;
+ }
+
+ sopt.sopt_name = SO_RCVBUF;
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CERROR ("Can't set receive buffer %d: %d\n",
+ option, rc);
+ goto out;
+ }
+ }
+ /* snapshot tunables */
+ keep_idle = ksocknal_tunables.ksnd_keepalive_idle;
+ keep_count = ksocknal_tunables.ksnd_keepalive_count;
+ keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+
+ /* keepalive only when every tunable is positive */
+ do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+ option = (do_keepalive ? 1 : 0);
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_KEEPALIVE;
+ sopt.sopt_val = &option;
+ sopt.sopt_valsize = sizeof(option);
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+ goto out;
+ }
+
+ if (!do_keepalive) {
+ /* no more setting, just return */
+ rc = 0;
+ goto out;
+ }
+
+ /* Darwin exposes only the idle time (TCP_KEEPALIVE); keep_count
+ * and keep_intvl have no per-socket knobs here */
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = IPPROTO_TCP;
+ sopt.sopt_name = TCP_KEEPALIVE;
+ sopt.sopt_val = &keep_idle;
+ sopt.sopt_valsize = sizeof(keep_idle);
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc);
+ goto out;
+ }
+out:
+ CFS_NET_EX;
+ return (-rc);
+}
+
+/* Create a TCP socket bound to 'local_port' (and the route's local IP
+ * if set), connect it to the route's peer, and wait for the handshake
+ * to finish.  *sockp receives the socket; *may_retry is set when the
+ * failure is a transient address conflict (EADDRINUSE/EADDRNOTAVAIL)
+ * so the caller can retry with another port.  Returns 0 or a negative
+ * errno.  On failure the file reference is dropped at "out"; on
+ * success it is kept until sock_release().
+ * Fix: the post-connect LASSERT used to demand SS_ISCONNECTED
+ * unconditionally, but the wait loop also exits when so_error is set
+ * (e.g. connection refused), which would trip the assertion instead
+ * of returning the error; the assertion now only applies when no
+ * error was recorded. */
+int
+ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry,
+ ksock_route_t *route, int local_port)
+{
+ struct sockaddr_in locaddr;
+ struct sockaddr_in srvaddr;
+ struct timeval tv;
+ int fd;
+ struct socket *so;
+ struct sockopt sopt;
+ int option;
+ int rc;
+ int s;
+ CFS_DECL_FUNNEL_DATA;
+
+ ENTRY;
+ bzero (&locaddr, sizeof (locaddr));
+ locaddr.sin_len = sizeof(struct sockaddr_in);
+ locaddr.sin_family = AF_INET;
+ locaddr.sin_port = htons (local_port);
+ locaddr.sin_addr.s_addr =
+ (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
+ : INADDR_ANY;
+ bzero(&srvaddr, sizeof(srvaddr));
+ srvaddr.sin_len = sizeof(struct sockaddr_in);
+ srvaddr.sin_family = AF_INET;
+ srvaddr.sin_port = htons (route->ksnr_port);
+ srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+ *may_retry = 0;
+
+ CFS_NET_IN;
+ rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+ CFS_NET_EX;
+ *sockp = so;
+ if (rc != 0) {
+ CERROR ("Can't create autoconnect socket: %d\n", rc);
+ return (-rc);
+ }
+
+ /*
+ * XXX
+ * Liang: what do we need here?
+ */
+ /* attach a struct file to the socket, then give the fd number
+ * back -- only the file reference is needed */
+ fd = sock_map_fd (so);
+ if (fd < 0) {
+ sock_release (so);
+ CERROR ("sock_map_fd error %d\n", fd);
+ return (fd);
+ }
+ sock_fdrelse(fd);
+
+ /* Set the socket timeouts, so our connection attempt completes in
+ * finite time */
+ tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
+ tv.tv_usec = 0;
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_SNDTIMEO;
+ sopt.sopt_val = &tv;
+ sopt.sopt_valsize = sizeof(tv);
+
+ CFS_NET_IN;
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CFS_NET_EX;
+ CERROR ("Can't set send timeout %d: %d\n",
+ ksocknal_tunables.ksnd_io_timeout, rc);
+ goto out;
+ }
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_RCVTIMEO;
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CFS_NET_EX;
+ CERROR ("Can't set receive timeout %d: %d\n",
+ ksocknal_tunables.ksnd_io_timeout, rc);
+ goto out;
+ }
+ option = 1;
+ sopt.sopt_level = SOL_SOCKET;
+ sopt.sopt_name = SO_REUSEADDR;
+ sopt.sopt_val = &option;
+ sopt.sopt_valsize = sizeof(option);
+ rc = sosetopt(so, &sopt);
+ if (rc != 0) {
+ CFS_NET_EX;
+ CERROR ("Can't set sock reuse address: %d\n", rc);
+ goto out;
+ }
+ rc = sobind(so, (struct sockaddr *)&locaddr);
+ if (rc == EADDRINUSE) {
+ CFS_NET_EX;
+ CDEBUG(D_NET, "Port %d already in use\n", local_port);
+ *may_retry = 1;
+ goto out;
+ }
+ if (rc != 0) {
+ CFS_NET_EX;
+ CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n",
+ HIPQUAD(route->ksnr_myipaddr), rc);
+ goto out;
+ }
+ rc = soconnect(so, (struct sockaddr *)&srvaddr);
+ *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE);
+ if (rc != 0) {
+ CFS_NET_EX;
+ if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
+ CERROR ("Can't connect to nid "LPX64
+ " local IP: %u.%u.%u.%u,"
+ " remote IP: %u.%u.%u.%u/%d: %d\n",
+ route->ksnr_peer->ksnp_nid,
+ HIPQUAD(route->ksnr_myipaddr),
+ HIPQUAD(route->ksnr_ipaddr),
+ route->ksnr_port, rc);
+ goto out;
+ }
+
+ /* wait (at splnet) until the connect completes or fails */
+ s = splnet();
+ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+ CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
+ (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
+ }
+ /* only insist on SS_ISCONNECTED when no error was recorded; a
+ * refused connection exits the loop with so_error set instead */
+ LASSERT(so->so_error != 0 || (so->so_state & SS_ISCONNECTED));
+ splx(s);
+ CFS_NET_EX;
+
+ rc = so->so_error;
+ if (rc != 0) {
+ CERROR ("Error %d waiting for connection to nid "LPX64
+ " local IP: %u.%u.%u.%u,"
+ " remote IP: %u.%u.%u.%u/%d: %d\n", rc,
+ route->ksnr_peer->ksnp_nid,
+ HIPQUAD(route->ksnr_myipaddr),
+ HIPQUAD(route->ksnr_ipaddr),
+ route->ksnr_port, rc);
+ goto out;
+ }
+ return (-rc);
+
+ out:
+ rele_file(KSN_SOCK2FILE(so));
+
+ return (-rc);
+}
+
+/* Flush any data Nagle is holding on the connection by (re)setting
+ * TCP_NODELAY.  Silently does nothing if the connection is already
+ * being shut down; the sosetopt() result is deliberately ignored
+ * (best effort). */
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{
+ struct socket *sock;
+ struct sockopt sopt;
+ int val = 1;
+ int rc;
+ CFS_DECL_NET_DATA;
+
+ rc = ksocknal_getconnsock (conn);
+ if (rc != 0) /* being shut down */
+ return;
+ sock = conn->ksnc_sock;
+ bzero(&sopt, sizeof sopt);
+ sopt.sopt_dir = SOPT_SET;
+ sopt.sopt_level = IPPROTO_TCP;
+ sopt.sopt_name = TCP_NODELAY;
+ sopt.sopt_val = &val;
+ sopt.sopt_valsize = sizeof val;
+
+ CFS_NET_IN;
+ sosetopt(sock, &sopt);
+ CFS_NET_EX;
+
+ ksocknal_putconnsock (conn);
+ return;
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+
+/* Socket upcall installed by ksocknal_lib_set_callback(): invoked by
+ * the kernel when the socket becomes readable/writable.  Looks up the
+ * connection stashed in so->reserved3 and notifies the scheduler via
+ * the read/write callbacks.  'arg' == NULL (see act_callback) forces
+ * both directions to be checked regardless of the SB_UPCALL flags. */
+static void
+ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
+{
+ ksock_conn_t *conn;
+ CFS_DECL_NET_DATA;
+ ENTRY;
+
+ read_lock (&ksocknal_data.ksnd_global_lock);
+ conn = so->reserved3;
+
+ if (conn == NULL){
+ /* More processing is needed? */
+ goto out;
+ }
+ if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg ) {
+ extern int soreadable(struct socket *so);
+ CFS_NET_IN;
+ if (conn->ksnc_rx_nob_wanted && soreadable(so)){
+ /* To verify whether the upcall is for receive */
+ CFS_NET_EX;
+ ksocknal_read_callback (conn);
+ }else
+ CFS_NET_EX;
+ }
+ /* go foward? */
+ if ((so->so_snd.sb_flags & SB_UPCALL) || !arg){
+ extern int sowriteable(struct socket *so);
+ CFS_NET_IN;
+ if (sowriteable(so)){
+ /* socket is writable */
+ CFS_NET_EX;
+ ksocknal_write_callback(conn);
+ } else
+ CFS_NET_EX;
+ }
+out:
+ read_unlock (&ksocknal_data.ksnd_global_lock);
+
+ EXIT;
+}
+
+/* Nothing to save on OSX: set_callback()/reset_callback() install and
+ * remove the upcall directly, so no original callback is preserved. */
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ /* No callback need to save in osx */
+ return;
+}
+
+/* Install ksocknal_upcall on the socket and arm SB_UPCALL on both
+ * buffers so the kernel notifies us of readiness.  The connection is
+ * stashed in so->reserved3 for the upcall to find. */
+void
+ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ CFS_DECL_NET_DATA;
+
+ CFS_NET_IN;
+ sock->so_upcallarg = (void *)sock; /* anything not NULL */
+ sock->so_upcall = ksocknal_upcall;
+ /* sends never time out; receives poll every 2 seconds */
+ sock->so_snd.sb_timeo = 0;
+ sock->so_rcv.sb_timeo = 2 * HZ;
+ sock->so_rcv.sb_flags |= SB_UPCALL;
+ sock->so_snd.sb_flags |= SB_UPCALL;
+ sock->reserved3 = conn;
+ CFS_NET_EX;
+ return;
+}
+
+/* Manually fire the upcall for both directions (arg == NULL bypasses
+ * the SB_UPCALL flag checks inside ksocknal_upcall). */
+void
+ksocknal_lib_act_callback(struct socket *sock)
+{
+ /* upcall will take the network funnel */
+ ksocknal_upcall (sock, 0, 0);
+}
+
+/* Undo ksocknal_lib_set_callback(): remove the upcall and clear the
+ * SB_UPCALL flags so the kernel stops notifying us. */
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+ CFS_DECL_NET_DATA;
+
+ CFS_NET_IN;
+ sock->so_upcall = NULL;
+ sock->so_upcallarg = NULL;
+ sock->so_rcv.sb_flags &= ~SB_UPCALL;
+ sock->so_snd.sb_flags &= ~SB_UPCALL;
+ CFS_NET_EX;
+}
+
+