X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_lib-darwin.c;fp=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_lib-darwin.c;h=ada5b640a1ce8ef559bb681bb64c5a2fb3fedb96;hb=00f255b8c00dff66481a6ab22391869217b5d8af;hp=0000000000000000000000000000000000000000;hpb=64cd6738edccfefb928825112da62b2a44db284e;p=fs%2Flustre-release.git

diff --git a/lnet/klnds/socklnd/socklnd_lib-darwin.c b/lnet/klnds/socklnd/socklnd_lib-darwin.c
new file mode 100644
index 0000000..ada5b64
--- /dev/null
+++ b/lnet/klnds/socklnd/socklnd_lib-darwin.c
@@ -0,0 +1,1011 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Phil Schwan
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Darwin porting library
+ * Make things easy to port
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include "socknal.h"
+
+#if 0
+#undef SOCKNAL_SINGLE_FRAG_TX
+#define SOCKNAL_SINGLE_FRAG_TX 1
+#undef SOCKNAL_SINGLE_FRAG_RX
+#define SOCKNAL_SINGLE_FRAG_RX 1
+#endif
+
+SYSCTL_DECL(_portals);
+
+SYSCTL_NODE (_portals, OID_AUTO, ksocknal, CTLFLAG_RW,
+             0, "ksocknal_sysctl");
+
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, timeout,
+           CTLTYPE_INT | CTLFLAG_RW, &ksocknal_tunables.ksnd_io_timeout,
+           0, "timeout");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, eager_ack,
+           CTLTYPE_INT | CTLFLAG_RW, &ksocknal_tunables.ksnd_eager_ack,
+           0, "eager_ack");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, typed,
+           CTLTYPE_INT | CTLFLAG_RW, &ksocknal_tunables.ksnd_typed_conns,
+           0, "typed");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, min_bulk,
+           CTLTYPE_INT | CTLFLAG_RW, &ksocknal_tunables.ksnd_min_bulk,
+           0, "min_bulk");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, buffer_size,
+           CTLTYPE_INT | CTLFLAG_RW, &ksocknal_tunables.ksnd_buffer_size,
+           0, "buffer_size");
+SYSCTL_INT(_portals_ksocknal, OID_AUTO, nagle,
+           CTLTYPE_INT | CTLFLAG_RW, &ksocknal_tunables.ksnd_nagle,
+           0, "nagle");
+
+cfs_sysctl_table_t ksocknal_top_ctl_table [] = {
+        &sysctl__portals_ksocknal,
+        &sysctl__portals_ksocknal_timeout,
+        &sysctl__portals_ksocknal_eager_ack,
+        &sysctl__portals_ksocknal_typed,
+        &sysctl__portals_ksocknal_min_bulk,
+        &sysctl__portals_ksocknal_buffer_size,
+        &sysctl__portals_ksocknal_nagle,
+        NULL
+};
+
+static unsigned long ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
+
+struct socket *
+sockfd_lookup(int fd, void *foo)
+{
+        struct socket *so;
+        struct file *fp;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        getsock(current_proc()->p_fd, fd, &fp);
+        CFS_NET_EX;
+        so = (struct socket *)fp->f_data;
+        so->reserved4 = fp;
+        CFS_CONE_IN;
+        fref(fp);
+        CFS_CONE_EX;
+        return so;
+}
+
+extern struct fileops socketops;
+
+static int
+sock_map_fd (struct socket *so)
+{
+        struct file *fp;
+        int fd;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_CONE_IN;
+        falloc(current_proc(), &fp, &fd);
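+        /* falloc() has allocated a file table entry and an fd slot (left
+         * UF_RESERVED) in the current process; the assignments below dress
+         * the entry up as a socket before the fd is published. */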
+        fp->f_flag = FREAD|FWRITE;
+        fp->f_type = DTYPE_SOCKET;
+        fp->f_ops = &socketops;
+        fp->f_data = (caddr_t)so;
+        so->reserved4 = fp;
+        *fdflags(current_proc(), fd) &= ~UF_RESERVED;
+        CFS_CONE_EX;
+
+        return fd;
+}
+
+static void
+sock_release(struct socket *so)
+{
+        struct file *fp;
+        CFS_DECL_FUNNEL_DATA;
+
+        fp = (struct file *)so->reserved4;
+        so->reserved4 = NULL;
+        fp->f_data = NULL;
+        CFS_CONE_IN;
+        frele(fp);
+        CFS_CONE_EX;
+        CFS_NET_IN;
+        soshutdown(so, 0);
+        CFS_NET_EX;
+}
+
+static void
+sock_fdrelse(int fd)
+{
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_CONE_IN;
+        fdrelse(current_proc(), fd);
+        CFS_CONE_EX;
+}
+
+void
+ksocknal_lib_bind_irq (unsigned int irq)
+{
+        return;
+}
+
+unsigned int
+ksocknal_lib_sock_irq (struct socket *sock)
+{
+        return 0;
+}
+
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+        struct sockaddr_in *sin;
+        struct sockaddr *sa;
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_peeraddr(conn->ksnc_sock, &sa);
+        LASSERT (!conn->ksnc_closing);
+        if (rc != 0) {
+                CFS_NET_EX;
+                if (sa) FREE(sa, M_SONAME);
+                CERROR ("Error %d getting sock peer IP\n", rc);
+                return rc;
+        }
+        sin = (struct sockaddr_in *)sa;
+        conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr);
+        conn->ksnc_port = ntohs (sin->sin_port);
+        if (sa) FREE(sa, M_SONAME);
+        rc = conn->ksnc_sock->so_proto->pr_usrreqs->pru_sockaddr(conn->ksnc_sock, &sa);
+        CFS_NET_EX;
+        if (rc != 0) {
+                if (sa) FREE(sa, M_SONAME);
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return rc;
+        }
+        /* Point sin at the freshly returned local sockaddr (the peer one
+         * above has already been freed) and release it once copied. */
+        sin = (struct sockaddr_in *)sa;
+        conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr);
+        if (sa) FREE(sa, M_SONAME);
+
+        return 0;
+}
+
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        int niov = tx->tx_niov;
+#endif
+        struct socket *sock = conn->ksnc_sock;
+        int nob;
+        int rc;
+        int i;
+        struct uio suio = {
+                .uio_iov = scratchiov,
+                .uio_iovcnt = niov,
+                .uio_offset = 0,
+                .uio_resid = 0,   /* set to the real byte count below */
+                .uio_segflg = UIO_SYSSPACE,
+                .uio_rw = UIO_WRITE,
+                .uio_procp = NULL
+        };
+        int flags = MSG_DONTWAIT;
+        CFS_DECL_NET_DATA;
+
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = tx->tx_iov[i];
+                nob += scratchiov[i].iov_len;
+        }
+        suio.uio_resid = nob;
+
+        CFS_NET_IN;
+        rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, flags);
+        CFS_NET_EX;
+
+        /* NB sosend()'s return value does not indicate how many bytes were
+         * sent and how many remain; the sent byte count has to be
+         * recovered from suio.uio_resid.
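+         * For example, if 4096 bytes of an 8192-byte request were
+         * accepted, nob - suio.uio_resid below evaluates to 4096.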
+         */
+        if (rc != 0) {
+                if (suio.uio_resid != nob &&
+                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                        /* We have sent something */
+                        rc = nob - suio.uio_resid;
+                else if (rc == EWOULDBLOCK)
+                        /* EAGAIN and EWOULDBLOCK have the same value in OSX */
+                        rc = -EAGAIN;
+                else
+                        rc = -rc;
+        } else /* rc == 0 */
+                rc = nob - suio.uio_resid;
+
+        return rc;
+}
+
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        int niov = tx->tx_nkiov;
+#endif
+        struct socket *sock = conn->ksnc_sock;
+        ptl_kiov_t *kiov = tx->tx_kiov;
+        int nob;
+        int rc;
+        int i;
+        struct uio suio = {
+                .uio_iov = scratchiov,
+                .uio_iovcnt = niov,
+                .uio_offset = 0,
+                .uio_resid = 0,   /* set to the real byte count below */
+                .uio_segflg = UIO_SYSSPACE,
+                .uio_rw = UIO_WRITE,
+                .uio_procp = NULL
+        };
+        int flags = MSG_DONTWAIT;
+        CFS_DECL_NET_DATA;
+
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) +
+                                         kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+        suio.uio_resid = nob;
+
+        CFS_NET_IN;
+        rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, flags);
+        CFS_NET_EX;
+
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page);
+
+        if (rc != 0) {
+                if (suio.uio_resid != nob &&
+                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                        /* We have sent something */
+                        rc = nob - suio.uio_resid;
+                else if (rc == EWOULDBLOCK)
+                        /* EAGAIN and EWOULDBLOCK have the same value in OSX */
+                        rc = -EAGAIN;
+                else
+                        rc = -rc;
+        } else /* rc == 0 */
+                rc = nob - suio.uio_resid;
+
+        return rc;
+}
+
+/*
+ * liang: Hack of inpcb and tcpcb.
+ * Get the tcpcb of a socket and call tcp_output()
+ * to send a quick ack.
+ */
+struct ks_tseg_qent {
+        int foo;
+};
+
+struct ks_tcptemp {
+        int foo;
+};
+
+LIST_HEAD(ks_tsegqe_head, ks_tseg_qent);
+
+struct ks_tcpcb {
+        struct ks_tsegqe_head t_segq;
+        int t_dupacks;
+        struct ks_tcptemp *unused;
+        int t_timer[4];
+        struct inpcb *t_inpcb;
+        int t_state;
+        u_int t_flags;
+        /*
+         * There are more fields, but we don't need them
+         * ......
+         */
+};
+
+#define TF_ACKNOW 0x00001
+#define TF_DELACK 0x00002
+
+struct ks_inpcb {
+        LIST_ENTRY(ks_inpcb) inp_hash;
+        struct in_addr reserved1;
+        struct in_addr reserved2;
+        u_short inp_fport;
+        u_short inp_lport;
+        LIST_ENTRY(inpcb) inp_list;
+        caddr_t inp_ppcb;
+        /*
+         * There are more fields, but we don't need them
+         * ......
+         */
+};
+
+#define ks_sotoinpcb(so)   ((struct ks_inpcb *)(so)->so_pcb)
+#define ks_intotcpcb(ip)   ((struct ks_tcpcb *)(ip)->inp_ppcb)
+#define ks_sototcpcb(so)   (ks_intotcpcb(ks_sotoinpcb(so)))
+
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+        struct socket *sock = conn->ksnc_sock;
+        struct ks_inpcb *inp = ks_sotoinpcb(sock);
+        struct ks_tcpcb *tp = ks_intotcpcb(inp);
+        int s;
+        CFS_DECL_NET_DATA;
+
+        extern int tcp_output(register struct ks_tcpcb *tp);
+
+        CFS_NET_IN;
+        s = splnet();
+
+        if (tp && (tp->t_flags & TF_DELACK)) {
+                tp->t_flags &= ~TF_DELACK;
+                tp->t_flags |= TF_ACKNOW;
+                (void) tcp_output(tp);
+        }
+        splx(s);
+
+        /*
+         * There is no TCP_QUICKACK in BSD, so calling tcp_fasttimo() is
+         * the only generic way to force an immediate ACK. It's not the
+         * best solution because tcp_fasttimo() sends an ACK for every
+         * delayed-ack TCP socket. Anyway, it's working now.
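+         * The TF_DELACK/TF_ACKNOW flip above restricts the immediate ACK
+         * to this connection's tcpcb; the global fallback is kept here
+         * for reference: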
+         * extern void tcp_fasttimo();
+         * tcp_fasttimo();
+         */
+        CFS_NET_EX;
+
+        return;
+}
+
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        int niov = conn->ksnc_rx_niov;
+#endif
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int nob;
+        int rc;
+        int i;
+        struct uio ruio = {
+                .uio_iov = scratchiov,
+                .uio_iovcnt = niov,
+                .uio_offset = 0,
+                .uio_resid = 0,   /* set to the real byte count below */
+                .uio_segflg = UIO_SYSSPACE,
+                .uio_rw = UIO_READ,
+                .uio_procp = NULL
+        };
+        int flags = MSG_DONTWAIT;
+        CFS_DECL_NET_DATA;
+
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i] = iov[i];
+                nob += scratchiov[i].iov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+        ruio.uio_resid = nob;
+
+        CFS_NET_IN;
+        rc = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, &flags);
+        CFS_NET_EX;
+        if (rc) {
+                if (ruio.uio_resid != nob &&
+                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK || rc == EAGAIN))
+                        /* data partially received */
+                        rc = nob - ruio.uio_resid;
+                else if (rc == EWOULDBLOCK)
+                        /* EAGAIN and EWOULDBLOCK have the same value in OSX */
+                        rc = -EAGAIN;
+                else
+                        rc = -rc;
+        } else
+                rc = nob - ruio.uio_resid;
+
+        return (rc);
+}
+
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+        struct iovec scratch;
+        struct iovec *scratchiov = &scratch;
+        int niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        int niov = conn->ksnc_rx_nkiov;
+#endif
+        ptl_kiov_t *kiov = conn->ksnc_rx_kiov;
+        int nob;
+        int rc;
+        int i;
+        struct uio ruio = {
+                .uio_iov = scratchiov,
+                .uio_iovcnt = niov,
+                .uio_offset = 0,
+                .uio_resid = 0,   /* set to the real byte count below */
+                .uio_segflg = UIO_SYSSPACE,
+                .uio_rw = UIO_READ,
+                .uio_procp = NULL
+        };
+        int flags = MSG_DONTWAIT;
+        CFS_DECL_NET_DATA;
+
+        for (nob = i = 0; i < niov; i++) {
+                scratchiov[i].iov_base = cfs_kmap(kiov[i].kiov_page) + kiov[i].kiov_offset;
+                nob += scratchiov[i].iov_len = kiov[i].kiov_len;
+        }
+        LASSERT (nob <= conn->ksnc_rx_nob_wanted);
+
+        ruio.uio_resid = nob;
+
+        CFS_NET_IN;
+        rc = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, NULL, &flags);
+        CFS_NET_EX;
+
+        for (i = 0; i < niov; i++)
+                cfs_kunmap(kiov[i].kiov_page);
+
+        if (rc) {
+                if (ruio.uio_resid != nob &&
+                    (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                        /* data partially received */
+                        rc = nob - ruio.uio_resid;
+                else if (rc == EWOULDBLOCK)
+                        /* receive blocked; EWOULDBLOCK == EAGAIN */
+                        rc = -EAGAIN;
+                else
+                        rc = -rc;
+        } else
+                rc = nob - ruio.uio_resid;
+
+        return (rc);
+}
+
+int
+ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
+{
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec iov = {
+                        .iov_base = buffer,
+                        .iov_len = nob
+                };
+                struct uio suio = {
+                        .uio_iov = &iov,
+                        .uio_iovcnt = 1,
+                        .uio_offset = 0,
+                        .uio_resid = nob,
+                        .uio_segflg = UIO_SYSSPACE,
+                        .uio_rw = UIO_WRITE,
+                        .uio_procp = NULL
+                };
+
+                CFS_NET_IN;
+                rc = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0, 0);
+                CFS_NET_EX;
+
+                if (rc != 0) {
+                        if (suio.uio_resid != nob &&
+                            (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return -rc;
+                        rc = nob - suio.uio_resid;
+                        buffer = ((char *)buffer) + rc;
+                        nob = suio.uio_resid;
+                        continue;
+                }
+                break;
+        }
+
+        return (0);
+}
+
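+/* Blocking counterpart of the nonblocking receive paths above: soreceive()
+ * is retried until exactly nob bytes have arrived, treating a partial
+ * transfer with ERESTART/EINTR/EWOULDBLOCK as progress rather than failure;
+ * any other error aborts the read. */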
+int
+ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
+{
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec iov = {
+                        .iov_base = buffer,
+                        .iov_len = nob
+                };
+                struct uio ruio = {
+                        .uio_iov = &iov,
+                        .uio_iovcnt = 1,
+                        .uio_offset = 0,
+                        .uio_resid = nob,
+                        .uio_segflg = UIO_SYSSPACE,
+                        .uio_rw = UIO_READ,
+                        .uio_procp = NULL
+                };
+
+                CFS_NET_IN;
+                rc = soreceive(sock, (struct sockaddr **)0, &ruio, (struct mbuf **)0, (struct mbuf **)0, (int *)0);
+                CFS_NET_EX;
+
+                if (rc != 0) {
+                        if (ruio.uio_resid != nob &&
+                            (rc == ERESTART || rc == EINTR || rc == EWOULDBLOCK))
+                                rc = 0;
+                        if (rc != 0)
+                                return -rc;
+                        rc = nob - ruio.uio_resid;
+                        buffer = ((char *)buffer) + rc;
+                        nob = ruio.uio_resid;
+                        continue;
+                }
+                break;
+        }
+
+        return (0);
+}
+
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        struct sockopt sopt;
+        struct socket *sock = conn->ksnc_sock;
+        int len;
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        rc = ksocknal_getconnsock (conn);
+        if (rc != 0) {
+                LASSERT (conn->ksnc_closing);
+                *txmem = *rxmem = *nagle = 0;
+                rc = -ESHUTDOWN;
+                goto out;
+        }
+        len = sizeof(*txmem);
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_GET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_SNDBUF;
+        sopt.sopt_val = txmem;
+        sopt.sopt_valsize = len;
+
+        CFS_NET_IN;
+        rc = sogetopt(sock, &sopt);
+        if (rc == 0) {
+                len = sizeof(*rxmem);
+                sopt.sopt_name = SO_RCVBUF;
+                sopt.sopt_val = rxmem;
+                rc = sogetopt(sock, &sopt);
+        }
+        if (rc == 0) {
+                len = sizeof(*nagle);
+                sopt.sopt_level = IPPROTO_TCP;
+                sopt.sopt_name = TCP_NODELAY;
+                sopt.sopt_val = nagle;
+                rc = sogetopt(sock, &sopt);
+        }
+        CFS_NET_EX;
+
+        ksocknal_putconnsock (conn);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+out:
+        return (-rc);
+}
+
+int
+ksocknal_lib_setup_sock (struct socket *so)
+{
+        struct sockopt sopt;
+        int rc;
+        int option;
+        int keep_idle;
+        int keep_intvl;
+        int keep_count;
+        int do_keepalive;
+        struct linger linger;
+        CFS_DECL_NET_DATA;
+
+        /* Ensure this socket aborts active sends immediately when we close
+         * it. */
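+        /* Note: l_onoff = 0 below disables lingering altogether, so close()
+         * returns immediately and the stack completes (or drops) any queued
+         * data in the background instead of blocking the caller. */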
+
+        bzero(&sopt, sizeof sopt);
+
+        linger.l_onoff = 0;
+        linger.l_linger = 0;
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_LINGER;
+        sopt.sopt_val = &linger;
+        sopt.sopt_valsize = sizeof(linger);
+
+        CFS_NET_IN;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CERROR ("Can't set SO_LINGER: %d\n", rc);
+                goto out;
+        }
+
+        if (!ksocknal_tunables.ksnd_nagle) {
+                option = 1;
+                bzero(&sopt, sizeof sopt);
+                sopt.sopt_dir = SOPT_SET;
+                sopt.sopt_level = IPPROTO_TCP;
+                sopt.sopt_name = TCP_NODELAY;
+                sopt.sopt_val = &option;
+                sopt.sopt_valsize = sizeof(option);
+                rc = sosetopt(so, &sopt);
+                if (rc != 0) {
+                        CERROR ("Can't disable nagle: %d\n", rc);
+                        goto out;
+                }
+        }
+        if (ksocknal_tunables.ksnd_buffer_size > 0) {
+                option = ksocknal_tunables.ksnd_buffer_size;
+                if (option > ksocknal_mbuf_size)
+                        option = ksocknal_mbuf_size;
+
+                sopt.sopt_dir = SOPT_SET;
+                sopt.sopt_level = SOL_SOCKET;
+                sopt.sopt_name = SO_SNDBUF;
+                sopt.sopt_val = &option;
+                sopt.sopt_valsize = sizeof(option);
+                rc = sosetopt(so, &sopt);
+                if (rc != 0) {
+                        CERROR ("Can't set send buffer %d: %d\n",
+                                option, rc);
+                        goto out;
+                }
+
+                sopt.sopt_name = SO_RCVBUF;
+                rc = sosetopt(so, &sopt);
+                if (rc != 0) {
+                        CERROR ("Can't set receive buffer %d: %d\n",
+                                option, rc);
+                        goto out;
+                }
+        }
+        /* snapshot tunables */
+        keep_idle = ksocknal_tunables.ksnd_keepalive_idle;
+        keep_count = ksocknal_tunables.ksnd_keepalive_count;
+        keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+
+        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0);
+        option = (do_keepalive ? 1 : 0);
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_KEEPALIVE;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc);
+                goto out;
+        }
+
+        if (!do_keepalive) {
+                /* nothing more to set; just return */
+                rc = 0;
+                goto out;
+        }
+
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = IPPROTO_TCP;
+        sopt.sopt_name = TCP_KEEPALIVE;
+        sopt.sopt_val = &keep_idle;
+        sopt.sopt_valsize = sizeof(keep_idle);
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CERROR ("Can't set TCP_KEEPALIVE: %d\n", rc);
+                goto out;
+        }
+out:
+        CFS_NET_EX;
+        return (-rc);
+}
+
+int
+ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry,
+                           ksock_route_t *route, int local_port)
+{
+        struct sockaddr_in locaddr;
+        struct sockaddr_in srvaddr;
+        struct timeval tv;
+        int fd;
+        struct socket *so;
+        struct sockopt sopt;
+        int option;
+        int rc;
+        int s;
+        CFS_DECL_FUNNEL_DATA;
+
+        ENTRY;
+        bzero (&locaddr, sizeof (locaddr));
+        locaddr.sin_len = sizeof(struct sockaddr_in);
+        locaddr.sin_family = AF_INET;
+        locaddr.sin_port = htons (local_port);
+        locaddr.sin_addr.s_addr = (route->ksnr_myipaddr != 0) ?
+                                  htonl(route->ksnr_myipaddr) : INADDR_ANY;
+        bzero(&srvaddr, sizeof(srvaddr));
+        srvaddr.sin_len = sizeof(struct sockaddr_in);
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons (route->ksnr_port);
+        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+        *may_retry = 0;
+
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        *sockp = so;
+        if (rc != 0) {
+                CERROR ("Can't create autoconnect socket: %d\n", rc);
+                return (-rc);
+        }
+
+        /*
+         * XXX
+         * Liang: what do we need here?
+         */
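+        /* Map the new socket to an fd so it is backed by a file table
+         * entry, then release the fd slot itself: the struct file
+         * reference keeps the socket pinned while the descriptor number
+         * returns to the process. */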
+        fd = sock_map_fd (so);
+        if (fd < 0) {
+                sock_release (so);
+                CERROR ("sock_map_fd error %d\n", fd);
+                return (fd);
+        }
+        sock_fdrelse(fd);
+
+        /* Set the socket timeouts, so our connection attempt completes in
+         * finite time */
+        tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
+        tv.tv_usec = 0;
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_SNDTIMEO;
+        sopt.sopt_val = &tv;
+        sopt.sopt_valsize = sizeof(tv);
+
+        CFS_NET_IN;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set send timeout %d: %d\n",
+                        ksocknal_tunables.ksnd_io_timeout, rc);
+                goto out;
+        }
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_RCVTIMEO;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set receive timeout %d: %d\n",
+                        ksocknal_tunables.ksnd_io_timeout, rc);
+                goto out;
+        }
+        option = 1;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_REUSEADDR;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set sock reuse address: %d\n", rc);
+                goto out;
+        }
+        rc = sobind(so, (struct sockaddr *)&locaddr);
+        if (rc == EADDRINUSE) {
+                CFS_NET_EX;
+                CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                *may_retry = 1;
+                goto out;
+        }
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't bind to local IP address %u.%u.%u.%u: %d\n",
+                        HIPQUAD(route->ksnr_myipaddr), rc);
+                goto out;
+        }
+        rc = soconnect(so, (struct sockaddr *)&srvaddr);
+        *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE);
+        if (rc != 0) {
+                CFS_NET_EX;
+                if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
+                        CERROR ("Can't connect to nid "LPX64
+                                " local IP: %u.%u.%u.%u,"
+                                " remote IP: %u.%u.%u.%u/%d: %d\n",
+                                route->ksnr_peer->ksnp_nid,
+                                HIPQUAD(route->ksnr_myipaddr),
+                                HIPQUAD(route->ksnr_ipaddr),
+                                route->ksnr_port, rc);
+                goto out;
+        }
+
+        s = splnet();
+        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+                CDEBUG(D_NET, "ksocknal sleeps, waiting for auto-connect.\n");
+                (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
+        }
+        LASSERT((so->so_state & SS_ISCONNECTED));
+        splx(s);
+        CFS_NET_EX;
+
+        rc = so->so_error;
+        if (rc != 0) {
+                CERROR ("Error %d waiting for connection to nid "LPX64
+                        " local IP: %u.%u.%u.%u,"
+                        " remote IP: %u.%u.%u.%u/%d: %d\n", rc,
+                        route->ksnr_peer->ksnp_nid,
+                        HIPQUAD(route->ksnr_myipaddr),
+                        HIPQUAD(route->ksnr_ipaddr),
+                        route->ksnr_port, rc);
+                goto out;
+        }
+        return (-rc);
+
+ out:
+        rele_file(KSN_SOCK2FILE(so));
+
+        return (-rc);
+}
+
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{
+        struct socket *sock;
+        struct sockopt sopt;
+        int val = 1;
+        int rc;
+        CFS_DECL_NET_DATA;
+
+        rc = ksocknal_getconnsock (conn);
+        if (rc != 0)            /* being shut down */
+                return;
+        sock = conn->ksnc_sock;
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = IPPROTO_TCP;
+        sopt.sopt_name = TCP_NODELAY;
+        sopt.sopt_val = &val;
+        sopt.sopt_valsize = sizeof val;
+
+        CFS_NET_IN;
+        sosetopt(sock, &sopt);
+        CFS_NET_EX;
+
+        ksocknal_putconnsock (conn);
+        return;
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+
+static void
+ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
+{
+        ksock_conn_t *conn;
+        CFS_DECL_NET_DATA;
+        ENTRY;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        conn = so->reserved3;
+
+        if (conn == NULL) {
+                /* More processing is needed? */
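+                /* A NULL conn means no ksocknal connection is attached to
+                 * this socket (not set up yet, or already detached), so
+                 * the upcall has nothing to deliver. */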
+                goto out;
+        }
+        if ((so->so_rcv.sb_flags & SB_UPCALL) || !arg) {
+                extern int soreadable(struct socket *so);
+                CFS_NET_IN;
+                if (conn->ksnc_rx_nob_wanted && soreadable(so)) {
+                        /* To verify whether the upcall is for receive */
+                        CFS_NET_EX;
+                        ksocknal_read_callback (conn);
+                } else
+                        CFS_NET_EX;
+        }
+        /* go forward? */
+        if ((so->so_snd.sb_flags & SB_UPCALL) || !arg) {
+                extern int sowriteable(struct socket *so);
+                CFS_NET_IN;
+                if (sowriteable(so)) {
+                        /* socket is writable */
+                        CFS_NET_EX;
+                        ksocknal_write_callback(conn);
+                } else
+                        CFS_NET_EX;
+        }
+out:
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        EXIT;
+}
+
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+        /* No callback needs to be saved on OS X */
+        return;
+}
+
+void
+ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
+{
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        sock->so_upcallarg = (void *)sock;      /* anything not NULL */
+        sock->so_upcall = ksocknal_upcall;
+        sock->so_snd.sb_timeo = 0;
+        sock->so_rcv.sb_timeo = 2 * HZ;
+        sock->so_rcv.sb_flags |= SB_UPCALL;
+        sock->so_snd.sb_flags |= SB_UPCALL;
+        sock->reserved3 = conn;
+        CFS_NET_EX;
+        return;
+}
+
+void
+ksocknal_lib_act_callback(struct socket *sock)
+{
+        /* upcall will take the network funnel */
+        ksocknal_upcall (sock, 0, 0);
+}
+
+void
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        sock->so_upcall = NULL;
+        sock->so_upcallarg = NULL;
+        sock->so_rcv.sb_flags &= ~SB_UPCALL;
+        sock->so_snd.sb_flags &= ~SB_UPCALL;
+        CFS_NET_EX;
+}