Whamcloud - gitweb
* Landed portals:b_port_step as follows...
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd_lib-darwin.c
diff --git a/lnet/klnds/socklnd/socklnd_lib-darwin.c b/lnet/klnds/socklnd/socklnd_lib-darwin.c
new file mode 100644 (file)
index 0000000..ada5b64
--- /dev/null
@@ -0,0 +1,1011 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * Copyright (C) 2002 Cluster File Systems, Inc.
+ * Author: Phil Schwan <phil@clusterfs.com>
+ *
+ * This file is part of Lustre, http://www.lustre.org.
+ *
+ * Lustre is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Lustre is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with Lustre; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Darwin porting library
+ * Make things easy to port
+ */
+#include <mach/mach_types.h>
+#include <string.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sys/file.h>
+
+#include "socknal.h"
+
+#if 0
+#undef SOCKNAL_SINGLE_FRAG_TX
+#define SOCKNAL_SINGLE_FRAG_TX  1
+#undef SOCKNAL_SINGLE_FRAG_RX
+#define SOCKNAL_SINGLE_FRAG_RX  1
+#endif
+
+/*
+ * sysctl interface for the socknal tunables: a "ksocknal" node declared
+ * under the _portals parent, with one read/write integer entry for each
+ * of the ksocknal_tunables fields listed below.
+ */
+SYSCTL_DECL(_portals);
+
+SYSCTL_NODE (_portals,           OID_AUTO,       ksocknal,        CTLFLAG_RW, 
+             0,                 "ksocknal_sysctl");
+
+SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       timeout, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_io_timeout, 
+           0,                                   "timeout");
+SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       eager_ack, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_eager_ack, 
+           0,                                   "eager_ack");
+SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       typed, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_typed_conns, 
+           0,                                   "typed");
+SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       min_bulk, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_min_bulk, 
+           0,                                   "min_bulk");
+SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       buffer_size, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_buffer_size, 
+           0,                                   "buffer_size");
+SYSCTL_INT(_portals_ksocknal,    OID_AUTO,       nagle, 
+           CTLTYPE_INT | CTLFLAG_RW ,           &ksocknal_tunables.ksnd_nagle, 
+           0,                                   "nagle");
+
+/*
+ * NULL-terminated list of the sysctl OIDs declared above (the node first,
+ * then its leaves).
+ * NOTE(review): presumably consumed by the libcfs sysctl registration
+ * code at module start-up -- confirm against the caller.
+ */
+cfs_sysctl_table_t      ksocknal_top_ctl_table [] = {
+        &sysctl__portals_ksocknal,
+        &sysctl__portals_ksocknal_timeout,
+        &sysctl__portals_ksocknal_eager_ack,
+        &sysctl__portals_ksocknal_typed,
+        &sysctl__portals_ksocknal_min_bulk,
+        &sysctl__portals_ksocknal_buffer_size,
+        &sysctl__portals_ksocknal_nagle,
+        NULL
+};
+
+/*
+ * Upper bound applied to the buffer_size tunable before it is used for
+ * SO_SNDBUF/SO_RCVBUF in ksocknal_lib_setup_sock().
+ * NOTE(review): the formula scales SB_MAX by the payload fraction of an
+ * mbuf cluster (MCLBYTES out of MSIZE + MCLBYTES); presumably this mirrors
+ * the limit the kernel enforces when reserving socket buffer space --
+ * confirm against the target kernel.
+ */
+static unsigned long  ksocknal_mbuf_size = (u_quad_t)SB_MAX * MCLBYTES / (MSIZE + MCLBYTES);
+
+/*
+ * Resolve descriptor @fd of the current process to its socket.
+ *
+ * The owning struct file is remembered in so->reserved4 and an extra
+ * reference is taken on it with fref(); sock_release() later drops that
+ * reference through the same field.
+ *
+ * \param fd   descriptor to look up
+ * \param foo  not used by this implementation
+ *
+ * \retval the socket on success
+ * \retval NULL when getsock() fails or the file carries no socket
+ */
+struct socket *
+sockfd_lookup(int fd, void *foo)
+{
+        struct socket *so;
+        struct file   *fp = NULL;
+        int            rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_NET_IN;
+        rc = getsock(current_proc()->p_fd, fd, &fp);
+        CFS_NET_EX;
+
+        /* fp is only meaningful when getsock() succeeded */
+        if (rc != 0 || fp == NULL)
+                return NULL;
+
+        so = (struct socket *)fp->f_data;
+        if (so == NULL)
+                return NULL;
+
+        so->reserved4 = fp;
+        CFS_CONE_IN;
+        fref(fp);
+        CFS_CONE_EX;
+        return so;
+}
+
+extern struct fileops socketops;
+
+/*
+ * Allocate a file descriptor in the current process and attach @so to it.
+ *
+ * The new struct file is set up as a read/write socket file backed by
+ * socketops, recorded in so->reserved4, and the descriptor's UF_RESERVED
+ * flag is cleared so the slot becomes usable.
+ *
+ * \param so  socket to wrap
+ *
+ * \retval >= 0  the new descriptor
+ * \retval < 0   negated errno from falloc(); @so is left untouched
+ */
+static int
+sock_map_fd (struct socket *so)
+{
+        struct file *fp = NULL;
+        int          fd = -1;
+        int          rc;
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_CONE_IN;
+        rc = falloc(current_proc(), &fp, &fd);
+        if (rc != 0) {
+                /* no file was allocated: fp and fd must not be used */
+                CFS_CONE_EX;
+                return -rc;
+        }
+        fp->f_flag = FREAD|FWRITE;
+        fp->f_type = DTYPE_SOCKET;
+        fp->f_ops = &socketops;
+        fp->f_data = (caddr_t)so;
+        so->reserved4 = fp;
+        *fdflags(current_proc(), fd) &= ~UF_RESERVED;
+        CFS_CONE_EX;
+
+        return fd;
+}
+
+/*
+ * Detach @so from the struct file remembered in so->reserved4, drop the
+ * file reference with frele(), then call soshutdown(so, 0).
+ *
+ * A socket that never had a file attached (so->reserved4 == NULL, e.g.
+ * when sock_map_fd() failed) skips the file handling instead of
+ * dereferencing a NULL pointer.
+ *
+ * \param so  socket to release
+ */
+static void
+sock_release(struct socket *so)
+{
+        struct file *fp;
+        CFS_DECL_FUNNEL_DATA;
+
+        fp = (struct file *)so->reserved4;
+        so->reserved4 = NULL;
+        if (fp != NULL) {
+                fp->f_data = NULL;
+                CFS_CONE_IN;
+                frele(fp);
+                CFS_CONE_EX;
+        }
+        CFS_NET_IN;
+        soshutdown(so, 0);
+        CFS_NET_EX;
+}
+
+/*
+ * Release descriptor slot @fd of the current process by calling fdrelse()
+ * between CFS_CONE_IN and CFS_CONE_EX.
+ */
+static void
+sock_fdrelse(int fd)
+{ 
+        CFS_DECL_FUNNEL_DATA;
+
+        CFS_CONE_IN;
+        fdrelse(current_proc(), fd);
+        CFS_CONE_EX;
+}
+
+/*
+ * IRQ binding hook: this port does nothing with @irq.
+ */
+void
+ksocknal_lib_bind_irq (unsigned int irq)
+{
+        (void)irq;
+}
+
+/*
+ * Report the IRQ associated with @sock; this port always answers 0.
+ */
+unsigned int
+ksocknal_lib_sock_irq (struct socket *sock)
+{
+        (void)sock;
+
+        return 0;
+}
+
+/*
+ * Record the peer and local addresses of conn->ksnc_sock in @conn.
+ *
+ * Fills conn->ksnc_ipaddr and conn->ksnc_port from the peer address and
+ * conn->ksnc_myipaddr from the local address, all in host byte order.
+ * Each sockaddr handed back by the protocol is released with
+ * FREE(..., M_SONAME) once it has been consumed.
+ *
+ * \param conn  connection whose socket is queried
+ *
+ * \retval 0     success
+ * \retval < 0   negated errno from pru_peeraddr()/pru_sockaddr(), negated
+ *               as the other entry points in this file do
+ */
+int
+ksocknal_lib_get_conn_addrs (ksock_conn_t *conn)
+{
+        struct socket      *sock = conn->ksnc_sock;
+        struct sockaddr_in *sin;
+        struct sockaddr    *sa = NULL;
+        int                 rc;
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        rc = sock->so_proto->pr_usrreqs->pru_peeraddr(sock, &sa);
+        LASSERT (!conn->ksnc_closing);
+        if (rc != 0) {
+                CFS_NET_EX;
+                if (sa != NULL)
+                        FREE(sa, M_SONAME);
+                CERROR ("Error %d getting sock peer IP\n", rc);
+                return -rc;
+        }
+        LASSERT (sa != NULL);
+        sin = (struct sockaddr_in *)sa;
+        conn->ksnc_ipaddr = ntohl (sin->sin_addr.s_addr);
+        conn->ksnc_port = ntohs (sin->sin_port);
+        FREE(sa, M_SONAME);
+
+        /* start clean so a failing call cannot leave a stale pointer */
+        sa = NULL;
+        rc = sock->so_proto->pr_usrreqs->pru_sockaddr(sock, &sa);
+        CFS_NET_EX;
+        if (rc != 0) {
+                if (sa != NULL)
+                        FREE(sa, M_SONAME);
+                CERROR ("Error %d getting sock local IP\n", rc);
+                return -rc;
+        }
+        LASSERT (sa != NULL);
+        /* re-point at the local address: the peer sockaddr is gone */
+        sin = (struct sockaddr_in *)sa;
+        conn->ksnc_myipaddr = ntohl (sin->sin_addr.s_addr);
+        FREE(sa, M_SONAME);
+
+        return 0;
+}
+
+/*
+ * Non-blocking send of the iovec fragments of @tx on conn->ksnc_sock.
+ *
+ * The fragments are copied into a scratch iovec (a single one when
+ * SOCKNAL_SINGLE_FRAG_TX is set) and passed to sosend() with MSG_DONTWAIT.
+ * sosend() does not report a byte count, so the amount sent is derived
+ * from how far uio_resid dropped.
+ *
+ * \retval >= 0     number of bytes sent
+ * \retval -EAGAIN  the socket would block and nothing was sent
+ * \retval < 0      any other negated errno from sosend()
+ */
+int
+ksocknal_lib_send_iov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+#if SOCKNAL_SINGLE_FRAG_TX
+        struct iovec    scratch;
+        struct iovec   *scratchiov = &scratch;
+        int             niov = 1;
+#else
+        struct iovec   *scratchiov = conn->ksnc_tx_scratch_iov;
+        int             niov = tx->tx_niov;
+#endif
+        struct socket  *sock = conn->ksnc_sock;
+        int             total = 0;
+        int             sent;
+        int             err;
+        int             idx;
+        struct uio  suio = {
+                .uio_iov        = scratchiov,
+                .uio_iovcnt     = niov,
+                .uio_offset     = 0,
+                .uio_resid      = 0,            /* filled in below */
+                .uio_segflg     = UIO_SYSSPACE,
+                .uio_rw         = UIO_WRITE,
+                .uio_procp      = NULL
+        };
+        CFS_DECL_NET_DATA;
+
+        for (idx = 0; idx < niov; idx++) {
+                scratchiov[idx] = tx->tx_iov[idx];
+                total += scratchiov[idx].iov_len;
+        }
+        suio.uio_resid = total;
+
+        CFS_NET_IN;
+        err = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0,
+                     MSG_DONTWAIT);
+        CFS_NET_EX;
+
+        sent = total - suio.uio_resid;
+
+        if (err == 0)
+                return sent;
+
+        /* an interrupted or blocked send that moved data is a short send */
+        if (sent != 0 &&
+            (err == ERESTART || err == EINTR || err == EWOULDBLOCK))
+                return sent;
+
+        /* EAGAIN and EWOULDBLOCK have the same value on OS X */
+        if (err == EWOULDBLOCK)
+                return -EAGAIN;
+
+        return -err;
+}
+
+/*
+ * Non-blocking send of the page fragments of @tx on conn->ksnc_sock.
+ *
+ * Each page is mapped with cfs_kmap() for the duration of the sosend()
+ * call and unmapped afterwards.  Only one fragment is sent per call when
+ * SOCKNAL_SINGLE_FRAG_TX is set or SOCKNAL_RISK_KMAP_DEADLOCK is not.
+ * The amount sent is derived from how far uio_resid dropped.
+ *
+ * \retval >= 0     number of bytes sent
+ * \retval -EAGAIN  the socket would block and nothing was sent
+ * \retval < 0      any other negated errno from sosend()
+ */
+int
+ksocknal_lib_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx)
+{
+#if SOCKNAL_SINGLE_FRAG_TX || !SOCKNAL_RISK_KMAP_DEADLOCK
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        int           niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_tx_scratch_iov;
+        int           niov = tx->tx_nkiov;
+#endif
+        struct socket *sock = conn->ksnc_sock;
+        ptl_kiov_t    *kiov = tx->tx_kiov;
+        int            total = 0;
+        int            sent;
+        int            err;
+        int            idx;
+        struct  uio suio = {
+                .uio_iov        = scratchiov,
+                .uio_iovcnt     = niov,
+                .uio_offset     = 0,
+                .uio_resid      = 0,    /* filled in below */
+                .uio_segflg     = UIO_SYSSPACE,
+                .uio_rw         = UIO_WRITE,
+                .uio_procp      = NULL
+        };
+        CFS_DECL_NET_DATA;
+
+        for (idx = 0; idx < niov; idx++) {
+                scratchiov[idx].iov_base = cfs_kmap(kiov[idx].kiov_page) +
+                                           kiov[idx].kiov_offset;
+                scratchiov[idx].iov_len = kiov[idx].kiov_len;
+                total += scratchiov[idx].iov_len;
+        }
+        suio.uio_resid = total;
+
+        CFS_NET_IN;
+        err = sosend(sock, NULL, &suio, (struct mbuf *)0, (struct mbuf *)0,
+                     MSG_DONTWAIT);
+        CFS_NET_EX;
+
+        for (idx = 0; idx < niov; idx++)
+                cfs_kunmap(kiov[idx].kiov_page);
+
+        sent = total - suio.uio_resid;
+
+        if (err == 0)
+                return sent;
+
+        /* an interrupted or blocked send that moved data is a short send */
+        if (sent != 0 &&
+            (err == ERESTART || err == EINTR || err == EWOULDBLOCK))
+                return sent;
+
+        /* EAGAIN and EWOULDBLOCK have the same value on OS X */
+        if (err == EWOULDBLOCK)
+                return -EAGAIN;
+
+        return -err;
+}
+
+/*
+ * liang: Hack of inpcb and tcpcb.
+ *
+ * Private shadow declarations of the leading part of the kernel's TCP
+ * and Internet protocol control blocks.  They let this file reach the
+ * tcpcb of a socket and call tcp_output() to send a quick ACK (see
+ * ksocknal_lib_eager_ack()).  Only the fields up to the last one used
+ * here are declared.
+ * NOTE(review): these layouts presumably have to match the running
+ * kernel's real structures field for field -- verify per kernel version.
+ */
+struct ks_tseg_qent{
+        int foo;
+};
+
+struct ks_tcptemp{
+        int foo;
+};
+
+LIST_HEAD(ks_tsegqe_head, ks_tseg_qent);
+
+/* Leading fields of the TCP control block, up to t_flags. */
+struct ks_tcpcb {
+        struct ks_tsegqe_head t_segq;
+        int     t_dupacks;
+        struct ks_tcptemp *unused;
+        int    t_timer[4];
+        struct inpcb *t_inpcb;
+        int    t_state;
+        u_int  t_flags;
+        /*
+         * There are more fields, but we don't need them
+         * ......
+         */
+};
+
+/* t_flags bits tested and set by ksocknal_lib_eager_ack() */
+#define TF_ACKNOW       0x00001
+#define TF_DELACK       0x00002
+
+/* Leading fields of the Internet protocol control block, up to inp_ppcb. */
+struct ks_inpcb {
+        LIST_ENTRY(ks_inpcb) inp_hash;
+        struct  in_addr reserved1;
+        struct  in_addr reserved2;
+        u_short inp_fport;
+        u_short inp_lport;
+        LIST_ENTRY(inpcb) inp_list;
+        caddr_t inp_ppcb;
+        /*
+         * There are more fields, but we don't need them
+         * ......
+         */
+};
+
+/*
+ * Accessors for the shadow control blocks:
+ *   ks_sotoinpcb(so)  socket        -> struct ks_inpcb (via so_pcb)
+ *   ks_intotcpcb(ip)  struct ks_inpcb -> struct ks_tcpcb (via inp_ppcb)
+ *   ks_sototcpcb(so)  socket        -> struct ks_tcpcb, composed from the
+ *                     two ks_ accessors above rather than the kernel's own
+ *                     un-prefixed macros
+ */
+#define ks_sotoinpcb(so)   ((struct ks_inpcb *)(so)->so_pcb)
+#define ks_intotcpcb(ip)   ((struct ks_tcpcb *)(ip)->inp_ppcb)
+#define ks_sototcpcb(so)   (ks_intotcpcb(ks_sotoinpcb(so)))
+
+/*
+ * Force out a pending delayed ACK on conn->ksnc_sock.
+ *
+ * If the socket's TCP control block has TF_DELACK set, the flag is
+ * replaced by TF_ACKNOW and tcp_output() is called.  The control blocks
+ * are looked up only after the network funnel and splnet() are held, and
+ * a socket without a protocol control block is left alone.
+ *
+ * \param conn  connection whose socket should ACK immediately
+ */
+void
+ksocknal_lib_eager_ack (ksock_conn_t *conn)
+{
+        struct socket    *sock = conn->ksnc_sock;
+        struct ks_inpcb  *inp;
+        struct ks_tcpcb  *tp;
+        int               s;
+        CFS_DECL_NET_DATA;
+
+        extern int tcp_output(register struct ks_tcpcb *tp);
+
+        CFS_NET_IN;
+        s = splnet();
+
+        /* so_pcb may be NULL; never chase inp_ppcb through it */
+        inp = ks_sotoinpcb(sock);
+        tp = (inp != NULL) ? ks_intotcpcb(inp) : NULL;
+
+        if (tp != NULL && (tp->t_flags & TF_DELACK) != 0) {
+                tp->t_flags &= ~TF_DELACK;
+                tp->t_flags |= TF_ACKNOW;
+                (void) tcp_output(tp);
+        }
+        splx(s);
+
+        /*
+         * BSD has no TCP_QUICKACK.  The alternative of calling
+         * tcp_fasttimo() was not used because it sends an ACK for every
+         * delayed-ack TCP socket, not just this one.
+         */
+        CFS_NET_EX;
+
+        return;
+}
+
+/*
+ * Non-blocking receive into the iovec fragments of @conn.
+ *
+ * conn->ksnc_rx_iov is copied into a scratch iovec (a single fragment
+ * when SOCKNAL_SINGLE_FRAG_RX is set) and handed to soreceive() with
+ * MSG_DONTWAIT.  The amount received is derived from how far uio_resid
+ * dropped.
+ *
+ * \retval >= 0     number of bytes received
+ * \retval -EAGAIN  the socket would block and nothing was received
+ * \retval < 0      any other negated errno from soreceive()
+ */
+int
+ksocknal_lib_recv_iov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        int           niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        int           niov = conn->ksnc_rx_niov;
+#endif
+        struct iovec *iov = conn->ksnc_rx_iov;
+        int          total = 0;
+        int          got;
+        int          err;
+        int          idx;
+        struct uio  ruio = {
+                .uio_iov        = scratchiov,
+                .uio_iovcnt     = niov,
+                .uio_offset     = 0,
+                .uio_resid      = 0,    /* filled in below */
+                .uio_segflg     = UIO_SYSSPACE,
+                .uio_rw         = UIO_READ,
+                .uio_procp      = NULL
+        };
+        int         flags = MSG_DONTWAIT;
+        CFS_DECL_NET_DATA;
+
+        for (idx = 0; idx < niov; idx++) {
+                scratchiov[idx] = iov[idx];
+                total += scratchiov[idx].iov_len;
+        }
+        LASSERT (total <= conn->ksnc_rx_nob_wanted);
+
+        ruio.uio_resid = total;
+
+        CFS_NET_IN;
+        err = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio,
+                        (struct mbuf **)0, (struct mbuf **)0, &flags);
+        CFS_NET_EX;
+
+        got = total - ruio.uio_resid;
+
+        if (err == 0)
+                return (got);
+
+        /* data partially received before the interruption */
+        if (got != 0 &&
+            (err == ERESTART || err == EINTR || err == EWOULDBLOCK ||
+             err == EAGAIN))
+                return (got);
+
+        /* EAGAIN and EWOULDBLOCK have the same value on OS X */
+        if (err == EWOULDBLOCK)
+                return (-EAGAIN);
+
+        return (-err);
+}
+
+/*
+ * Non-blocking receive into the page fragments of @conn.
+ *
+ * Each page of conn->ksnc_rx_kiov is mapped with cfs_kmap() for the
+ * duration of the soreceive() call and unmapped afterwards.  Only one
+ * fragment is used per call when SOCKNAL_SINGLE_FRAG_RX is set or
+ * SOCKNAL_RISK_KMAP_DEADLOCK is not.
+ *
+ * \retval >= 0     number of bytes received
+ * \retval -EAGAIN  the socket would block and nothing was received
+ * \retval < 0      any other negated errno from soreceive()
+ */
+int
+ksocknal_lib_recv_kiov (ksock_conn_t *conn)
+{
+#if SOCKNAL_SINGLE_FRAG_RX || !SOCKNAL_RISK_KMAP_DEADLOCK
+        struct iovec  scratch;
+        struct iovec *scratchiov = &scratch;
+        int           niov = 1;
+#else
+        struct iovec *scratchiov = conn->ksnc_rx_scratch_iov;
+        int           niov = conn->ksnc_rx_nkiov;
+#endif
+        ptl_kiov_t    *kiov = conn->ksnc_rx_kiov;
+        int           total = 0;
+        int           got;
+        int           err;
+        int           idx;
+        struct uio  ruio = {
+                .uio_iov        = scratchiov,
+                .uio_iovcnt     = niov,
+                .uio_offset     = 0,
+                .uio_resid      = 0,
+                .uio_segflg     = UIO_SYSSPACE,
+                .uio_rw         = UIO_READ,
+                .uio_procp      = NULL
+        };
+        int         flags = MSG_DONTWAIT;
+        CFS_DECL_NET_DATA;
+
+        for (idx = 0; idx < niov; idx++) {
+                scratchiov[idx].iov_base = cfs_kmap(kiov[idx].kiov_page) +
+                                           kiov[idx].kiov_offset;
+                scratchiov[idx].iov_len = kiov[idx].kiov_len;
+                total += scratchiov[idx].iov_len;
+        }
+        LASSERT (total <= conn->ksnc_rx_nob_wanted);
+
+        ruio.uio_resid = total;
+
+        CFS_NET_IN;
+        err = soreceive(conn->ksnc_sock, (struct sockaddr **)0, &ruio,
+                        (struct mbuf **)0, NULL, &flags);
+        CFS_NET_EX;
+
+        for (idx = 0; idx < niov; idx++)
+                cfs_kunmap(kiov[idx].kiov_page);
+
+        got = total - ruio.uio_resid;
+
+        if (err == 0)
+                return (got);
+
+        /* data partially received before the interruption */
+        if (got != 0 &&
+            (err == ERESTART || err == EINTR || err == EWOULDBLOCK))
+                return (got);
+
+        /* receive blocked, EWOULDBLOCK == EAGAIN */
+        if (err == EWOULDBLOCK)
+                return (-EAGAIN);
+
+        return (-err);
+}
+
+/*
+ * Blocking write of @nob bytes from @buffer to @sock.
+ *
+ * sosend() is called with no flags.  When it fails with ERESTART, EINTR
+ * or EWOULDBLOCK after moving some data, the remainder is retried from
+ * where it stopped; any other failure ends the write.
+ *
+ * \retval 0    sosend() reported success
+ * \retval < 0  negated errno from sosend()
+ */
+int
+ksocknal_lib_sock_write (struct socket *sock, void *buffer, int nob)
+{
+        int           err;
+        int           sent;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct  uio suio = {
+                        .uio_iov        = &iov,
+                        .uio_iovcnt     = 1,
+                        .uio_offset     = 0,
+                        .uio_resid      = nob,
+                        .uio_segflg     = UIO_SYSSPACE,
+                        .uio_rw         = UIO_WRITE,
+                        .uio_procp      = NULL
+                };
+
+                CFS_NET_IN;
+                err = sosend(sock, NULL, &suio, (struct mbuf *)0,
+                             (struct mbuf *)0, 0);
+                CFS_NET_EX;
+
+                if (err == 0)
+                        break;
+
+                sent = nob - suio.uio_resid;
+
+                /* only a retryable error that made progress is resumed */
+                if (sent == 0 ||
+                    !(err == ERESTART || err == EINTR || err == EWOULDBLOCK))
+                        return -err;
+
+                buffer = ((char *)buffer) + sent;
+                nob -= sent;
+        }
+
+        return (0);
+}
+
+/*
+ * Blocking read of exactly @nob bytes from @sock into @buffer.
+ *
+ * soreceive() may hand back fewer bytes than asked for while still
+ * returning 0, so the loop keeps reading until the whole request has been
+ * satisfied.  A retryable error (ERESTART, EINTR, EWOULDBLOCK) that still
+ * delivered data is resumed from where it stopped.
+ *
+ * \retval 0              all @nob bytes were read
+ * \retval -ECONNABORTED  the peer closed the connection before @nob bytes
+ *                        arrived (soreceive() succeeded with no data)
+ * \retval < 0            negated errno from soreceive()
+ */
+int
+ksocknal_lib_sock_read (struct socket *sock, void *buffer, int nob)
+{
+        int           rc;
+        int           nread;
+        CFS_DECL_NET_DATA;
+
+        while (nob > 0) {
+                struct iovec  iov = {
+                        .iov_base = buffer,
+                        .iov_len  = nob
+                };
+                struct uio  ruio = {
+                        .uio_iov        = &iov,
+                        .uio_iovcnt     = 1,
+                        .uio_offset     = 0,
+                        .uio_resid      = nob,
+                        .uio_segflg     = UIO_SYSSPACE,
+                        .uio_rw         = UIO_READ,
+                        .uio_procp      = NULL
+                };
+
+                CFS_NET_IN;
+                rc = soreceive(sock, (struct sockaddr **)0, &ruio,
+                               (struct mbuf **)0, (struct mbuf **)0, (int *)0);
+                CFS_NET_EX;
+
+                nread = nob - ruio.uio_resid;
+
+                if (rc != 0) {
+                        /* only a retryable error with progress is resumed */
+                        if (nread == 0 ||
+                            !(rc == ERESTART || rc == EINTR ||
+                              rc == EWOULDBLOCK))
+                                return -rc;
+                } else if (nread == 0) {
+                        /* success with no data: end of stream */
+                        return -ECONNABORTED;
+                }
+
+                /* short read: advance and ask for the remainder */
+                buffer = ((char *)buffer) + nread;
+                nob -= nread;
+        }
+
+        return (0);
+}
+
+/*
+ * Read the send buffer size, receive buffer size and Nagle setting of the
+ * socket behind @conn.
+ *
+ * \param conn   connection to query
+ * \param txmem  out: SO_SNDBUF value
+ * \param rxmem  out: SO_RCVBUF value
+ * \param nagle  out: non-zero when Nagle is enabled, i.e. the inverse of
+ *               the TCP_NODELAY option
+ *
+ * All three outputs are zeroed on any failure.
+ *
+ * \retval 0           success
+ * \retval -ESHUTDOWN  the connection is closing and has no usable socket
+ * \retval < 0         negated errno from sogetopt()
+ */
+int
+ksocknal_lib_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle)
+{
+        struct sockopt  sopt;
+        struct socket  *sock;
+        int             rc;
+        CFS_DECL_NET_DATA;
+
+        rc = ksocknal_getconnsock (conn);
+        if (rc != 0) {
+                LASSERT (conn->ksnc_closing);
+                *txmem = *rxmem = *nagle = 0;
+                /* negative like every other error returned from here */
+                return (-ESHUTDOWN);
+        }
+        sock = conn->ksnc_sock;
+
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_GET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_SNDBUF;
+        sopt.sopt_val = txmem;
+        sopt.sopt_valsize = sizeof(*txmem);
+
+        CFS_NET_IN;
+        rc = sogetopt(sock, &sopt);
+        if (rc == 0) {
+                sopt.sopt_name = SO_RCVBUF;
+                sopt.sopt_val = rxmem;
+                /* restate the buffer size for every query */
+                sopt.sopt_valsize = sizeof(*rxmem);
+                rc = sogetopt(sock, &sopt);
+        }
+        if (rc == 0) {
+                sopt.sopt_level = IPPROTO_TCP;
+                sopt.sopt_name = TCP_NODELAY;
+                sopt.sopt_val = nagle;
+                sopt.sopt_valsize = sizeof(*nagle);
+                rc = sogetopt(sock, &sopt);
+        }
+        CFS_NET_EX;
+
+        ksocknal_putconnsock (conn);
+
+        if (rc == 0)
+                *nagle = !*nagle;
+        else
+                *txmem = *rxmem = *nagle = 0;
+
+        return (-rc);
+}
+
+/*
+ * Apply the socknal tunables to a newly obtained socket.
+ *
+ * In order: SO_LINGER is switched off; TCP_NODELAY is set when the nagle
+ * tunable is 0; SO_SNDBUF and SO_RCVBUF are set from the buffer_size
+ * tunable (capped at ksocknal_mbuf_size) when it is positive;
+ * SO_KEEPALIVE is switched on only if all three keepalive tunables are
+ * positive, and in that case TCP_KEEPALIVE is set to the idle time.
+ * The whole sequence runs between CFS_NET_IN and CFS_NET_EX.
+ *
+ * \param so  socket to configure
+ *
+ * \retval 0    every option was applied
+ * \retval < 0  negated errno of the first sosetopt() that failed
+ */
+int
+ksocknal_lib_setup_sock (struct socket *so)
+{
+        struct sockopt  sopt;
+        int             rc; 
+        int             option; 
+        int             keep_idle; 
+        int             keep_intvl; 
+        int             keep_count; 
+        int             do_keepalive; 
+        struct linger   linger;
+        CFS_DECL_NET_DATA;
+
+        /* Ensure this socket aborts active sends immediately when we close
+         * it. */
+
+        bzero(&sopt, sizeof sopt);
+
+        linger.l_onoff = 0;
+        linger.l_linger = 0;
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_LINGER;
+        sopt.sopt_val = &linger;
+        sopt.sopt_valsize = sizeof(linger);
+
+        CFS_NET_IN;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CERROR ("Can't set SO_LINGER: %d\n", rc);
+                goto out;
+        }
+
+
+        if (!ksocknal_tunables.ksnd_nagle) { 
+                option = 1; 
+                bzero(&sopt, sizeof sopt);
+                sopt.sopt_dir = SOPT_SET; 
+                sopt.sopt_level = IPPROTO_TCP;
+                sopt.sopt_name = TCP_NODELAY; 
+                sopt.sopt_val = &option; 
+                sopt.sopt_valsize = sizeof(option);
+                rc = sosetopt(so, &sopt);
+                if (rc != 0) { 
+                        CERROR ("Can't disable nagle: %d\n", rc); 
+                        goto out;
+                } 
+        } 
+        if (ksocknal_tunables.ksnd_buffer_size > 0) { 
+                option = ksocknal_tunables.ksnd_buffer_size; 
+                /* never ask for more than ksocknal_mbuf_size */
+                if (option > ksocknal_mbuf_size) 
+                        option = ksocknal_mbuf_size; 
+                                                
+                /* sopt is not cleared here; every field used is set below */
+                sopt.sopt_dir = SOPT_SET; 
+                sopt.sopt_level = SOL_SOCKET; 
+                sopt.sopt_name = SO_SNDBUF; 
+                sopt.sopt_val = &option; 
+                sopt.sopt_valsize = sizeof(option); 
+                rc = sosetopt(so, &sopt); 
+                if (rc != 0) { 
+                        CERROR ("Can't set send buffer %d: %d\n", 
+                                        option, rc); 
+                        goto out;
+                } 
+                
+                /* same value and size, only the option name changes */
+                sopt.sopt_name = SO_RCVBUF; 
+                rc = sosetopt(so, &sopt); 
+                if (rc != 0) { 
+                        CERROR ("Can't set receive buffer %d: %d\n", 
+                                        option, rc); 
+                        goto out;
+                }
+        } 
+        /* snapshot tunables */ 
+        keep_idle  = ksocknal_tunables.ksnd_keepalive_idle; 
+        keep_count = ksocknal_tunables.ksnd_keepalive_count; 
+        keep_intvl = ksocknal_tunables.ksnd_keepalive_intvl;
+
+        /* keepalive is enabled only when all three tunables are positive */
+        do_keepalive = (keep_idle > 0 && keep_count > 0 && keep_intvl > 0); 
+        option = (do_keepalive ? 1 : 0); 
+        bzero(&sopt, sizeof sopt); 
+        sopt.sopt_dir = SOPT_SET; 
+        sopt.sopt_level = SOL_SOCKET; 
+        sopt.sopt_name = SO_KEEPALIVE; 
+        sopt.sopt_val = &option; 
+        sopt.sopt_valsize = sizeof(option); 
+        rc = sosetopt(so, &sopt); 
+        if (rc != 0) { 
+                CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); 
+                goto out; 
+        }
+        
+        if (!do_keepalive) { 
+                /* no more setting, just return */
+                rc = 0;
+                goto out;
+        } 
+        
+        /* only the idle time is applied; keep_count and keep_intvl are
+         * not passed to the socket by this function */
+        bzero(&sopt, sizeof sopt); 
+        sopt.sopt_dir = SOPT_SET; 
+        sopt.sopt_level = IPPROTO_TCP; 
+        sopt.sopt_name = TCP_KEEPALIVE; 
+        sopt.sopt_val = &keep_idle; 
+        sopt.sopt_valsize = sizeof(keep_idle); 
+        rc = sosetopt(so, &sopt); 
+        if (rc != 0) { 
+                CERROR ("Can't set TCP_KEEPALIVE : %d\n", rc); 
+                goto out; 
+        }
+out:
+        CFS_NET_EX;
+        return (-rc);
+}
+
+/*
+ * Create a TCP socket, bind it to @local_port (and to the route's local
+ * IP when one is set) and connect it to the peer described by @route.
+ *
+ * Send and receive timeouts are set from the io_timeout tunable and
+ * SO_REUSEADDR is enabled before binding.  After soconnect() the caller
+ * sleeps until the socket leaves SS_ISCONNECTING or reports an error.
+ *
+ * \param sockp      out: the new socket; NULL if it could not be created
+ * \param may_retry  out: 1 when the failure was EADDRINUSE on bind, or
+ *                   EADDRNOTAVAIL/EADDRINUSE on connect, else 0
+ * \param route      peer address, port and optional local IP
+ * \param local_port local port to bind to
+ *
+ * \retval 0    connected
+ * \retval < 0  negated errno of the step that failed
+ */
+int
+ksocknal_lib_connect_sock (struct socket **sockp, int *may_retry, 
+                           ksock_route_t *route, int local_port)
+{
+        struct sockaddr_in  locaddr;
+        struct sockaddr_in  srvaddr;
+        struct timeval      tv;
+        int                 fd;
+        struct socket      *so = NULL;
+        struct sockopt      sopt;
+        int                 option;
+        int                 rc;
+        int                 s;
+        CFS_DECL_FUNNEL_DATA;
+
+        ENTRY;
+        *sockp = NULL;
+        *may_retry = 0;
+
+        bzero (&locaddr, sizeof (locaddr));
+        locaddr.sin_len = sizeof(struct sockaddr_in);
+        locaddr.sin_family = AF_INET;
+        locaddr.sin_port = htons (local_port);
+        locaddr.sin_addr.s_addr =
+                (route->ksnr_myipaddr != 0) ? htonl(route->ksnr_myipaddr)
+                                            : INADDR_ANY;
+        bzero(&srvaddr, sizeof(srvaddr));
+        srvaddr.sin_len = sizeof(struct sockaddr_in);
+        srvaddr.sin_family = AF_INET;
+        srvaddr.sin_port = htons (route->ksnr_port);
+        srvaddr.sin_addr.s_addr = htonl (route->ksnr_ipaddr);
+
+        CFS_NET_IN;
+        rc = socreate(PF_INET, &so, SOCK_STREAM, 0);
+        CFS_NET_EX;
+        if (rc != 0) {
+                CERROR ("Can't create autoconnect socket: %d\n", rc);
+                return (-rc);
+        }
+        /* publish the socket only once it really exists */
+        *sockp = so;
+
+        /*
+         * XXX
+         * Liang: what do we need here? 
+         */
+        fd = sock_map_fd (so);
+        if (fd < 0) {
+                sock_release (so);
+                CERROR ("sock_map_fd error %d\n", fd);
+                return (fd);
+        }
+        sock_fdrelse(fd);
+
+        /* Set the socket timeouts, so our connection attempt completes in
+         * finite time */
+        tv.tv_sec = ksocknal_tunables.ksnd_io_timeout;
+        tv.tv_usec = 0;
+        bzero(&sopt, sizeof sopt);
+        sopt.sopt_dir = SOPT_SET;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_SNDTIMEO;
+        sopt.sopt_val = &tv;
+        sopt.sopt_valsize = sizeof(tv);
+
+        CFS_NET_IN;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set send timeout %d: %d\n",
+                        ksocknal_tunables.ksnd_io_timeout, rc);
+                goto out;
+        }
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_RCVTIMEO;
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set receive timeout %d: %d\n",
+                        ksocknal_tunables.ksnd_io_timeout, rc);
+                goto out;
+        }
+        option = 1;
+        sopt.sopt_level = SOL_SOCKET;
+        sopt.sopt_name = SO_REUSEADDR;
+        sopt.sopt_val = &option;
+        sopt.sopt_valsize = sizeof(option);
+        rc = sosetopt(so, &sopt);
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't set sock reuse address: %d\n", rc);
+                goto out;
+        }
+        rc = sobind(so, (struct sockaddr *)&locaddr);
+        if (rc == EADDRINUSE) {
+                CFS_NET_EX;
+                CDEBUG(D_NET, "Port %d already in use\n", local_port);
+                *may_retry = 1;
+                goto out;
+        }
+        if (rc != 0) {
+                CFS_NET_EX;
+                CERROR ("Can't bind to local IP Address %u.%u.%u.%u: %d\n",
+                        HIPQUAD(route->ksnr_myipaddr), rc);
+                goto out;
+        }
+        rc = soconnect(so, (struct sockaddr *)&srvaddr);
+        *may_retry = (rc == EADDRNOTAVAIL || rc == EADDRINUSE);
+        if (rc != 0) {
+                CFS_NET_EX;
+                if (rc != EADDRNOTAVAIL && rc != EADDRINUSE)
+                        CERROR ("Can't connect to nid "LPX64
+                                " local IP: %u.%u.%u.%u,"
+                                " remote IP: %u.%u.%u.%u/%d: %d\n",
+                                route->ksnr_peer->ksnp_nid,
+                                HIPQUAD(route->ksnr_myipaddr),
+                                HIPQUAD(route->ksnr_ipaddr),
+                                route->ksnr_port, rc);
+                goto out;
+        }
+
+        s = splnet();
+        while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+                CDEBUG(D_NET, "ksocknal sleep for waiting auto_connect.\n");
+                (void) tsleep((caddr_t)&so->so_timeo, PSOCK, "ksocknal_conn", hz);
+        }
+        /* sample the result while the socket state cannot change */
+        rc = so->so_error;
+        /* a refused or timed-out connect is an error to report, not an
+         * invariant violation: only a clean finish must be connected */
+        LASSERT (rc != 0 || (so->so_state & SS_ISCONNECTED));
+        splx(s);
+        CFS_NET_EX;
+
+        if (rc != 0) {
+                CERROR ("Error %d waiting for connection to nid "LPX64
+                        " local IP: %u.%u.%u.%u,"
+                        " remote IP: %u.%u.%u.%u/%d: %d\n", rc,
+                        route->ksnr_peer->ksnp_nid,
+                        HIPQUAD(route->ksnr_myipaddr),
+                        HIPQUAD(route->ksnr_ipaddr),
+                        route->ksnr_port, rc);
+                goto out;
+        }
+        return (0);
+
+ out:
+        rele_file(KSN_SOCK2FILE(so));
+
+        return (-rc);
+}
+
+/*
+ * Set TCP_NODELAY to 1 on the socket behind @conn.  Nothing is done when
+ * the connection is already shutting down, and the sosetopt() result is
+ * not examined.
+ */
+void
+ksocknal_lib_push_conn(ksock_conn_t *conn)
+{
+        struct sockopt  sopt;
+        int             nodelay = 1;
+        CFS_DECL_NET_DATA;
+
+        if (ksocknal_getconnsock (conn) != 0)   /* being shut down */
+                return;
+
+        bzero(&sopt, sizeof(sopt));
+        sopt.sopt_dir     = SOPT_SET;
+        sopt.sopt_level   = IPPROTO_TCP;
+        sopt.sopt_name    = TCP_NODELAY;
+        sopt.sopt_val     = &nodelay;
+        sopt.sopt_valsize = sizeof(nodelay);
+
+        CFS_NET_IN;
+        sosetopt(conn->ksnc_sock, &sopt);
+        CFS_NET_EX;
+
+        ksocknal_putconnsock (conn);
+}
+
+extern void ksocknal_read_callback (ksock_conn_t *conn);
+extern void ksocknal_write_callback (ksock_conn_t *conn);
+
+/*
+ * Socket upcall: turn a socket event into socknal read/write callbacks.
+ *
+ * The connection is taken from so->reserved3 under the global read lock;
+ * a socket with no connection attached is ignored.  The receive side is
+ * examined when SB_UPCALL is set on so_rcv, the send side when it is set
+ * on so_snd; a NULL @arg (as passed by ksocknal_lib_act_callback())
+ * forces both checks.  Readiness is sampled inside the network funnel and
+ * the callbacks are made after leaving it.
+ *
+ * \param so     socket the event is for
+ * \param arg    upcall argument; only tested for NULL
+ * \param waitf  unused
+ */
+static void
+ksocknal_upcall(struct socket *so, caddr_t arg, int waitf)
+{
+        extern int soreadable(struct socket *so);
+        extern int sowriteable(struct socket *so);
+        ksock_conn_t  *conn;
+        int            forced = (arg == NULL);
+        int            ready;
+        CFS_DECL_NET_DATA;
+        ENTRY;
+
+        read_lock (&ksocknal_data.ksnd_global_lock);
+        conn = so->reserved3;
+
+        if (conn != NULL) {
+                if (forced || (so->so_rcv.sb_flags & SB_UPCALL)) {
+                        /* is the upcall really for receive? */
+                        CFS_NET_IN;
+                        ready = conn->ksnc_rx_nob_wanted && soreadable(so);
+                        CFS_NET_EX;
+                        if (ready)
+                                ksocknal_read_callback (conn);
+                }
+
+                if (forced || (so->so_snd.sb_flags & SB_UPCALL)) {
+                        CFS_NET_IN;
+                        ready = sowriteable(so);
+                        CFS_NET_EX;
+                        if (ready)
+                                ksocknal_write_callback(conn);
+                }
+        }
+
+        read_unlock (&ksocknal_data.ksnd_global_lock);
+
+        EXIT;
+}
+
+/*
+ * Hook for saving a socket's original callbacks; there is nothing to
+ * save on OS X, so this does nothing.
+ */
+void
+ksocknal_lib_save_callback(struct socket *sock, ksock_conn_t *conn)
+{
+        (void)sock;
+        (void)conn;
+}
+
+/*
+ * Install ksocknal_upcall() as the upcall of @sock and attach @conn.
+ *
+ * Inside the network funnel: the upcall and a non-NULL upcall argument
+ * are set, the send timeout is cleared, the receive timeout is set to
+ * 2 * HZ, SB_UPCALL is raised on both socket buffers, and @conn is stored
+ * in sock->reserved3, where ksocknal_upcall() looks it up.
+ */
+void
+ksocknal_lib_set_callback(struct socket *sock, ksock_conn_t *conn)
+{ 
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        sock->so_upcallarg = (void *)sock;  /* anything not NULL */ 
+        sock->so_upcall = ksocknal_upcall; 
+        sock->so_snd.sb_timeo = 0; 
+        sock->so_rcv.sb_timeo = 2 * HZ; 
+        sock->so_rcv.sb_flags |= SB_UPCALL; 
+        sock->so_snd.sb_flags |= SB_UPCALL; 
+        sock->reserved3 = conn;
+        CFS_NET_EX;
+        return;
+}
+
+/*
+ * Run the upcall by hand for @sock.  The NULL argument makes
+ * ksocknal_upcall() check both the receive and the send side; the upcall
+ * takes the network funnel itself.
+ */
+void
+ksocknal_lib_act_callback(struct socket *sock)
+{
+        ksocknal_upcall (sock, NULL, 0);
+}
+
+/*
+ * Undo ksocknal_lib_set_callback() on @sock.
+ *
+ * Inside the network funnel: the upcall and its argument are cleared,
+ * SB_UPCALL is dropped from both socket buffers, and the connection
+ * pointer kept in sock->reserved3 is cleared so that a later
+ * ksocknal_upcall() on this socket finds no connection instead of a
+ * stale one.
+ *
+ * \param sock  socket to detach
+ * \param conn  connection being detached (not otherwise used)
+ */
+void 
+ksocknal_lib_reset_callback(struct socket *sock, ksock_conn_t *conn)
+{
+        CFS_DECL_NET_DATA;
+
+        CFS_NET_IN;
+        sock->so_upcall = NULL;
+        sock->so_upcallarg = NULL;
+        sock->so_rcv.sb_flags &= ~SB_UPCALL;
+        sock->so_snd.sb_flags &= ~SB_UPCALL;
+        /* cleared last, mirroring the order used when it was attached */
+        sock->reserved3 = NULL;
+        CFS_NET_EX;
+}
+
+