Whamcloud - gitweb
LU-14945 lnet: don't use hops to determine the route state
[fs/lustre-release.git] / lnet / lnet / lib-socket.c
index 5348ac0..90cdc3e 100644 (file)
  *
  * You should have received a copy of the GNU General Public License
  * version 2 along with this program; If not, see
- * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
- *
- * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
- * CA 95054 USA or visit www.sun.com if you need additional information or
- * have any questions.
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
  * GPL HEADER END
  */
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  *
- * Copyright (c) 2012, 2014, Intel Corporation.
+ * Copyright (c) 2015, 2017, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  */
 #define DEBUG_SUBSYSTEM S_LNET
 
-#ifdef HAVE_COMPAT_RDMA
-#include <linux/compat-2.6.h>
-#endif
 #include <linux/if.h>
 #include <linux/in.h>
 #include <linux/net.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
 #include <linux/file.h>
 #include <linux/pagemap.h>
 /* For sys_open & sys_close */
 #include <linux/syscalls.h>
 #include <net/sock.h>
+#include <linux/inetdevice.h>
 
+#include <libcfs/linux/linux-time.h>
+#include <libcfs/linux/linux-net.h>
 #include <libcfs/libcfs.h>
 #include <lnet/lib-lnet.h>
 
-static int
-kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg)
-{
-       mm_segment_t oldfs = get_fs();
-       int err;
-
-       set_fs(KERNEL_DS);
-       err = filp->f_op->unlocked_ioctl(filp, cmd, arg);
-       set_fs(oldfs);
-
-       return err;
-}
-
-static int
-lnet_sock_ioctl(int cmd, unsigned long arg)
-{
-       struct file    *sock_filp;
-       struct socket  *sock;
-       int             fd = -1;
-       int             rc;
-
-       rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-       if (rc != 0) {
-               CERROR("Can't create socket: %d\n", rc);
-               return rc;
-       }
-
-#if !defined(HAVE_SOCK_ALLOC_FILE) && !defined(HAVE_SOCK_ALLOC_FILE_3ARGS)
-       fd = sock_map_fd(sock, 0);
-       if (fd < 0) {
-               rc = fd;
-               sock_release(sock);
-               goto out;
-       }
-       sock_filp = fget(fd);
-#else
-# ifdef HAVE_SOCK_ALLOC_FILE_3ARGS
-       sock_filp = sock_alloc_file(sock, 0, NULL);
-# else
-       sock_filp = sock_alloc_file(sock, 0);
-# endif
-#endif
-       if (IS_ERR(sock_filp)) {
-               rc = PTR_ERR(sock_filp);
-               sock_release(sock);
-               goto out;
-       }
-
-       rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg);
-
-       fput(sock_filp);
-out:
-       if (fd >= 0)
-               sys_close(fd);
-       return rc;
-}
-
-int
-lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask)
-{
-       struct ifreq    ifr;
-       int             nob;
-       int             rc;
-       __u32           val;
-
-       nob = strnlen(name, IFNAMSIZ);
-       if (nob == IFNAMSIZ) {
-               CERROR("Interface name %s too long\n", name);
-               return -EINVAL;
-       }
-
-       CLASSERT(sizeof(ifr.ifr_name) >= IFNAMSIZ);
-
-       if (strlen(name) > sizeof(ifr.ifr_name)-1)
-               return -E2BIG;
-       strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
-
-       rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr);
-       if (rc != 0) {
-               CERROR("Can't get flags for interface %s\n", name);
-               return rc;
-       }
-
-       if ((ifr.ifr_flags & IFF_UP) == 0) {
-               CDEBUG(D_NET, "Interface %s down\n", name);
-               *up = 0;
-               *ip = *mask = 0;
-               return 0;
-       }
-       *up = 1;
-
-       if (strlen(name) > sizeof(ifr.ifr_name)-1)
-               return -E2BIG;
-       strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
-
-       ifr.ifr_addr.sa_family = AF_INET;
-       rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr);
-
-       if (rc != 0) {
-               CERROR("Can't get IP address for interface %s\n", name);
-               return rc;
-       }
-
-       val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr;
-       *ip = ntohl(val);
-
-       if (strlen(name) > sizeof(ifr.ifr_name)-1)
-               return -E2BIG;
-       strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
-
-       ifr.ifr_addr.sa_family = AF_INET;
-       rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr);
-       if (rc != 0) {
-               CERROR("Can't get netmask for interface %s\n", name);
-               return rc;
-       }
-
-       val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr;
-       *mask = ntohl(val);
-
-       return 0;
-}
-EXPORT_SYMBOL(lnet_ipif_query);
-
-void
-lnet_ipif_free_enumeration(char **names, int n)
-{
-       int     i;
-
-       LASSERT(n > 0);
-
-       for (i = 0; i < n && names[i] != NULL; i++)
-               LIBCFS_FREE(names[i], IFNAMSIZ);
-
-       LIBCFS_FREE(names, n * sizeof(*names));
-}
-EXPORT_SYMBOL(lnet_ipif_free_enumeration);
-
-int
-lnet_ipif_enumerate(char ***namesp)
-{
-       /* Allocate and fill in 'names', returning # interfaces/error */
-       char          **names;
-       int             toobig;
-       int             nalloc;
-       int             nfound;
-       struct ifreq   *ifr;
-       struct ifconf   ifc;
-       int             rc;
-       int             nob;
-       int             i;
-
-       nalloc = 16;    /* first guess at max interfaces */
-       toobig = 0;
-       for (;;) {
-               if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) {
-                       toobig = 1;
-                       nalloc = PAGE_CACHE_SIZE/sizeof(*ifr);
-                       CWARN("Too many interfaces: only enumerating "
-                             "first %d\n", nalloc);
-               }
-
-               LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr));
-               if (ifr == NULL) {
-                       CERROR("ENOMEM enumerating up to %d interfaces\n",
-                              nalloc);
-                       rc = -ENOMEM;
-                       goto out0;
-               }
-
-               ifc.ifc_buf = (char *)ifr;
-               ifc.ifc_len = nalloc * sizeof(*ifr);
-
-               rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc);
-               if (rc < 0) {
-                       CERROR("Error %d enumerating interfaces\n", rc);
-                       goto out1;
-               }
-
-               LASSERT(rc == 0);
-
-               nfound = ifc.ifc_len/sizeof(*ifr);
-               LASSERT(nfound <= nalloc);
-
-               if (nfound < nalloc || toobig)
-                       break;
-
-               LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
-               nalloc *= 2;
-       }
-
-       if (nfound == 0)
-               goto out1;
-
-       LIBCFS_ALLOC(names, nfound * sizeof(*names));
-       if (names == NULL) {
-               rc = -ENOMEM;
-               goto out1;
-       }
-
-       for (i = 0; i < nfound; i++) {
-               nob = strnlen(ifr[i].ifr_name, IFNAMSIZ);
-               if (nob == IFNAMSIZ) {
-                       /* no space for terminating NULL */
-                       CERROR("interface name %.*s too long (%d max)\n",
-                              nob, ifr[i].ifr_name, IFNAMSIZ);
-                       rc = -ENAMETOOLONG;
-                       goto out2;
-               }
-
-               LIBCFS_ALLOC(names[i], IFNAMSIZ);
-               if (names[i] == NULL) {
-                       rc = -ENOMEM;
-                       goto out2;
-               }
-
-               memcpy(names[i], ifr[i].ifr_name, nob);
-               names[i][nob] = 0;
-       }
-
-       *namesp = names;
-       rc = nfound;
-
- out2:
-       if (rc < 0)
-               lnet_ipif_free_enumeration(names, nfound);
- out1:
-       LIBCFS_FREE(ifr, nalloc * sizeof(*ifr));
- out0:
-       return rc;
-}
-EXPORT_SYMBOL(lnet_ipif_enumerate);
-
 int
 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
 {
-       int             rc;
-       long            jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC);
-       unsigned long   then;
-       struct timeval  tv;
+       int rc;
+       long jiffies_left = cfs_time_seconds(timeout);
+       unsigned long then;
 
        LASSERT(nob > 0);
        /* Caller may pass a zero timeout if she thinks the socket buffer is
@@ -305,24 +68,12 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
                };
 
                if (timeout != 0) {
+                       struct sock *sk = sock->sk;
+
                        /* Set send timeout to remaining time */
-                       tv = (struct timeval) {
-                               .tv_sec = jiffies_left /
-                                         msecs_to_jiffies(MSEC_PER_SEC),
-                               .tv_usec = ((jiffies_left %
-                                            msecs_to_jiffies(MSEC_PER_SEC)) *
-                                            USEC_PER_SEC) /
-                                            msecs_to_jiffies(MSEC_PER_SEC)
-                       };
-
-                       rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO,
-                                              (char *)&tv, sizeof(tv));
-                       if (rc != 0) {
-                               CERROR("Can't set socket send timeout "
-                                      "%ld.%06d: %d\n",
-                                      (long)tv.tv_sec, (int)tv.tv_usec, rc);
-                               return rc;
-                       }
+                       lock_sock(sk);
+                       sk->sk_sndtimeo = jiffies_left;
+                       release_sock(sk);
                }
 
                then = jiffies;
@@ -353,10 +104,9 @@ EXPORT_SYMBOL(lnet_sock_write);
 int
 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
 {
-       int             rc;
-       long            jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC);
-       unsigned long   then;
-       struct timeval  tv;
+       int rc;
+       long jiffies_left = cfs_time_seconds(timeout);
+       unsigned long then;
 
        LASSERT(nob > 0);
        LASSERT(jiffies_left > 0);
@@ -369,22 +119,12 @@ lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
                struct msghdr msg = {
                        .msg_flags      = 0
                };
+               struct sock *sk = sock->sk;
 
                /* Set receive timeout to remaining time */
-               tv = (struct timeval) {
-                       .tv_sec = jiffies_left / msecs_to_jiffies(MSEC_PER_SEC),
-                       .tv_usec = ((jiffies_left %
-                                       msecs_to_jiffies(MSEC_PER_SEC)) *
-                                       USEC_PER_SEC) /
-                                       msecs_to_jiffies(MSEC_PER_SEC)
-               };
-               rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO,
-                                      (char *)&tv, sizeof(tv));
-               if (rc != 0) {
-                       CERROR("Can't set socket recv timeout %ld.%06d: %d\n",
-                              (long)tv.tv_sec, (int)tv.tv_usec, rc);
-                       return rc;
-               }
+               lock_sock(sk);
+               sk->sk_rcvtimeo = jiffies_left;
+               release_sock(sk);
 
                then = jiffies;
                rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
@@ -408,45 +148,158 @@ lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
 }
 EXPORT_SYMBOL(lnet_sock_read);
 
-static int
-lnet_sock_create(struct socket **sockp, int *fatal,
-                __u32 local_ip, int local_port)
+/*
+ * Choose a local IPv4 address on the interface with index @interface in
+ * network namespace @ns to use as the source address towards @dst_ipaddr.
+ *
+ * @dst_ipaddr and the address stored in *@ret are both in host byte order.
+ * The first address found on the interface is taken as a fallback; any
+ * later address whose subnet (ifa_local/ifa_mask) contains @dst_ipaddr
+ * replaces it, so the last matching address wins.
+ *
+ * Returns 0 with *@ret set, -EINVAL if the interface does not exist, is not
+ * IFF_UP or has no in_device attached, or -ENOENT if the interface has no
+ * IPv4 addresses at all.  *@ret is left untouched on error.
+ */
+int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
 {
-       struct sockaddr_in  locaddr;
-       struct socket      *sock;
-       int                 rc;
-       int                 option;
+       struct net_device *dev;
+       struct in_device *in_dev;
+       int err;
+       DECLARE_CONST_IN_IFADDR(ifa);
 
-       /* All errors are fatal except bind failure if the port is in use */
-       *fatal = 1;
+       /* The device lookup and the address walk below both run inside
+        * this RCU read-side section.
+        */
+       rcu_read_lock();
+       dev = dev_get_by_index_rcu(ns, interface);
+       err = -EINVAL;
+       if (!dev || !(dev->flags & IFF_UP))
+               goto out;
+       in_dev = __in_dev_get_rcu(dev);
+       if (!in_dev)
+               goto out;
+       /* err stays non-zero until the first address has been accepted */
+       err = -ENOENT;
+       in_dev_for_each_ifa_rcu(ifa, in_dev) {
+               if (err ||
+                   ((dst_ipaddr ^ ntohl(ifa->ifa_local))
+                    & ntohl(ifa->ifa_mask)) == 0) {
+                       /* This address is at least as good as what we
+                        * already have
+                        */
+                       *ret = ntohl(ifa->ifa_local);
+                       err = 0;
+               }
+       }
+       endfor_ifa(in_dev);
+out:
+       rcu_read_unlock();
+       return err;
+}
+EXPORT_SYMBOL(choose_ipv4_src);
 
-       rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock);
-       *sockp = sock;
-       if (rc != 0) {
-               CERROR("Can't create socket: %d\n", rc);
-               return rc;
+static struct socket *
+lnet_sock_create(int interface, struct sockaddr *remaddr,
+                int local_port, struct net *ns)
+{
+       struct socket *sock;
+       int rc;
+       int family;
+
+       family = AF_INET6;
+       if (remaddr)
+               family = remaddr->sa_family;
+retry:
+#ifdef HAVE_SOCK_CREATE_KERN_USE_NET
+       rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
+#else
+       rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
+#endif
+       if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
+               family = AF_INET;
+               goto retry;
        }
 
-       option = 1;
-       rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
-                              (char *)&option, sizeof(option));
-       if (rc != 0) {
-               CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc);
-               goto failed;
+       if (rc) {
+               CERROR("Can't create socket: %d\n", rc);
+               return ERR_PTR(rc);
        }
 
-       if (local_ip != 0 || local_port != 0) {
-               memset(&locaddr, 0, sizeof(locaddr));
-               locaddr.sin_family = AF_INET;
-               locaddr.sin_port = htons(local_port);
-               locaddr.sin_addr.s_addr = (local_ip == 0) ?
-                                         INADDR_ANY : htonl(local_ip);
+       sock->sk->sk_reuseport = 1;
+
+       if (interface >= 0 || local_port != 0) {
+               struct sockaddr_storage locaddr = {};
 
+               switch (family) {
+               case AF_INET: {
+                       struct sockaddr_in *sin = (void *)&locaddr;
+
+                       sin->sin_family = AF_INET;
+                       sin->sin_addr.s_addr = INADDR_ANY;
+                       if (interface >= 0 && remaddr) {
+                               struct sockaddr_in *rem = (void *)remaddr;
+                               __u32 ip;
+
+                               rc = choose_ipv4_src(&ip,
+                                                    interface,
+                                                    ntohl(rem->sin_addr.s_addr),
+                                                    ns);
+                               if (rc)
+                                       goto failed;
+                               sin->sin_addr.s_addr = htonl(ip);
+                       }
+                       sin->sin_port = htons(local_port);
+                       break;
+               }
+#if IS_ENABLED(CONFIG_IPV6)
+               case AF_INET6: {
+                       struct sockaddr_in6 *sin6 = (void *)&locaddr;
+                       int val = 0;
+
+                       sin6->sin6_family = AF_INET6;
+                       sin6->sin6_addr = in6addr_any;
+
+                       /* Make sure we get both IPv4 and IPv6 connections.
+                        * This is the default, but it can be overridden so we
+                        * force it back.
+                        */
+#ifdef HAVE_KERNEL_SETSOCKOPT
+                       kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+                                         (char *) &val, sizeof(val));
+#elif defined(_LINUX_SOCKPTR_H)
+                       /* sockptr_t was introduced around
+                        * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
+                        * kernel address to be passed to ->setsockopt
+                        */
+                       if (ipv6_only_sock(sock->sk)) {
+                               sockptr_t optval = KERNEL_SOCKPTR(&val);
+
+                               sock->ops->setsockopt(sock,
+                                                     IPPROTO_IPV6, IPV6_V6ONLY,
+                                                     optval, sizeof(val));
+                       }
+#else
+                       /* From v5.7-rc6-2614-g5a892ff2facb when
+                        * kernel_setsockopt() was removed until
+                        * sockptr_t (above) there is no clean way to
+                        * pass kernel address to setsockopt.  We could
+                        * use get_fs()/set_fs(), but in this particular
+                        * situation there is an easier way.  It depends
+                        * on the fact that at least for these few
+                        * kernels a NULL address to ipv6_setsockopt()
+                        * is treated like the address of a zero.
+                        */
+                       if (ipv6_only_sock(sock->sk) && !val) {
+                               void *optval = NULL;
+
+                               sock->ops->setsockopt(sock,
+                                                     IPPROTO_IPV6, IPV6_V6ONLY,
+                                                     optval, sizeof(val));
+                       }
+#endif /* HAVE_KERNEL_SETSOCKOPT */
+
+                       if (interface >= 0 && remaddr) {
+                               struct sockaddr_in6 *rem = (void *)remaddr;
+
+                               ipv6_dev_get_saddr(ns,
+                                                  dev_get_by_index(ns,
+                                                                   interface),
+                                                  &rem->sin6_addr, 0,
+                                                  &sin6->sin6_addr);
+                       }
+                       sin6->sin6_port = htons(local_port);
+                       break;
+               }
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+               }
                rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
                                 sizeof(locaddr));
                if (rc == -EADDRINUSE) {
                        CDEBUG(D_NET, "Port %d already in use\n", local_port);
-                       *fatal = 0;
                        goto failed;
                }
                if (rc != 0) {
@@ -455,186 +308,127 @@ lnet_sock_create(struct socket **sockp, int *fatal,
                        goto failed;
                }
        }
-       return 0;
+       return sock;
 
 failed:
        sock_release(sock);
-       return rc;
+       return ERR_PTR(rc);
 }
 
-int
+/*
+ * Set the send and/or receive buffer size of @sock by writing the values
+ * straight into its struct sock.
+ *
+ * @txbufsize: new sk_sndbuf value, or 0 to leave the send buffer alone.
+ * @rxbufsize: new sk_rcvbuf value, or 0 to leave the receive buffer alone.
+ *
+ * For each buffer that is changed the matching SOCK_SNDBUF_LOCK /
+ * SOCK_RCVBUF_LOCK bit is set in sk_userlocks to record that the size was
+ * requested explicitly.  Nothing can fail here, hence no return value.
+ */
+void
 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
 {
-       int                 option;
-       int                 rc;
+       struct sock *sk = sock->sk;
 
        if (txbufsize != 0) {
-               option = txbufsize;
-               rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
-                                      (char *)&option, sizeof(option));
-               if (rc != 0) {
-                       CERROR("Can't set send buffer %d: %d\n",
-                               option, rc);
-                       return rc;
-               }
+               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+               sk->sk_sndbuf = txbufsize;
+               /* Let any writer blocked on the old limit re-check it */
+               sk->sk_write_space(sk);
        }
 
        if (rxbufsize != 0) {
-               option = rxbufsize;
-               rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
-                                      (char *)&option, sizeof(option));
-               if (rc != 0) {
-                       CERROR("Can't set receive buffer %d: %d\n",
-                               option, rc);
-                       return rc;
-               }
+               sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+               /* Receive size belongs in sk_rcvbuf; writing it to
+                * sk_sndbuf would clobber the send size set above and
+                * leave the receive buffer unchanged.
+                */
+               sk->sk_rcvbuf = rxbufsize;
        }
-       return 0;
 }
 EXPORT_SYMBOL(lnet_sock_setbuf);
 
+/*
+ * Fetch the address of one end of @sock into *@peer: the remote end when
+ * @remote is true, the local end otherwise.
+ *
+ * If the address comes back as AF_INET6 but is an IPv4-mapped address, it
+ * is rewritten in place as a struct sockaddr_in (AF_INET) carrying the same
+ * port and the embedded IPv4 address.
+ *
+ * Returns 0 on success, or the negative value returned by the getname
+ * helper (which is also logged).
+ */
 int
-lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port)
+lnet_sock_getaddr(struct socket *sock, bool remote,
+                 struct sockaddr_storage *peer)
 {
-       struct sockaddr_in sin;
-       int                len = sizeof(sin);
-       int                rc;
+       int rc;
+       /* NOTE(review): len only exists without HAVE_KERN_SOCK_GETNAME_2ARGS,
+        * yet &len is passed unconditionally below - presumably the
+        * lnet_kernel_get*name() wrappers are macros that drop the argument
+        * in the 2-args case; confirm.
+        */
+#ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
+       int len = sizeof(*peer);
+#endif
 
        if (remote)
-               rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &len);
+               rc = lnet_kernel_getpeername(sock,
+                                            (struct sockaddr *)peer, &len);
        else
-               rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &len);
-       if (rc != 0) {
+               rc = lnet_kernel_getsockname(sock,
+                                            (struct sockaddr *)peer, &len);
+       if (rc < 0) {
                CERROR("Error %d getting sock %s IP/port\n",
                        rc, remote ? "peer" : "local");
                return rc;
        }
-
-       if (ip != NULL)
-               *ip = ntohl(sin.sin_addr.s_addr);
-
-       if (port != NULL)
-               *port = ntohs(sin.sin_port);
-
+       if (peer->ss_family == AF_INET6) {
+               /* in6 and in alias the same storage, so the port is saved
+                * before the memset() below wipes it.
+                */
+               struct sockaddr_in6 *in6 = (void *)peer;
+               struct sockaddr_in *in = (void *)peer;
+               short port = in6->sin6_port;
+
+               if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
+                       /* Pretend it is a v4 socket */
+                       memset(in, 0, sizeof(*in));
+                       in->sin_family = AF_INET;
+                       in->sin_port = port;
+                       memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
+               }
+       }
        return 0;
 }
 EXPORT_SYMBOL(lnet_sock_getaddr);
 
-int
-lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
+/*
+ * Report the current buffer sizes of @sock: sk_sndbuf into *@txbufsize and
+ * sk_rcvbuf into *@rxbufsize.  Either output pointer may be NULL, in which
+ * case that value is simply not reported.
+ */
+void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
 {
        if (txbufsize != NULL)
                *txbufsize = sock->sk->sk_sndbuf;
 
        if (rxbufsize != NULL)
                *rxbufsize = sock->sk->sk_rcvbuf;
-
-       return 0;
 }
 EXPORT_SYMBOL(lnet_sock_getbuf);
 
-int
-lnet_sock_listen(struct socket **sockp,
-                  __u32 local_ip, int local_port, int backlog)
+/*
+ * Create a socket in namespace @ns bound to @local_port and put it into
+ * the listening state with the given @backlog.
+ *
+ * The socket is created through lnet_sock_create() with no interface (-1)
+ * and no remote address (NULL).
+ *
+ * Returns the listening socket, or an ERR_PTR() carrying the error from
+ * lnet_sock_create() or kernel_listen().  If kernel_listen() fails the
+ * socket is released before returning.
+ */
+struct socket *
+lnet_sock_listen(int local_port, int backlog, struct net *ns)
 {
-       int      fatal;
-       int      rc;
+       struct socket *sock;
+       int rc;
 
-       rc = lnet_sock_create(sockp, &fatal, local_ip, local_port);
-       if (rc != 0) {
-               if (!fatal)
+       sock = lnet_sock_create(-1, NULL, local_port, ns);
+       if (IS_ERR(sock)) {
+               rc = PTR_ERR(sock);
+               /* Only the port-in-use case gets its own message here */
+               if (rc == -EADDRINUSE)
                        CERROR("Can't create socket: port %d already in use\n",
                               local_port);
-               return rc;
+               return ERR_PTR(rc);
        }
 
-       rc = kernel_listen(*sockp, backlog);
+       rc = kernel_listen(sock, backlog);
        if (rc == 0)
-               return 0;
+               return sock;
 
        CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
-       sock_release(*sockp);
-       return rc;
-}
-
-#ifndef HAVE_SK_SLEEP
-static inline wait_queue_head_t *sk_sleep(struct sock *sk)
-{
-       return sk->sk_sleep;
-}
-#endif
-
-int
-lnet_sock_accept(struct socket **newsockp, struct socket *sock)
-{
-       wait_queue_t   wait;
-       struct socket *newsock;
-       int            rc;
-
-       /* XXX this should add a ref to sock->ops->owner, if
-        * TCP could be a module */
-       rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock);
-       if (rc) {
-               CERROR("Can't allocate socket\n");
-               return rc;
-       }
-
-       newsock->ops = sock->ops;
-
-       rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
-       if (rc == -EAGAIN) {
-               /* Nothing ready, so wait for activity */
-               init_waitqueue_entry(&wait, current);
-               add_wait_queue(sk_sleep(sock->sk), &wait);
-               set_current_state(TASK_INTERRUPTIBLE);
-               schedule();
-               remove_wait_queue(sk_sleep(sock->sk), &wait);
-               rc = sock->ops->accept(sock, newsock, O_NONBLOCK);
-       }
-
-       if (rc != 0)
-               goto failed;
-
-       *newsockp = newsock;
-       return 0;
-
-failed:
-       sock_release(newsock);
-       return rc;
+       sock_release(sock);
+       return ERR_PTR(rc);
 }
 
-int
-lnet_sock_connect(struct socket **sockp, int *fatal,
-                 __u32 local_ip, int local_port,
-                 __u32 peer_ip, int peer_port)
+/*
+ * Create a socket in namespace @ns via lnet_sock_create(@interface,
+ * @peeraddr, @local_port, @ns) and connect it to @peeraddr.
+ *
+ * Returns the connected socket, or an ERR_PTR() carrying the error from
+ * lnet_sock_create() or kernel_connect().  On connect failure the error is
+ * logged (at D_NET for -EADDRNOTAVAIL, D_NETERROR otherwise) and the socket
+ * is released.
+ */
+struct socket *
+lnet_sock_connect(int interface, int local_port,
+                 struct sockaddr *peeraddr,
+                 struct net *ns)
 {
-       struct sockaddr_in  srvaddr;
-       int                 rc;
+       struct socket *sock;
+       int rc;
 
-       rc = lnet_sock_create(sockp, fatal, local_ip, local_port);
-       if (rc != 0)
-               return rc;
+       sock = lnet_sock_create(interface, peeraddr, local_port, ns);
+       if (IS_ERR(sock))
+               return sock;
 
-       memset(&srvaddr, 0, sizeof(srvaddr));
-       srvaddr.sin_family = AF_INET;
-       srvaddr.sin_port = htons(peer_port);
-       srvaddr.sin_addr.s_addr = htonl(peer_ip);
-
-       rc = kernel_connect(*sockp, (struct sockaddr *)&srvaddr,
-                           sizeof(srvaddr), 0);
+       /* NOTE(review): the length is always sizeof(struct sockaddr_in6),
+        * even for an AF_INET peer - this assumes @peeraddr is backed by
+        * storage at least that large; confirm against callers.
+        */
+       rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
        if (rc == 0)
-               return 0;
+               return sock;
 
        /* EADDRNOTAVAIL probably means we're already connected to the same
         * peer/port on the same local port on a differently typed
         * connection.  Let our caller retry with a different local
         * port... */
-       *fatal = !(rc == -EADDRNOTAVAIL);
 
-       CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET,
-              "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc,
-              &local_ip, local_port, &peer_ip, peer_port);
+       CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
+                    "Error %d connecting %d -> %pISp\n", rc,
+                    local_port, peeraddr);
 
-       sock_release(*sockp);
-       return rc;
+       sock_release(sock);
+       return ERR_PTR(rc);
 }