X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Flnet%2Flib-socket.c;h=90cdc3e2b4dbe9bc978316d738280aacbec7582c;hp=9392f4f934ecc1ca0f17a8f6b7c880bef0fa874f;hb=3f2844dc9333c86452c37bd7b4519729b1351371;hpb=703ebd87c1705810b8f8eb0f8f25ebef11bde8fc diff --git a/lnet/lnet/lib-socket.c b/lnet/lnet/lib-socket.c index 9392f4f..90cdc3e 100644 --- a/lnet/lnet/lib-socket.c +++ b/lnet/lnet/lib-socket.c @@ -15,11 +15,7 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ @@ -27,266 +23,36 @@ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2015, Intel Corporation. + * Copyright (c) 2015, 2017, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. */ #define DEBUG_SUBSYSTEM S_LNET #include #include #include +#include +#include #include #include /* For sys_open & sys_close */ #include #include +#include +#include +#include #include #include -static int -kernel_sock_unlocked_ioctl(struct file *filp, int cmd, unsigned long arg) -{ - mm_segment_t oldfs = get_fs(); - int err; - - set_fs(KERNEL_DS); - err = filp->f_op->unlocked_ioctl(filp, cmd, arg); - set_fs(oldfs); - - return err; -} - -static int -lnet_sock_ioctl(int cmd, unsigned long arg) -{ - struct file *sock_filp; - struct socket *sock; - int fd = -1; - int rc; - - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; - } - -#if !defined(HAVE_SOCK_ALLOC_FILE) && !defined(HAVE_SOCK_ALLOC_FILE_3ARGS) - fd = sock_map_fd(sock, 0); - if (fd < 0) { - rc = fd; - sock_release(sock); - goto out; - } - sock_filp = fget(fd); -#else -# ifdef HAVE_SOCK_ALLOC_FILE_3ARGS - sock_filp = sock_alloc_file(sock, 0, NULL); -# else - sock_filp = sock_alloc_file(sock, 0); -# endif -#endif - if (IS_ERR(sock_filp)) { - rc = PTR_ERR(sock_filp); - sock_release(sock); - goto out; - } - - rc = kernel_sock_unlocked_ioctl(sock_filp, cmd, arg); - - fput(sock_filp); -out: - if (fd >= 0) - sys_close(fd); - return rc; -} - -int -lnet_ipif_query(char *name, int *up, __u32 *ip, __u32 *mask) -{ - struct ifreq ifr; - int nob; - int rc; - __u32 val; - - nob = strnlen(name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - CERROR("Interface name %s too long\n", name); - return -EINVAL; - } - - CLASSERT(sizeof(ifr.ifr_name) >= IFNAMSIZ); - - if (strlen(name) > sizeof(ifr.ifr_name)-1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - rc = lnet_sock_ioctl(SIOCGIFFLAGS, (unsigned long)&ifr); - if (rc != 0) { - CERROR("Can't get flags for interface %s\n", name); - return rc; - } - - if ((ifr.ifr_flags & IFF_UP) == 0) { - CDEBUG(D_NET, "Interface %s down\n", name); - *up = 0; - *ip = *mask = 0; - return 0; - } - *up = 1; - - if (strlen(name) > sizeof(ifr.ifr_name)-1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFADDR, (unsigned long)&ifr); - - if (rc != 0) { - CERROR("Can't get IP address for interface %s\n", name); - return rc; - } - - val = ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr.s_addr; - *ip = ntohl(val); - - if (strlen(name) > sizeof(ifr.ifr_name)-1) - return -E2BIG; - strncpy(ifr.ifr_name, name, sizeof(ifr.ifr_name)); - - ifr.ifr_addr.sa_family = AF_INET; - rc = lnet_sock_ioctl(SIOCGIFNETMASK, (unsigned long)&ifr); - if (rc != 0) { - CERROR("Can't get netmask for interface %s\n", name); - return rc; - } - - val = ((struct sockaddr_in *)&ifr.ifr_netmask)->sin_addr.s_addr; - *mask = ntohl(val); - - return 0; -} -EXPORT_SYMBOL(lnet_ipif_query); - -void -lnet_ipif_free_enumeration(char **names, int n) -{ - int i; - - LASSERT(n > 0); - - for (i = 0; i < n && names[i] != NULL; i++) - LIBCFS_FREE(names[i], IFNAMSIZ); - - LIBCFS_FREE(names, n * sizeof(*names)); -} -EXPORT_SYMBOL(lnet_ipif_free_enumeration); - -int -lnet_ipif_enumerate(char ***namesp) -{ - /* Allocate and fill in 'names', returning # interfaces/error */ - char **names; - int toobig; - int nalloc; - int nfound; - struct ifreq *ifr; - struct ifconf ifc; - int rc; - int nob; - int i; - - nalloc = 16; /* first guess at max interfaces */ - toobig = 0; - for (;;) { - if (nalloc * sizeof(*ifr) > PAGE_CACHE_SIZE) { - toobig = 1; - nalloc = PAGE_CACHE_SIZE/sizeof(*ifr); - CWARN("Too many interfaces: only enumerating " - "first %d\n", nalloc); - } - - LIBCFS_ALLOC(ifr, nalloc * sizeof(*ifr)); - if (ifr == NULL) { - CERROR("ENOMEM enumerating up to %d interfaces\n", - nalloc); - rc = -ENOMEM; - goto out0; - } - - ifc.ifc_buf = (char *)ifr; - ifc.ifc_len = nalloc * sizeof(*ifr); - - rc = lnet_sock_ioctl(SIOCGIFCONF, (unsigned long)&ifc); - if (rc < 0) { - CERROR("Error %d enumerating interfaces\n", rc); - goto out1; - } - - LASSERT(rc == 0); - - nfound = ifc.ifc_len/sizeof(*ifr); - LASSERT(nfound <= nalloc); - - if (nfound < nalloc || toobig) - break; - - LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); - nalloc *= 2; - } - - if (nfound == 0) - goto out1; - - LIBCFS_ALLOC(names, nfound * sizeof(*names)); - if (names == NULL) { - rc = -ENOMEM; - goto out1; - } - - for (i = 0; i < nfound; i++) { - nob = strnlen(ifr[i].ifr_name, IFNAMSIZ); - if (nob == IFNAMSIZ) { - /* no space for terminating NULL */ - CERROR("interface name %.*s too long (%d max)\n", - nob, ifr[i].ifr_name, IFNAMSIZ); - rc = -ENAMETOOLONG; - goto out2; - } - - LIBCFS_ALLOC(names[i], IFNAMSIZ); - if (names[i] == NULL) { - rc = -ENOMEM; - goto out2; - } - - memcpy(names[i], ifr[i].ifr_name, nob); - names[i][nob] = 0; - } - - *namesp = names; - rc = nfound; - - out2: - if (rc < 0) - lnet_ipif_free_enumeration(names, nfound); - out1: - LIBCFS_FREE(ifr, nalloc * sizeof(*ifr)); - out0: - return rc; -} -EXPORT_SYMBOL(lnet_ipif_enumerate); - int lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) { - int rc; - long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); - unsigned long then; - struct timeval tv; + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; LASSERT(nob > 0); /* Caller may pass a zero timeout if she thinks the socket buffer is @@ -302,24 +68,12 @@ lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout) }; if (timeout != 0) { + struct sock *sk = sock->sk; + /* Set send timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = jiffies_left / - msecs_to_jiffies(MSEC_PER_SEC), - .tv_usec = ((jiffies_left % - msecs_to_jiffies(MSEC_PER_SEC)) * - USEC_PER_SEC) / - msecs_to_jiffies(MSEC_PER_SEC) - }; - - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, - (char *)&tv, sizeof(tv)); - if (rc != 0) { - CERROR("Can't set socket send timeout " - "%ld.%06d: %d\n", - (long)tv.tv_sec, (int)tv.tv_usec, rc); - return rc; - } + lock_sock(sk); + sk->sk_sndtimeo = jiffies_left; + release_sock(sk); } then = jiffies; @@ -350,10 +104,9 @@ EXPORT_SYMBOL(lnet_sock_write); int lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) { - int rc; - long jiffies_left = timeout * msecs_to_jiffies(MSEC_PER_SEC); - unsigned long then; - struct timeval tv; + int rc; + long jiffies_left = cfs_time_seconds(timeout); + unsigned long then; LASSERT(nob > 0); LASSERT(jiffies_left > 0); @@ -366,22 +119,12 @@ lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) struct msghdr msg = { .msg_flags = 0 }; + struct sock *sk = sock->sk; /* Set receive timeout to remaining time */ - tv = (struct timeval) { - .tv_sec = jiffies_left / msecs_to_jiffies(MSEC_PER_SEC), - .tv_usec = ((jiffies_left % - msecs_to_jiffies(MSEC_PER_SEC)) * - USEC_PER_SEC) / - msecs_to_jiffies(MSEC_PER_SEC) - }; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, - (char *)&tv, sizeof(tv)); - if (rc != 0) { - CERROR("Can't set socket recv timeout %ld.%06d: %d\n", - (long)tv.tv_sec, (int)tv.tv_usec, rc); - return rc; - } + lock_sock(sk); + sk->sk_rcvtimeo = jiffies_left; + release_sock(sk); then = jiffies; rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0); @@ -405,45 +148,158 @@ lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout) } EXPORT_SYMBOL(lnet_sock_read); -static int -lnet_sock_create(struct socket **sockp, int *fatal, - __u32 local_ip, int local_port) +int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns) { - struct sockaddr_in locaddr; - struct socket *sock; - int rc; - int option; + struct net_device *dev; + struct in_device *in_dev; + int err; + DECLARE_CONST_IN_IFADDR(ifa); - /* All errors are fatal except bind failure if the port is in use */ - *fatal = 1; + rcu_read_lock(); + dev = dev_get_by_index_rcu(ns, interface); + err = -EINVAL; + if (!dev || !(dev->flags & IFF_UP)) + goto out; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto out; + err = -ENOENT; + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (err || + ((dst_ipaddr ^ ntohl(ifa->ifa_local)) + & ntohl(ifa->ifa_mask)) == 0) { + /* This address at least as good as what we + * already have + */ + *ret = ntohl(ifa->ifa_local); + err = 0; + } + } + endfor_ifa(in_dev); +out: + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(choose_ipv4_src); - rc = sock_create(PF_INET, SOCK_STREAM, 0, &sock); - *sockp = sock; - if (rc != 0) { - CERROR("Can't create socket: %d\n", rc); - return rc; +static struct socket * +lnet_sock_create(int interface, struct sockaddr *remaddr, + int local_port, struct net *ns) +{ + struct socket *sock; + int rc; + int family; + + family = AF_INET6; + if (remaddr) + family = remaddr->sa_family; +retry: +#ifdef HAVE_SOCK_CREATE_KERN_USE_NET + rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock); +#else + rc = sock_create_kern(family, SOCK_STREAM, 0, &sock); +#endif + if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) { + family = AF_INET; + goto retry; } - option = 1; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR("Can't set SO_REUSEADDR for socket: %d\n", rc); - goto failed; + if (rc) { + CERROR("Can't create socket: %d\n", rc); + return ERR_PTR(rc); } - if (local_ip != 0 || local_port != 0) { - memset(&locaddr, 0, sizeof(locaddr)); - locaddr.sin_family = AF_INET; - locaddr.sin_port = htons(local_port); - locaddr.sin_addr.s_addr = (local_ip == 0) ? - INADDR_ANY : htonl(local_ip); + sock->sk->sk_reuseport = 1; + + if (interface >= 0 || local_port != 0) { + struct sockaddr_storage locaddr = {}; + switch (family) { + case AF_INET: { + struct sockaddr_in *sin = (void *)&locaddr; + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + if (interface >= 0 && remaddr) { + struct sockaddr_in *rem = (void *)remaddr; + __u32 ip; + + rc = choose_ipv4_src(&ip, + interface, + ntohl(rem->sin_addr.s_addr), + ns); + if (rc) + goto failed; + sin->sin_addr.s_addr = htonl(ip); + } + sin->sin_port = htons(local_port); + break; + } +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: { + struct sockaddr_in6 *sin6 = (void *)&locaddr; + int val = 0; + + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = in6addr_any; + + /* Make sure we get both IPv4 and IPv6 connections. + * This is the default, but it can be overridden so we + * force it back. + */ +#ifdef HAVE_KERNEL_SETSOCKOPT + kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, + (char *) &val, sizeof(val)); +#elif defined(_LINUX_SOCKPTR_H) + /* sockptr_t was introduced around + * v5.8-rc4-1952-ga7b75c5a8c41 and allows a + * kernel address to be passed to ->setsockopt + */ + if (ipv6_only_sock(sock->sk)) { + sockptr_t optval = KERNEL_SOCKPTR(&val); + + sock->ops->setsockopt(sock, + IPPROTO_IPV6, IPV6_V6ONLY, + optval, sizeof(val)); + } +#else + /* From v5.7-rc6-2614-g5a892ff2facb when + * kernel_setsockopt() was removed until + * sockptr_t (above) there is no clean way to + * pass kernel address to setsockopt. We could + * use get_fs()/set_fs(), but in this particular + * situation there is an easier way. It depends + * on the fact that at least for these few + * kernels a NULL address to ipv6_setsockopt() + * is treated like the address of a zero. + */ + if (ipv6_only_sock(sock->sk) && !val) { + void *optval = NULL; + + sock->ops->setsockopt(sock, + IPPROTO_IPV6, IPV6_V6ONLY, + optval, sizeof(val)); + } +#endif /* HAVE_KERNEL_SETSOCKOPT */ + + if (interface >= 0 && remaddr) { + struct sockaddr_in6 *rem = (void *)remaddr; + + ipv6_dev_get_saddr(ns, + dev_get_by_index(ns, + interface), + &rem->sin6_addr, 0, + &sin6->sin6_addr); + } + sin6->sin6_port = htons(local_port); + break; + } +#endif /* IS_ENABLED(CONFIG_IPV6) */ + } rc = kernel_bind(sock, (struct sockaddr *)&locaddr, sizeof(locaddr)); if (rc == -EADDRINUSE) { CDEBUG(D_NET, "Port %d already in use\n", local_port); - *fatal = 0; goto failed; } if (rc != 0) { @@ -452,186 +308,127 @@ lnet_sock_create(struct socket **sockp, int *fatal, goto failed; } } - return 0; + return sock; failed: sock_release(sock); - return rc; + return ERR_PTR(rc); } -int +void lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize) { - int option; - int rc; + struct sock *sk = sock->sk; if (txbufsize != 0) { - option = txbufsize; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR("Can't set send buffer %d: %d\n", - option, rc); - return rc; - } + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = txbufsize; + sk->sk_write_space(sk); } if (rxbufsize != 0) { - option = rxbufsize; - rc = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, - (char *)&option, sizeof(option)); - if (rc != 0) { - CERROR("Can't set receive buffer %d: %d\n", - option, rc); - return rc; - } + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_sndbuf = rxbufsize; } - return 0; } EXPORT_SYMBOL(lnet_sock_setbuf); int -lnet_sock_getaddr(struct socket *sock, bool remote, __u32 *ip, int *port) +lnet_sock_getaddr(struct socket *sock, bool remote, + struct sockaddr_storage *peer) { - struct sockaddr_in sin; - int len = sizeof(sin); - int rc; + int rc; +#ifndef HAVE_KERN_SOCK_GETNAME_2ARGS + int len = sizeof(*peer); +#endif if (remote) - rc = kernel_getpeername(sock, (struct sockaddr *)&sin, &len); + rc = lnet_kernel_getpeername(sock, + (struct sockaddr *)peer, &len); else - rc = kernel_getsockname(sock, (struct sockaddr *)&sin, &len); - if (rc != 0) { + rc = lnet_kernel_getsockname(sock, + (struct sockaddr *)peer, &len); + if (rc < 0) { CERROR("Error %d getting sock %s IP/port\n", rc, remote ? "peer" : "local"); return rc; } - - if (ip != NULL) - *ip = ntohl(sin.sin_addr.s_addr); - - if (port != NULL) - *port = ntohs(sin.sin_port); - + if (peer->ss_family == AF_INET6) { + struct sockaddr_in6 *in6 = (void *)peer; + struct sockaddr_in *in = (void *)peer; + short port = in6->sin6_port; + + if (ipv6_addr_v4mapped(&in6->sin6_addr)) { + /* Pretend it is a v4 socket */ + memset(in, 0, sizeof(*in)); + in->sin_family = AF_INET; + in->sin_port = port; + memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4); + } + } return 0; } EXPORT_SYMBOL(lnet_sock_getaddr); -int -lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) +void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize) { if (txbufsize != NULL) *txbufsize = sock->sk->sk_sndbuf; if (rxbufsize != NULL) *rxbufsize = sock->sk->sk_rcvbuf; - - return 0; } EXPORT_SYMBOL(lnet_sock_getbuf); -int -lnet_sock_listen(struct socket **sockp, - __u32 local_ip, int local_port, int backlog) +struct socket * +lnet_sock_listen(int local_port, int backlog, struct net *ns) { - int fatal; - int rc; + struct socket *sock; + int rc; - rc = lnet_sock_create(sockp, &fatal, local_ip, local_port); - if (rc != 0) { - if (!fatal) + sock = lnet_sock_create(-1, NULL, local_port, ns); + if (IS_ERR(sock)) { + rc = PTR_ERR(sock); + if (rc == -EADDRINUSE) CERROR("Can't create socket: port %d already in use\n", local_port); - return rc; + return ERR_PTR(rc); } - rc = kernel_listen(*sockp, backlog); + rc = kernel_listen(sock, backlog); if (rc == 0) - return 0; + return sock; CERROR("Can't set listen backlog %d: %d\n", backlog, rc); - sock_release(*sockp); - return rc; -} - -#ifndef HAVE_SK_SLEEP -static inline wait_queue_head_t *sk_sleep(struct sock *sk) -{ - return sk->sk_sleep; -} -#endif - -int -lnet_sock_accept(struct socket **newsockp, struct socket *sock) -{ - wait_queue_t wait; - struct socket *newsock; - int rc; - - /* XXX this should add a ref to sock->ops->owner, if - * TCP could be a module */ - rc = sock_create_lite(PF_PACKET, sock->type, IPPROTO_TCP, &newsock); - if (rc) { - CERROR("Can't allocate socket\n"); - return rc; - } - - newsock->ops = sock->ops; - - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); - if (rc == -EAGAIN) { - /* Nothing ready, so wait for activity */ - init_waitqueue_entry(&wait, current); - add_wait_queue(sk_sleep(sock->sk), &wait); - set_current_state(TASK_INTERRUPTIBLE); - schedule(); - remove_wait_queue(sk_sleep(sock->sk), &wait); - rc = sock->ops->accept(sock, newsock, O_NONBLOCK); - } - - if (rc != 0) - goto failed; - - *newsockp = newsock; - return 0; - -failed: - sock_release(newsock); - return rc; + sock_release(sock); + return ERR_PTR(rc); } -int -lnet_sock_connect(struct socket **sockp, int *fatal, - __u32 local_ip, int local_port, - __u32 peer_ip, int peer_port) +struct socket * +lnet_sock_connect(int interface, int local_port, + struct sockaddr *peeraddr, + struct net *ns) { - struct sockaddr_in srvaddr; - int rc; + struct socket *sock; + int rc; - rc = lnet_sock_create(sockp, fatal, local_ip, local_port); - if (rc != 0) - return rc; + sock = lnet_sock_create(interface, peeraddr, local_port, ns); + if (IS_ERR(sock)) + return sock; - memset(&srvaddr, 0, sizeof(srvaddr)); - srvaddr.sin_family = AF_INET; - srvaddr.sin_port = htons(peer_port); - srvaddr.sin_addr.s_addr = htonl(peer_ip); - - rc = kernel_connect(*sockp, (struct sockaddr *)&srvaddr, - sizeof(srvaddr), 0); + rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0); if (rc == 0) - return 0; + return sock; /* EADDRNOTAVAIL probably means we're already connected to the same * peer/port on the same local port on a differently typed * connection. Let our caller retry with a different local * port... */ - *fatal = !(rc == -EADDRNOTAVAIL); - CDEBUG_LIMIT(*fatal ? D_NETERROR : D_NET, - "Error %d connecting %pI4h/%d -> %pI4h/%d\n", rc, - &local_ip, local_port, &peer_ip, peer_port); + CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR, + "Error %d connecting %d -> %pISp\n", rc, + local_port, peeraddr); - sock_release(*sockp); - return rc; + sock_release(sock); + return ERR_PTR(rc); }