4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2015, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
31 #define DEBUG_SUBSYSTEM S_LNET
35 #include <linux/net.h>
36 #include <net/addrconf.h>
38 #include <linux/file.h>
39 #include <linux/pagemap.h>
40 /* For sys_open & sys_close */
41 #include <linux/syscalls.h>
43 #include <linux/inetdevice.h>
45 #include <libcfs/linux/linux-time.h>
46 #include <libcfs/linux/linux-net.h>
47 #include <libcfs/libcfs.h>
48 #include <lnet/lib-lnet.h>
/*
 * Send @nob bytes starting at @buffer on @sock using kernel_sendmsg().
 *
 * @timeout is given in seconds and is converted to a jiffies budget
 * (jiffies_left).  Before the send the remaining budget is installed as the
 * socket's send timeout (sk_sndtimeo), afterwards the time spent sending is
 * subtracted from it.  A @timeout of zero selects a non-blocking send
 * (MSG_DONTWAIT).  After a send, @buffer is advanced by the returned byte
 * count.
 *
 * NOTE(review): the return type, the loop construct and the return
 * statements of this function are not visible in this copy; confirm the
 * exact return values (including what happens once jiffies_left <= 0)
 * against the complete file.
 */
51 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
54 long jiffies_left = cfs_time_seconds(timeout);
58 /* Caller may pass a zero timeout if she thinks the socket buffer is
59 * empty enough to take the whole message immediately */
67 .msg_flags = (timeout == 0) ? MSG_DONTWAIT : 0
71 struct sock *sk = sock->sk;
73 /* Set send timeout to remaining time */
75 sk->sk_sndtimeo = jiffies_left;
80 rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
/* Charge the time spent in kernel_sendmsg() against the remaining budget */
81 jiffies_left -= jiffies - then;
90 CERROR("Unexpected zero rc\n");
94 if (jiffies_left <= 0)
97 buffer = ((char *)buffer) + rc;
102 EXPORT_SYMBOL(lnet_sock_write);
/*
 * Receive @nob bytes into @buffer from @sock using kernel_recvmsg().
 *
 * @timeout is given in seconds and is converted to a jiffies budget
 * (jiffies_left); the function asserts that this budget is positive, so a
 * zero @timeout is not accepted here.  Before the receive the remaining
 * budget is installed as the socket's receive timeout (sk_rcvtimeo),
 * afterwards the time spent receiving is subtracted from it.  After a
 * receive, @buffer is advanced by the returned byte count.
 *
 * NOTE(review): the return type, the loop construct and the return
 * statements of this function are not visible in this copy; confirm the
 * exact return values (including what happens once jiffies_left <= 0)
 * against the complete file.
 */
105 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
108 long jiffies_left = cfs_time_seconds(timeout);
112 LASSERT(jiffies_left > 0);
119 struct msghdr msg = {
122 struct sock *sk = sock->sk;
124 /* Set receive timeout to remaining time */
126 sk->sk_rcvtimeo = jiffies_left;
130 rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
/* Charge the time spent in kernel_recvmsg() against the remaining budget */
131 jiffies_left -= jiffies - then;
139 buffer = ((char *)buffer) + rc;
145 if (jiffies_left <= 0)
149 EXPORT_SYMBOL(lnet_sock_read);
/*
 * Pick a local IPv4 source address on the device with index @interface in
 * network namespace @ns for talking to @dst_ipaddr.
 *
 * The device must exist and have IFF_UP set.  Its IPv4 addresses are walked
 * and each one is compared with @dst_ipaddr under the address's netmask; the
 * selected address is stored in *@ret.  Both @dst_ipaddr and *@ret are in
 * host byte order: the interface's address and mask are converted with
 * ntohl() before they are compared or stored.
 *
 * NOTE(review): the device and address lookups use the _rcu accessors, so
 * the RCU read-side locking is presumably on lines not visible in this
 * copy, as are the return statements and the rest of the selection
 * condition; confirm against the complete file.
 */
151 int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
153 struct net_device *dev;
154 struct in_device *in_dev;
156 DECLARE_CONST_IN_IFADDR(ifa);
159 dev = dev_get_by_index_rcu(ns, interface);
/* Only a device that exists and is up is considered */
161 if (!dev || !(dev->flags & IFF_UP))
163 in_dev = __in_dev_get_rcu(dev);
167 in_dev_for_each_ifa_rcu(ifa, in_dev) {
169 ((dst_ipaddr ^ ntohl(ifa->ifa_local))
170 & ntohl(ifa->ifa_mask)) == 0) {
171 /* This address at least as good as what we
174 *ret = ntohl(ifa->ifa_local);
183 EXPORT_SYMBOL(choose_ipv4_src);
/*
 * Create a kernel SOCK_STREAM socket and optionally bind it locally.
 *
 * The address family is taken from @remaddr.  The socket is created with
 * sock_create_kern() (with or without the namespace argument, depending on
 * HAVE_SOCK_CREATE_KERN_USE_NET) and sk_reuseport is set on it.
 *
 * When @interface >= 0 or @local_port != 0 a local address is built and the
 * socket is bound to it with kernel_bind():
 *  - IPv4: INADDR_ANY, or, when both @interface and @remaddr are given, the
 *    address returned by choose_ipv4_src(); the port is @local_port.
 *  - IPv6 (only with CONFIG_IPV6): in6addr_any, with an attempt to switch
 *    IPV6_V6ONLY off using whichever setsockopt interface the kernel
 *    offers; when both @interface and @remaddr are given
 *    ipv6_dev_get_saddr() is consulted; the port is @local_port.
 * -EADDRINUSE from the bind is only logged at D_NET level, any other bind
 * error is reported with CERROR.
 *
 * NOTE(review): as visible here @remaddr is dereferenced unconditionally to
 * obtain the family, yet later code tests it against NULL and
 * lnet_sock_listen() passes NULL.  A guard and a default family are
 * presumably on lines not visible in this copy, as are the error returns
 * and socket release paths; confirm against the complete file.
 */
185 static struct socket *
186 lnet_sock_create(int interface, struct sockaddr *remaddr,
187 int local_port, struct net *ns)
195 family = remaddr->sa_family;
197 #ifdef HAVE_SOCK_CREATE_KERN_USE_NET
198 rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
200 rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
202 if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
208 CERROR("Can't create socket: %d\n", rc);
212 sock->sk->sk_reuseport = 1;
/* A local bind is only needed when the caller pinned an interface or a port */
214 if (interface >= 0 || local_port != 0) {
215 struct sockaddr_storage locaddr = {};
219 struct sockaddr_in *sin = (void *)&locaddr;
221 sin->sin_family = AF_INET;
222 sin->sin_addr.s_addr = INADDR_ANY;
223 if (interface >= 0 && remaddr) {
224 struct sockaddr_in *rem = (void *)remaddr;
/* choose_ipv4_src() is handed a host-order address and its result is
 * converted back with htonl() before it is stored in the sockaddr */
227 rc = choose_ipv4_src(&ip,
229 ntohl(rem->sin_addr.s_addr),
233 sin->sin_addr.s_addr = htonl(ip);
235 sin->sin_port = htons(local_port);
238 #if IS_ENABLED(CONFIG_IPV6)
240 struct sockaddr_in6 *sin6 = (void *)&locaddr;
243 sin6->sin6_family = AF_INET6;
244 sin6->sin6_addr = in6addr_any;
246 /* Make sure we get both IPv4 and IPv6 connections.
247 * This is the default, but it can be overridden so we
250 #ifdef HAVE_KERNEL_SETSOCKOPT
251 kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
252 (char *) &val, sizeof(val));
253 #elif defined(_LINUX_SOCKPTR_H)
254 /* sockptr_t was introduced around
255 * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
256 * kernel address to be passed to ->setsockopt
258 if (ipv6_only_sock(sock->sk)) {
259 sockptr_t optval = KERNEL_SOCKPTR(&val);
261 sock->ops->setsockopt(sock,
262 IPPROTO_IPV6, IPV6_V6ONLY,
263 optval, sizeof(val));
266 /* From v5.7-rc6-2614-g5a892ff2facb when
267 * kernel_setsockopt() was removed until
268 * sockptr_t (above) there is no clean way to
269 * pass kernel address to setsockopt. We could
270 * use get_fs()/set_fs(), but in this particular
271 * situation there is an easier way. It depends
272 * on the fact that at least for these few
273 * kernels a NULL address to ipv6_setsockopt()
274 * is treated like the address of a zero.
276 if (ipv6_only_sock(sock->sk) && !val) {
279 sock->ops->setsockopt(sock,
280 IPPROTO_IPV6, IPV6_V6ONLY,
281 optval, sizeof(val));
283 #endif /* HAVE_KERNEL_SETSOCKOPT */
285 if (interface >= 0 && remaddr) {
286 struct sockaddr_in6 *rem = (void *)remaddr;
288 ipv6_dev_get_saddr(ns,
294 sin6->sin6_port = htons(local_port);
297 #endif /* IS_ENABLED(CONFIG_IPV6) */
299 rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
301 if (rc == -EADDRINUSE) {
302 CDEBUG(D_NET, "Port %d already in use\n", local_port);
306 CERROR("Error trying to bind to port %d: %d\n",
/*
 * Override the send and/or receive buffer size of @sock.
 *
 * @txbufsize: new send buffer size, or 0 to leave the send side untouched.
 * @rxbufsize: new receive buffer size, or 0 to leave the receive side
 *             untouched.
 *
 * For each non-zero size the matching SOCK_SNDBUF_LOCK / SOCK_RCVBUF_LOCK
 * bit is set in sk_userlocks and the size is stored directly in the socket
 * (sk_sndbuf / sk_rcvbuf).  After a send buffer change the socket's
 * write-space callback is invoked.
 */
319 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
321 struct sock *sk = sock->sk;
323 if (txbufsize != 0) {
324 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
325 sk->sk_sndbuf = txbufsize;
326 sk->sk_write_space(sk);
329 if (rxbufsize != 0) {
330 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
/* The receive size belongs in sk_rcvbuf; storing it in sk_sndbuf would
 * clobber the send buffer size and leave the receive buffer unchanged */
331 sk->sk_rcvbuf = rxbufsize;
334 EXPORT_SYMBOL(lnet_sock_setbuf);
/*
 * Fetch one endpoint address of @sock into *@peer.
 *
 * @remote selects the peer address (lnet_kernel_getpeername()) when true
 * and the local address (lnet_kernel_getsockname()) otherwise.  A lookup
 * failure is reported with CERROR.
 *
 * If the address obtained is an IPv6 address of the IPv4-mapped kind, *@peer
 * is rewritten in place as a struct sockaddr_in (family AF_INET) carrying
 * the embedded IPv4 address, so callers see such a connection as plain IPv4.
 *
 * NOTE(review): the return type, the return statements and the line that
 * presumably stores the saved port back into the rewritten address are not
 * visible in this copy; confirm against the complete file.  The port is a
 * network-order 16-bit value, so __be16 would describe the local "port"
 * more precisely than short.
 */
337 lnet_sock_getaddr(struct socket *sock, bool remote,
338 struct sockaddr_storage *peer)
341 #ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
342 int len = sizeof(*peer);
346 rc = lnet_kernel_getpeername(sock,
347 (struct sockaddr *)peer, &len);
349 rc = lnet_kernel_getsockname(sock,
350 (struct sockaddr *)peer, &len);
352 CERROR("Error %d getting sock %s IP/port\n",
353 rc, remote ? "peer" : "local");
356 if (peer->ss_family == AF_INET6) {
357 struct sockaddr_in6 *in6 = (void *)peer;
358 struct sockaddr_in *in = (void *)peer;
/* in and in6 alias the same storage, so the port is copied out before
 * the memset() below clears the start of that storage */
359 short port = in6->sin6_port;
361 if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
362 /* Pretend it is a v4 socket */
363 memset(in, 0, sizeof(*in));
364 in->sin_family = AF_INET;
/* The IPv4 address is taken from the last 32 bits of the IPv6 address */
366 memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
371 EXPORT_SYMBOL(lnet_sock_getaddr);
/*
 * Report the current buffer sizes of @sock.
 *
 * @txbufsize: if not NULL, receives the socket's sk_sndbuf value.
 * @rxbufsize: if not NULL, receives the socket's sk_rcvbuf value.
 *
 * Either pointer may be NULL when the caller is not interested in that
 * direction.
 */
373 void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
375 if (txbufsize != NULL)
376 *txbufsize = sock->sk->sk_sndbuf;
378 if (rxbufsize != NULL)
379 *rxbufsize = sock->sk->sk_rcvbuf;
381 EXPORT_SYMBOL(lnet_sock_getbuf);
/*
 * Create a listening socket on @local_port in network namespace @ns.
 *
 * The socket comes from lnet_sock_create() with no interface (-1) and no
 * remote address (NULL), and is then put into the listening state with
 * kernel_listen() using @backlog.  A port that is already in use and a
 * kernel_listen() failure are each reported with CERROR.
 *
 * NOTE(review): the return type, local declarations, return statements and
 * any socket release on failure are not visible in this copy; confirm
 * against the complete file.
 */
384 lnet_sock_listen(int local_port, int backlog, struct net *ns)
389 sock = lnet_sock_create(-1, NULL, local_port, ns);
392 if (rc == -EADDRINUSE)
393 CERROR("Can't create socket: port %d already in use\n",
398 rc = kernel_listen(sock, backlog);
402 CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
408 lnet_sock_connect(int interface, int local_port,
409 struct sockaddr *peeraddr,
415 sock = lnet_sock_create(interface, peeraddr, local_port, ns);
419 /* Avoid temporary address, they are bad for long-lived
420 * connections such as lustre mounts.
421 * RFC4941, section 3.6 suggests that:
422 * Individual applications, which have specific
423 * knowledge about the normal duration of connections,
424 * MAY override this as appropriate.
426 if (peeraddr->sa_family == PF_INET6)
427 ip6_sock_set_addr_preferences(sock->sk,
428 IPV6_PREFER_SRC_PUBLIC);
430 rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
434 /* EADDRNOTAVAIL probably means we're already connected to the same
435 * peer/port on the same local port on a differently typed
436 * connection. Let our caller retry with a different local
439 CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
440 "Error %d connecting %d -> %pIScp\n", rc,
441 local_port, peeraddr);