1 // SPDX-License-Identifier: GPL-2.0
3 /* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
4 * Use is subject to license terms.
6 * Copyright (c) 2015, 2017, Intel Corporation.
9 /* This file is part of Lustre, http://www.lustre.org/ */
11 #define DEBUG_SUBSYSTEM S_LNET
15 #include <linux/net.h>
16 #include <net/addrconf.h>
18 #include <linux/file.h>
19 #include <linux/pagemap.h>
20 /* For sys_open & sys_close */
21 #include <linux/syscalls.h>
23 #include <linux/inetdevice.h>
25 #include <libcfs/linux/linux-time.h>
26 #include <libcfs/linux/linux-net.h>
27 #include <libcfs/libcfs.h>
28 #include <lnet/lib-lnet.h>
/*
 * lnet_sock_write() - send @nob bytes from @buffer on @sock.
 *
 * @timeout is turned into a jiffies budget with cfs_time_seconds(). A zero
 * @timeout selects MSG_DONTWAIT for the send. The remaining budget is
 * installed as the socket send timeout (sk->sk_sndtimeo), reduced by the
 * jiffies that elapsed around kernel_sendmsg(), and tested for exhaustion
 * (jiffies_left <= 0). @buffer is advanced by the kernel_sendmsg() result.
 *
 * NOTE(review): the return type, the declarations of rc/then/msg/iov, the
 * loop construct and every return statement are outside the lines shown
 * here, so return values are not documented - presumably 0 on success and
 * a negative errno on failure; confirm against the complete function.
 */
31 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
34 long jiffies_left = cfs_time_seconds(timeout);
38 /* Caller may pass a zero timeout if she thinks the socket buffer is
39 * empty enough to take the whole message immediately */
47 .msg_flags = (timeout == 0) ? MSG_DONTWAIT : 0
51 struct sock *sk = sock->sk;
53 /* Set send timeout to remaining time */
55 sk->sk_sndtimeo = jiffies_left;
60 rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
/* Charge the jiffies elapsed since 'then' against the remaining budget. */
61 jiffies_left -= jiffies - then;
70 CERROR("Unexpected zero rc\n");
74 if (jiffies_left <= 0)
/* Step past the rc bytes reported by kernel_sendmsg(). */
77 buffer = ((char *)buffer) + rc;
82 EXPORT_SYMBOL(lnet_sock_write);
/*
 * lnet_sock_read() - receive data from @sock into @buffer.
 *
 * @timeout is turned into a jiffies budget with cfs_time_seconds() and must
 * yield a positive value (asserted with LASSERT). The remaining budget is
 * installed as the socket receive timeout (sk->sk_rcvtimeo), reduced by the
 * jiffies that elapsed around kernel_recvmsg(), and tested for exhaustion
 * (jiffies_left <= 0). @buffer is advanced by the kernel_recvmsg() result.
 *
 * NOTE(review): the return type, the declarations of rc/then/msg/iov, the
 * loop construct and every return statement are outside the lines shown
 * here - presumably the function loops until @nob bytes have arrived and
 * returns 0 or a negative errno; confirm against the complete function.
 */
85 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
88 long jiffies_left = cfs_time_seconds(timeout);
/* Unlike the write path, a zero (or negative) time budget is not allowed. */
92 LASSERT(jiffies_left > 0);
102 struct sock *sk = sock->sk;
104 /* Set receive timeout to remaining time */
106 sk->sk_rcvtimeo = jiffies_left;
110 rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
/* Charge the jiffies elapsed since 'then' against the remaining budget. */
111 jiffies_left -= jiffies - then;
/* Step past the rc bytes reported by kernel_recvmsg(). */
119 buffer = ((char *)buffer) + rc;
125 if (jiffies_left <= 0)
129 EXPORT_SYMBOL(lnet_sock_read);
/*
 * choose_ipv4_src() - pick a local IPv4 address of an interface for
 * talking to @dst_ipaddr.
 *
 * @ret:        receives the chosen address in host byte order
 *              (ntohl(ifa->ifa_local)).
 * @interface:  device index, resolved with dev_get_by_index_rcu().
 * @dst_ipaddr: destination address; it is compared against ntohl()-converted
 *              values, so it is expected in host byte order.
 * @ns:         network namespace in which the device is looked up.
 *
 * The device must exist and have IFF_UP set. Its IPv4 addresses are walked
 * with in_dev_for_each_ifa_rcu(); an address matches when it and
 * @dst_ipaddr agree on every bit covered by the address's netmask.
 *
 * NOTE(review): the _rcu helpers suggest this runs inside an RCU read-side
 * critical section, but no locking is visible in the lines shown; the first
 * half of the match condition and the return statements are not visible
 * either - confirm return values against the complete function.
 */
131 int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
133 struct net_device *dev;
134 struct in_device *in_dev;
136 DECLARE_CONST_IN_IFADDR(ifa);
139 dev = dev_get_by_index_rcu(ns, interface);
141 if (!dev || !(dev->flags & IFF_UP))
143 in_dev = __in_dev_get_rcu(dev);
147 in_dev_for_each_ifa_rcu(ifa, in_dev) {
/* Same-subnet test: XOR exposes differing bits, the mask keeps network bits. */
149 ((dst_ipaddr ^ ntohl(ifa->ifa_local))
150 & ntohl(ifa->ifa_mask)) == 0) {
151 /* This address at least as good as what we
154 *ret = ntohl(ifa->ifa_local);
163 EXPORT_SYMBOL(choose_ipv4_src);
/*
 * lnet_sock_create() - create a kernel SOCK_STREAM socket and optionally
 * bind it to a local address and port.
 *
 * @interface:  interface index used to choose the local source address;
 *              a negative value requests no particular interface.
 * @remaddr:    remote address; its sa_family selects the socket family and
 *              it is consulted when choosing the source address.
 * @local_port: local port to bind to; 0 requests no particular port.
 * @ns:         network namespace (handed to sock_create_kern() when
 *              HAVE_SOCK_CREATE_KERN_USE_NET is defined).
 *
 * The new socket gets sk_reuseport set. Binding is attempted only when
 * @interface >= 0 or @local_port != 0:
 *  - IPv4: the local address is INADDR_ANY unless both an interface and a
 *    remote address are given, in which case choose_ipv4_src() selects it.
 *  - IPv6 (CONFIG_IPV6 only): the local address starts as in6addr_any, the
 *    IPV6_V6ONLY option is written so that both IPv4 and IPv6 connections
 *    are accepted, and ipv6_dev_get_saddr() is used when both an interface
 *    and a remote address are given.
 * -EADDRINUSE from kernel_bind() is logged with CDEBUG(D_NET); the CERROR
 * bind message is presumably used for other bind failures.
 *
 * NOTE(review): many lines of this function are not visible here (local
 * declarations, the family selection around the remaddr dereference, the
 * -EAFNOSUPPORT fallback body, error exits and returns). Callers in this
 * file treat the result as a pointer that may encode an errno - presumably
 * ERR_PTR(); confirm against the complete function.
 */
165 static struct socket *
166 lnet_sock_create(int interface, struct sockaddr *remaddr,
167 int local_port, struct net *ns)
/* NOTE(review): remaddr is tested for NULL further down; the guard for this dereference is not visible here. */
175 family = remaddr->sa_family;
/* sock_create_kern() takes a namespace argument only on some kernels. */
177 #ifdef HAVE_SOCK_CREATE_KERN_USE_NET
178 rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
180 rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
/* IPv6 unavailable and no remote address forcing the family. */
182 if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
188 CERROR("Can't create socket: %d\n", rc);
192 sock->sk->sk_reuseport = 1;
/* Bind only when the caller asked for a specific interface or port. */
194 if (interface >= 0 || local_port != 0) {
195 struct sockaddr_storage locaddr = {};
199 struct sockaddr_in *sin = (void *)&locaddr;
201 sin->sin_family = AF_INET;
202 sin->sin_addr.s_addr = INADDR_ANY;
203 if (interface >= 0 && remaddr) {
204 struct sockaddr_in *rem = (void *)remaddr;
/* choose_ipv4_src() works in host byte order; convert in and back out. */
207 rc = choose_ipv4_src(&ip,
209 ntohl(rem->sin_addr.s_addr),
213 sin->sin_addr.s_addr = htonl(ip);
215 sin->sin_port = htons(local_port);
218 #if IS_ENABLED(CONFIG_IPV6)
220 struct sockaddr_in6 *sin6 = (void *)&locaddr;
223 sin6->sin6_family = AF_INET6;
224 sin6->sin6_addr = in6addr_any;
226 /* Make sure we get both IPv4 and IPv6 connections.
227 * This is the default, but it can be overridden so we
230 #ifdef HAVE_KERNEL_SETSOCKOPT
231 kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
232 (char *) &val, sizeof(val));
233 #elif defined(_LINUX_SOCKPTR_H)
234 /* sockptr_t was introduced around
235 * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
236 * kernel address to be passed to ->setsockopt
238 if (ipv6_only_sock(sock->sk)) {
239 sockptr_t optval = KERNEL_SOCKPTR(&val);
241 sock->ops->setsockopt(sock,
242 IPPROTO_IPV6, IPV6_V6ONLY,
243 optval, sizeof(val));
246 /* From v5.7-rc6-2614-g5a892ff2facb when
247 * kernel_setsockopt() was removed until
248 * sockptr_t (above) there is no clean way to
249 * pass kernel address to setsockopt. We could
250 * use get_fs()/set_fs(), but in this particular
251 * situation there is an easier way. It depends
252 * on the fact that at least for these few
253 * kernels a NULL address to ipv6_setsockopt()
254 * is treated like the address of a zero.
256 if (ipv6_only_sock(sock->sk) && !val) {
259 sock->ops->setsockopt(sock,
260 IPPROTO_IPV6, IPV6_V6ONLY,
261 optval, sizeof(val));
263 #endif /* HAVE_KERNEL_SETSOCKOPT */
/* With both an interface and a remote address, ask ipv6_dev_get_saddr() for a source address. */
265 if (interface >= 0 && remaddr) {
266 struct sockaddr_in6 *rem = (void *)remaddr;
268 ipv6_dev_get_saddr(ns,
274 sin6->sin6_port = htons(local_port);
277 #endif /* IS_ENABLED(CONFIG_IPV6) */
/* Bind to the local address/port prepared in locaddr. */
279 rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
/* A busy port is only a debug-level event here. */
281 if (rc == -EADDRINUSE) {
282 CDEBUG(D_NET, "Port %d already in use\n", local_port);
286 CERROR("Error trying to bind to %pISc/%d: rc = %d\n",
287 &locaddr, local_port, rc);
299 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
301 struct sock *sk = sock->sk;
303 if (txbufsize != 0) {
304 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
305 sk->sk_sndbuf = txbufsize;
306 sk->sk_write_space(sk);
309 if (rxbufsize != 0) {
310 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
311 sk->sk_sndbuf = rxbufsize;
314 EXPORT_SYMBOL(lnet_sock_setbuf);
/*
 * lnet_sock_getaddr() - fetch the local or remote address of @sock.
 *
 * @sock:   socket to query.
 * @remote: true to query the peer address (lnet_kernel_getpeername()),
 *          false to query the local address (lnet_kernel_getsockname()).
 * @peer:   storage that receives the address (used for the local address
 *          too, despite the name).
 *
 * A failure of the query is logged with CERROR. When the result is an
 * AF_INET6 address that is IPv4-mapped, the storage is rewritten in place
 * as an AF_INET sockaddr_in carrying the embedded IPv4 address.
 *
 * NOTE(review): the return type, the declaration of rc, the branch
 * structure around the two queries and the return statements are not
 * visible here - presumably 0 on success and a negative errno otherwise;
 * confirm against the complete function.
 */
317 lnet_sock_getaddr(struct socket *sock, bool remote,
318 struct sockaddr_storage *peer)
321 #ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
322 int len = sizeof(*peer);
326 rc = lnet_kernel_getpeername(sock,
327 (struct sockaddr *)peer, &len);
329 rc = lnet_kernel_getsockname(sock,
330 (struct sockaddr *)peer, &len);
332 CERROR("Error %d getting sock %s IP/port\n",
333 rc, remote ? "peer" : "local");
336 if (peer->ss_family == AF_INET6) {
/* 'in6' and 'in' are two views of the same storage. */
337 struct sockaddr_in6 *in6 = (void *)peer;
338 struct sockaddr_in *in = (void *)peer;
/* Saved up front: the memset() below wipes the start of the shared storage. */
339 short port = in6->sin6_port;
341 if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
342 /* Pretend it is a v4 socket */
343 memset(in, 0, sizeof(*in));
344 in->sin_family = AF_INET;
/* A v4-mapped address carries the IPv4 address in its last 32-bit word. */
346 memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
351 EXPORT_SYMBOL(lnet_sock_getaddr);
353 void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
355 if (txbufsize != NULL)
356 *txbufsize = sock->sk->sk_sndbuf;
358 if (rxbufsize != NULL)
359 *rxbufsize = sock->sk->sk_rcvbuf;
361 EXPORT_SYMBOL(lnet_sock_getbuf);
/*
 * lnet_sock_listen() - create a socket bound to @local_port and start
 * listening on it.
 *
 * @local_port: port handed to lnet_sock_create() for binding.
 * @backlog:    backlog handed to kernel_listen().
 * @ns:         network namespace handed to lnet_sock_create().
 *
 * The socket is created with no specific interface (-1) and no remote
 * address (NULL). A creation failure of -EADDRINUSE gets its own CERROR
 * message; a kernel_listen() failure is logged with the backlog and the
 * error code.
 *
 * NOTE(review): the return type, the declarations of sock/rc, the error
 * checks and the return statements are not visible here - presumably the
 * listening socket is returned on success and an error pointer otherwise;
 * confirm against the complete function.
 */
364 lnet_sock_listen(int local_port, int backlog, struct net *ns)
369 sock = lnet_sock_create(-1, NULL, local_port, ns);
372 if (rc == -EADDRINUSE)
373 CERROR("Can't create socket: port %d already in use\n",
378 rc = kernel_listen(sock, backlog);
382 CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
/*
 * lnet_sock_connect() - create a socket and connect it to @peeraddr.
 *
 * @interface:  interface index handed to lnet_sock_create().
 * @local_port: local port handed to lnet_sock_create().
 * @peeraddr:   address of the peer to connect to.
 *
 * For PF_INET6 peers the socket is told to prefer public source addresses
 * (IPV6_PREFER_SRC_PUBLIC) before connecting. A connect failure is logged
 * with CDEBUG_LIMIT, at D_NET level for -EADDRNOTAVAIL and at D_NETERROR
 * level for anything else.
 *
 * NOTE(review): kernel_connect() is always given
 * sizeof(struct sockaddr_in6) as the address length, whatever the family
 * of @peeraddr - confirm that callers always pass storage at least that
 * large. The remaining parameter(s), the return type, error checks and
 * return statements are not visible here; confirm against the complete
 * function.
 */
388 lnet_sock_connect(int interface, int local_port,
389 struct sockaddr *peeraddr,
395 sock = lnet_sock_create(interface, peeraddr, local_port, ns);
399 /* Avoid temporary address, they are bad for long-lived
400 * connections such as lustre mounts.
401 * RFC4941, section 3.6 suggests that:
402 * Individual applications, which have specific
403 * knowledge about the normal duration of connections,
404 * MAY override this as appropriate.
406 if (peeraddr->sa_family == PF_INET6)
407 ip6_sock_set_addr_preferences(sock->sk,
408 IPV6_PREFER_SRC_PUBLIC);
410 rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
414 /* EADDRNOTAVAIL probably means we're already connected to the same
415 * peer/port on the same local port on a differently typed
416 * connection. Let our caller retry with a different local
419 CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
420 "Error %d connecting %d -> %pIScp\n", rc,
421 local_port, peeraddr);
/*
 * lnet_inet4_enumerate() - record the IPv4 addresses of @dev in *@dev_list.
 *
 * @dev:      network device to inspect.
 * @flags:    device flags; only IFF_MASTER is examined here.
 * @nalloc:   in/out capacity of the array, grown by LNET_INTERFACES_NUM
 *            entries whenever @nip reaches it.
 * @nip:      index of the slot to fill.
 * @cpt:      value stored in li_cpt of each entry written.
 * @dev_list: pointer to the array pointer (read into 'ifaces' on entry).
 *
 * Each entry records the CPT, whether the device is a master, the address
 * size, the interface index, the address, the netmask and the address
 * label as name.
 *
 * NOTE(review): the _rtnl helpers suggest the caller holds the RTNL lock,
 * but that is not visible here. The handling of a krealloc() failure, the
 * update of @nip / *@dev_list and the return value are not visible either -
 * presumably the number of entries is returned; confirm against the
 * complete function.
 */
427 static int lnet_inet4_enumerate(struct net_device *dev, int flags,
428 int *nalloc, int nip, int cpt,
429 struct lnet_inetdev **dev_list)
431 struct lnet_inetdev *ifaces = *dev_list;
432 struct in_device *in_dev;
433 DECLARE_CONST_IN_IFADDR(ifa);
435 in_dev = __in_dev_get_rtnl(dev);
437 CWARN("lnet: Interface %s has no IPv4 status.\n",
442 in_dev_for_each_ifa_rtnl(ifa, in_dev) {
/* Array full: grow it by another LNET_INTERFACES_NUM entries. */
443 if (nip >= *nalloc) {
444 struct lnet_inetdev *tmp;
446 *nalloc += LNET_INTERFACES_NUM;
/* Result goes to a temporary so 'ifaces' is not lost if krealloc() fails. */
447 tmp = krealloc(ifaces, *nalloc * sizeof(*tmp),
457 ifaces[nip].li_cpt = cpt;
458 ifaces[nip].li_iff_master = !!(flags & IFF_MASTER);
459 ifaces[nip].li_size = sizeof(ifa->ifa_local);
460 ifaces[nip].li_index = dev->ifindex;
/* The address is stored as found in ifa_local; only the netmask goes through ntohl(). */
461 ifaces[nip].li_ipaddr = ifa->ifa_local;
462 ifaces[nip].li_netmask = ntohl(ifa->ifa_mask);
463 strscpy(ifaces[nip].li_name, ifa->ifa_label,
464 sizeof(ifaces[nip].li_name));
/*
 * lnet_inet6_enumerate() - record the IPv6 addresses of @dev in *@dev_list.
 *
 * @dev:      network device to inspect.
 * @flags:    device flags; only IFF_MASTER is examined here.
 * @nalloc:   in/out capacity of the array, grown by LNET_INTERFACES_NUM
 *            entries whenever @nip reaches it.
 * @nip:      index of the slot to fill.
 * @cpt:      value stored in li_cpt of each entry written.
 * @dev_list: pointer to the array pointer (read into 'ifaces' on entry).
 *
 * The body is compiled only when CONFIG_IPV6 is enabled. Addresses flagged
 * IFA_F_TEMPORARY and link-local addresses are tested for explicitly
 * (presumably to skip them). Each entry records the CPT, whether the device
 * is a master, the address size, the interface index, the IPv6 address and
 * the device name.
 *
 * NOTE(review): the handling of a krealloc() failure, the update of
 * @nip / *@dev_list and the return value are not visible here - presumably
 * the number of entries is returned; confirm against the complete function.
 */
474 static int lnet_inet6_enumerate(struct net_device *dev, int flags,
475 int *nalloc, int nip, int cpt,
476 struct lnet_inetdev **dev_list)
478 #if IS_ENABLED(CONFIG_IPV6)
479 struct lnet_inetdev *ifaces = *dev_list;
480 const struct inet6_ifaddr *ifa6;
481 struct inet6_dev *in6_dev;
483 in6_dev = __in6_dev_get(dev);
485 CWARN("lnet: Interface %s has no IPv6 status.\n",
490 list_for_each_entry_rcu(ifa6, &in6_dev->addr_list, if_list) {
491 if (ifa6->flags & IFA_F_TEMPORARY)
494 if (ipv6_addr_type(&ifa6->addr) & IPV6_ADDR_LINKLOCAL)
/* Array full: grow it by another LNET_INTERFACES_NUM entries. */
497 if (nip >= *nalloc) {
498 struct lnet_inetdev *tmp;
500 *nalloc += LNET_INTERFACES_NUM;
/* Result goes to a temporary so 'ifaces' is not lost if krealloc() fails. */
501 tmp = krealloc(ifaces, *nalloc * sizeof(*tmp),
511 ifaces[nip].li_cpt = cpt;
512 ifaces[nip].li_iff_master = !!(flags & IFF_MASTER);
513 ifaces[nip].li_size = sizeof(struct in6_addr);
514 ifaces[nip].li_index = dev->ifindex;
515 memcpy(ifaces[nip].li_ipv6addr,
516 &ifa6->addr, sizeof(struct in6_addr));
/* Unlike the IPv4 path, the name comes from the device, not from an address label. */
517 strscpy(ifaces[nip].li_name, dev->name,
518 sizeof(ifaces[nip].li_name));
520 /* As different IPv6 addresses don't have unique
521 * labels, it is safest just to use the first
522 * and ignore the rest.
528 #endif /* IS_ENABLED(CONFIG_IPV6) */
/*
 * lnet_inet_enumerate() - collect the usable IP interfaces of a namespace.
 *
 * @dev_list: receives the array of struct lnet_inetdev entries.
 * @ns:       network namespace whose devices are walked with
 *            for_each_netdev().
 *
 * Loopback devices are skipped and devices that are not IFF_UP are reported
 * with a warning. For each remaining device the CPT is derived from the
 * device's NUMA node (dev_to_node() + cfs_cpt_of_node()), and the device's
 * addresses are gathered through lnet_inet6_enumerate() and
 * lnet_inet4_enumerate(). Two call orders exist (IPv6 first, or IPv4
 * first); a warning is printed when a family yields no addresses. When
 * nothing usable is found an error mentioning -ENOENT is logged.
 *
 * NOTE(review): the final parameter(s), the condition choosing between the
 * two call orders, the locking around the device walk and the return
 * statements are not visible here - presumably the entry count is returned
 * on success and a negative errno on failure; confirm against the complete
 * function.
 */
532 int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns,
535 struct lnet_inetdev *ifaces = NULL;
536 struct net_device *dev;
541 for_each_netdev(ns, dev) {
542 int flags = dev_get_flags(dev);
546 if (flags & IFF_LOOPBACK) /* skip the loopback IF */
549 if (!(flags & IFF_UP)) {
550 CWARN("lnet: Ignoring interface %s: it's down\n",
/* Map the device's NUMA node to an LNet CPU partition. */
555 node_id = dev_to_node(&dev->dev);
556 cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
/* First ordering: IPv6 addresses, then IPv4. */
559 count = lnet_inet6_enumerate(dev, flags, &nalloc, nip,
562 CWARN("lnet: No IPv6 addresses for interface %s.\n",
567 count = lnet_inet4_enumerate(dev, flags, &nalloc, nip,
570 CWARN("lnet: No IPv4 addresses for interface %s.\n",
/* Second ordering: IPv4 addresses, then IPv6. */
575 count = lnet_inet4_enumerate(dev, flags, &nalloc, nip,
578 CWARN("lnet: No IPv4 addresses for interface %s.\n",
583 count = lnet_inet6_enumerate(dev, flags, &nalloc, nip,
586 CWARN("lnet: No IPv6 addresses for interface %s.\n",
595 CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n");
602 EXPORT_SYMBOL(lnet_inet_enumerate);