1 // SPDX-License-Identifier: GPL-2.0
3 /* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
4 * Use is subject to license terms.
6 * Copyright (c) 2015, 2017, Intel Corporation.
9 /* This file is part of Lustre, http://www.lustre.org/ */
11 #define DEBUG_SUBSYSTEM S_LNET
15 #include <linux/net.h>
16 #include <net/addrconf.h>
18 #include <linux/file.h>
19 #include <linux/pagemap.h>
20 /* For sys_open & sys_close */
21 #include <linux/syscalls.h>
23 #include <linux/inetdevice.h>
25 #include <libcfs/linux/linux-time.h>
26 #include <libcfs/linux/linux-net.h>
27 #include <libcfs/libcfs.h>
28 #include <lnet/lib-lnet.h>
/*
 * lnet_sock_write - send @nob bytes starting at @buffer over @sock.
 *
 * @timeout is passed through cfs_time_seconds() to obtain a time budget
 * (jiffies_left).  The budget is installed as the socket send timeout
 * before kernel_sendmsg() and reduced by the jiffies elapsed across the
 * call.  A @timeout of 0 requests a non-blocking send (MSG_DONTWAIT).
 *
 * NOTE(review): only part of this function is visible in this view; the
 * surrounding loop, the handling of @nob between iterations and the
 * return values are not shown - confirm against the complete file.
 */
31 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
/* Remaining time budget for the whole write. */
34 long jiffies_left = cfs_time_seconds(timeout);
38 /* Caller may pass a zero timeout if she thinks the socket buffer is
39 * empty enough to take the whole message immediately */
/* Zero timeout => do not block in the send path. */
47 .msg_flags = (timeout == 0) ? MSG_DONTWAIT : 0
51 struct sock *sk = sock->sk;
53 /* Set send timeout to remaining time */
55 sk->sk_sndtimeo = jiffies_left;
60 rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
/* Charge the time spent sending; presumably 'then' was sampled from
 * jiffies just before the call (assignment not visible here). */
61 jiffies_left -= jiffies - then;
/* Presumably reached when kernel_sendmsg() returned 0 - the guarding
 * condition is not visible in this view. */
70 CERROR("Unexpected zero rc\n");
/* Time budget used up; the action taken is not visible here. */
74 if (jiffies_left <= 0)
/* Step past the rc bytes that were just sent. */
77 buffer = ((char *)buffer) + rc;
82 EXPORT_SYMBOL(lnet_sock_write);
/*
 * lnet_sock_read - receive @nob bytes from @sock into @buffer.
 *
 * @timeout is passed through cfs_time_seconds() to obtain a time budget
 * (jiffies_left), which is asserted to be positive.  The budget is
 * installed as the socket receive timeout before kernel_recvmsg() and
 * reduced by the jiffies elapsed across the call.
 *
 * NOTE(review): only part of this function is visible in this view; the
 * surrounding loop, error handling and return values are not shown -
 * confirm against the complete file.
 */
85 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
/* Remaining time budget for the whole read. */
88 long jiffies_left = cfs_time_seconds(timeout);
/* Unlike the write path, a non-positive budget is not accepted here. */
92 LASSERT(jiffies_left > 0);
102 struct sock *sk = sock->sk;
104 /* Set receive timeout to remaining time */
106 sk->sk_rcvtimeo = jiffies_left;
110 rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
/* Charge the time spent receiving; presumably 'then' was sampled from
 * jiffies just before the call (assignment not visible here). */
111 jiffies_left -= jiffies - then;
/* Step past the rc bytes that were just received. */
119 buffer = ((char *)buffer) + rc;
/* Time budget used up; the action taken is not visible here. */
125 if (jiffies_left <= 0)
129 EXPORT_SYMBOL(lnet_sock_read);
/*
 * choose_ipv4_src - pick a local IPv4 address on an interface.
 *
 * @ret:        receives the chosen address, converted with ntohl()
 * @interface:  interface index, looked up in @ns
 * @dst_ipaddr: destination address; it is compared against
 *              ntohl(ifa_local), i.e. in the same byte order as *@ret
 * @ns:         network namespace used for the device lookup
 *
 * The device must exist and be IFF_UP.  Its IPv4 addresses are walked
 * and an address whose network (under ifa_mask) matches @dst_ipaddr is
 * stored in *@ret.
 *
 * NOTE(review): the first half of the selection condition, the RCU
 * read-side locking implied by the *_rcu accessors, and the return
 * values are not visible in this view - confirm against the full file.
 */
131 int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
133 struct net_device *dev;
134 struct in_device *in_dev;
136 DECLARE_CONST_IN_IFADDR(ifa);
139 dev = dev_get_by_index_rcu(ns, interface);
/* Reject a missing or administratively-down interface. */
141 if (!dev || !(dev->flags & IFF_UP))
143 in_dev = __in_dev_get_rcu(dev);
147 in_dev_for_each_ifa_rcu(ifa, in_dev) {
/* XOR then mask is zero exactly when destination and local address
 * agree on every bit covered by the netmask. */
149 ((dst_ipaddr ^ ntohl(ifa->ifa_local))
150 & ntohl(ifa->ifa_mask)) == 0) {
151 /* This address at least as good as what we
154 *ret = ntohl(ifa->ifa_local);
163 EXPORT_SYMBOL(choose_ipv4_src);
// lnet_sock_create - create a kernel SOCK_STREAM socket and optionally
// bind it to a local address/port.
//
// @interface:  interface index used to pick a source address; a negative
//              value means no interface-specific source address
// @remaddr:    remote address; supplies the address family and the
//              destination used for source-address selection.  Callers in
//              this file also pass NULL, and the code tests !remaddr, so
//              the sa_family read below is presumably guarded by lines
//              that are not visible in this view - TODO confirm
// @local_port: local port to bind to (host byte order; htons() applied)
// @ns:         network namespace for socket creation and address lookup
//
// A bind is attempted only when @interface >= 0 or @local_port != 0.
// -EADDRINUSE from the bind is logged at D_NET debug level; other bind
// failures are logged with CERROR.
//
// NOTE(review): many lines of this function are missing from this view
// (retry/fallback handling, error paths, return statements); the notes
// here cover only what the visible lines show.
165 static struct socket *
166 lnet_sock_create(int interface, struct sockaddr *remaddr,
167 int local_port, struct net *ns)
175 family = remaddr->sa_family;
// sock_create_kern() takes the namespace as first argument only on
// kernels where HAVE_SOCK_CREATE_KERN_USE_NET is defined.
177 #ifdef HAVE_SOCK_CREATE_KERN_USE_NET
178 rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
180 rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
// IPv6 unavailable and no specific peer requested; the handling is not
// visible here (presumably a fallback to another family - TODO confirm).
182 if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
188 CERROR("Can't create socket: %d\n", rc);
192 sock->sk->sk_reuseport = 1;
// Only bind when the caller constrained the interface or the port.
194 if (interface >= 0 || local_port != 0) {
195 struct sockaddr_storage locaddr = {};
199 struct sockaddr_in *sin = (void *)&locaddr;
201 sin->sin_family = AF_INET;
202 sin->sin_addr.s_addr = INADDR_ANY;
// With both an interface and a peer, replace INADDR_ANY by a source
// address chosen on that interface for this destination.
203 if (interface >= 0 && remaddr) {
204 struct sockaddr_in *rem = (void *)remaddr;
207 rc = choose_ipv4_src(&ip,
209 ntohl(rem->sin_addr.s_addr),
213 sin->sin_addr.s_addr = htonl(ip);
215 sin->sin_port = htons(local_port);
218 #if IS_ENABLED(CONFIG_IPV6)
220 struct sockaddr_in6 *sin6 = (void *)&locaddr;
223 sin6->sin6_family = AF_INET6;
224 sin6->sin6_addr = in6addr_any;
226 /* Make sure we get both IPv4 and IPv6 connections.
227 * This is the default, but it can be overridden so we
230 #ifdef HAVE_KERNEL_SETSOCKOPT
231 kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
232 (char *) &val, sizeof(val));
233 #elif defined(_LINUX_SOCKPTR_H)
234 /* sockptr_t was introduced around
235 * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
236 * kernel address to be passed to ->setsockopt
238 if (ipv6_only_sock(sock->sk)) {
239 sockptr_t optval = KERNEL_SOCKPTR(&val);
241 sock->ops->setsockopt(sock,
242 IPPROTO_IPV6, IPV6_V6ONLY,
243 optval, sizeof(val));
246 /* From v5.7-rc6-2614-g5a892ff2facb when
247 * kernel_setsockopt() was removed until
248 * sockptr_t (above) there is no clean way to
249 * pass kernel address to setsockopt. We could
250 * use get_fs()/set_fs(), but in this particular
251 * situation there is an easier way. It depends
252 * on the fact that at least for these few
253 * kernels a NULL address to ipv6_setsockopt()
254 * is treated like the address of a zero.
256 if (ipv6_only_sock(sock->sk) && !val) {
259 sock->ops->setsockopt(sock,
260 IPPROTO_IPV6, IPV6_V6ONLY,
261 optval, sizeof(val));
263 #endif /* HAVE_KERNEL_SETSOCKOPT */
// IPv6 counterpart of the source-address selection done for IPv4 above.
265 if (interface >= 0 && remaddr) {
266 struct sockaddr_in6 *rem = (void *)remaddr;
268 ipv6_dev_get_saddr(ns,
274 sin6->sin6_port = htons(local_port);
277 #endif /* IS_ENABLED(CONFIG_IPV6) */
279 rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
// A busy port is only worth a debug message; what the function then
// returns is not visible in this view.
281 if (rc == -EADDRINUSE) {
282 CDEBUG(D_NET, "Port %d already in use\n", local_port);
286 CERROR("Error trying to bind to %pISc/%d: rc = %d\n",
287 &locaddr, local_port, rc);
/*
 * lnet_sock_setbuf - set explicit send/receive buffer sizes on @sock.
 *
 * @sock:      socket whose struct sock is updated
 * @txbufsize: new sk_sndbuf value; 0 leaves the send buffer untouched
 * @rxbufsize: new sk_rcvbuf value; 0 leaves the receive buffer untouched
 *
 * Every size that is applied also sets the matching SOCK_SNDBUF_LOCK /
 * SOCK_RCVBUF_LOCK bit in sk_userlocks (presumably so the kernel treats
 * the size as user-chosen - confirm against the kernel documentation).
 *
 * NOTE(review): the return type and return statement are not visible in
 * this view.
 */
299 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
301 struct sock *sk = sock->sk;
303 if (txbufsize != 0) {
304 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
305 sk->sk_sndbuf = txbufsize;
/* Invoke the socket's write-space callback after resizing the send
 * buffer. */
306 sk->sk_write_space(sk);
309 if (rxbufsize != 0) {
310 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
/* The receive size belongs in sk_rcvbuf.  Storing it in sk_sndbuf would
 * clobber the send size set above and leave the receive buffer
 * unchanged. */
311 sk->sk_rcvbuf = rxbufsize;
314 EXPORT_SYMBOL(lnet_sock_setbuf);
/*
 * lnet_sock_getaddr - fetch an address of @sock into @peer.
 *
 * @sock:   socket to query
 * @remote: selects which address is wanted; the error message uses
 *          "peer" when true and "local" when false, matching the
 *          lnet_kernel_getpeername()/lnet_kernel_getsockname() calls
 * @peer:   storage that receives the address
 *
 * If the result is an AF_INET6 address that is IPv4-mapped, @peer is
 * rewritten in place as an AF_INET sockaddr_in holding the embedded
 * IPv4 address.
 *
 * NOTE(review): the return type, the branch selecting between the two
 * getname calls and the return values are not visible in this view.
 */
317 lnet_sock_getaddr(struct socket *sock, bool remote,
318 struct sockaddr_storage *peer)
/* The length variable is only declared when the 2-argument getname
 * variant is NOT available. */
321 #ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
322 int len = sizeof(*peer);
326 rc = lnet_kernel_getpeername(sock,
327 (struct sockaddr *)peer, &len);
329 rc = lnet_kernel_getsockname(sock,
330 (struct sockaddr *)peer, &len);
332 CERROR("Error %d getting sock %s IP/port\n",
333 rc, remote ? "peer" : "local");
336 if (peer->ss_family == AF_INET6) {
/* 'in6' and 'in' are two views of the same @peer storage. */
337 struct sockaddr_in6 *in6 = (void *)peer;
338 struct sockaddr_in *in = (void *)peer;
/* Saved before the memset below wipes it through the aliasing 'in'
 * view.  NOTE(review): the line storing it back into sin_port is not
 * visible here. */
339 short port = in6->sin6_port;
341 if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
342 /* Pretend it is a v4 socket */
343 memset(in, 0, sizeof(*in));
344 in->sin_family = AF_INET;
/* The IPv4 address is the last 32-bit word of the mapped address. */
346 memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
351 EXPORT_SYMBOL(lnet_sock_getaddr);
/*
 * lnet_sock_getbuf - report the current buffer sizes of @sock.
 *
 * @sock:      socket to query
 * @txbufsize: if non-NULL, receives sock->sk->sk_sndbuf
 * @rxbufsize: if non-NULL, receives sock->sk->sk_rcvbuf
 *
 * Either output pointer may be NULL to skip that value.
 */
353 void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
355 if (txbufsize != NULL)
356 *txbufsize = sock->sk->sk_sndbuf;
358 if (rxbufsize != NULL)
359 *rxbufsize = sock->sk->sk_rcvbuf;
361 EXPORT_SYMBOL(lnet_sock_getbuf);
/*
 * lnet_sock_listen - create a listening socket on @local_port.
 *
 * @local_port: port handed to lnet_sock_create() for binding
 * @backlog:    backlog value passed to kernel_listen()
 * @ns:         network namespace handed to lnet_sock_create()
 *
 * The socket is created with interface -1 and a NULL remote address,
 * i.e. without an interface or peer for source-address selection.
 * Creation failure with -EADDRINUSE and kernel_listen() failure are
 * each reported with CERROR.
 *
 * NOTE(review): the return type, the cleanup on failure and the return
 * values are not visible in this view.
 */
364 lnet_sock_listen(int local_port, int backlog, struct net *ns)
369 sock = lnet_sock_create(-1, NULL, local_port, ns);
372 if (rc == -EADDRINUSE)
373 CERROR("Can't create socket: port %d already in use\n",
378 rc = kernel_listen(sock, backlog);
382 CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
388 lnet_sock_connect(int interface, int local_port,
389 struct sockaddr *peeraddr,
395 sock = lnet_sock_create(interface, peeraddr, local_port, ns);
399 /* Avoid temporary addresses; they are bad for long-lived
400 * connections such as lustre mounts.
401 * RFC4941, section 3.6 suggests that:
402 * Individual applications, which have specific
403 * knowledge about the normal duration of connections,
404 * MAY override this as appropriate.
406 if (peeraddr->sa_family == PF_INET6)
407 ip6_sock_set_addr_preferences(sock->sk,
408 IPV6_PREFER_SRC_PUBLIC);
410 rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
414 /* EADDRNOTAVAIL probably means we're already connected to the same
415 * peer/port on the same local port on a differently typed
416 * connection. Let our caller retry with a different local
419 CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
420 "Error %d connecting %d -> %pIScp\n", rc,
421 local_port, peeraddr);