Whamcloud - gitweb
LU-6142 lnet: SPDX for lnet/lnet/
[fs/lustre-release.git] / lnet / lnet / lib-socket.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
4  * Use is subject to license terms.
5  *
6  * Copyright (c) 2015, 2017, Intel Corporation.
7  */
8
9 /* This file is part of Lustre, http://www.lustre.org/ */
10
11 #define DEBUG_SUBSYSTEM S_LNET
12
13 #include <linux/if.h>
14 #include <linux/in.h>
15 #include <linux/net.h>
16 #include <net/addrconf.h>
17 #include <net/ipv6.h>
18 #include <linux/file.h>
19 #include <linux/pagemap.h>
20 /* For sys_open & sys_close */
21 #include <linux/syscalls.h>
22 #include <net/sock.h>
23 #include <linux/inetdevice.h>
24
25 #include <libcfs/linux/linux-time.h>
26 #include <libcfs/linux/linux-net.h>
27 #include <libcfs/libcfs.h>
28 #include <lnet/lib-lnet.h>
29
30 int
31 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
32 {
33         int rc;
34         long jiffies_left = cfs_time_seconds(timeout);
35         unsigned long then;
36
37         LASSERT(nob > 0);
38         /* Caller may pass a zero timeout if she thinks the socket buffer is
39          * empty enough to take the whole message immediately */
40
41         for (;;) {
42                 struct kvec  iov = {
43                         .iov_base = buffer,
44                         .iov_len  = nob
45                 };
46                 struct msghdr msg = {
47                         .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
48                 };
49
50                 if (timeout != 0) {
51                         struct sock *sk = sock->sk;
52
53                         /* Set send timeout to remaining time */
54                         lock_sock(sk);
55                         sk->sk_sndtimeo = jiffies_left;
56                         release_sock(sk);
57                 }
58
59                 then = jiffies;
60                 rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
61                 jiffies_left -= jiffies - then;
62
63                 if (rc == nob)
64                         return 0;
65
66                 if (rc < 0)
67                         return rc;
68
69                 if (rc == 0) {
70                         CERROR("Unexpected zero rc\n");
71                         return -ECONNABORTED;
72                 }
73
74                 if (jiffies_left <= 0)
75                         return -EAGAIN;
76
77                 buffer = ((char *)buffer) + rc;
78                 nob -= rc;
79         }
80         return 0;
81 }
82 EXPORT_SYMBOL(lnet_sock_write);
83
84 int
85 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
86 {
87         int rc;
88         long jiffies_left = cfs_time_seconds(timeout);
89         unsigned long then;
90
91         LASSERT(nob > 0);
92         LASSERT(jiffies_left > 0);
93
94         for (;;) {
95                 struct kvec  iov = {
96                         .iov_base = buffer,
97                         .iov_len  = nob
98                 };
99                 struct msghdr msg = {
100                         .msg_flags      = 0
101                 };
102                 struct sock *sk = sock->sk;
103
104                 /* Set receive timeout to remaining time */
105                 lock_sock(sk);
106                 sk->sk_rcvtimeo = jiffies_left;
107                 release_sock(sk);
108
109                 then = jiffies;
110                 rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
111                 jiffies_left -= jiffies - then;
112
113                 if (rc < 0)
114                         return rc;
115
116                 if (rc == 0)
117                         return -ECONNRESET;
118
119                 buffer = ((char *)buffer) + rc;
120                 nob -= rc;
121
122                 if (nob == 0)
123                         return 0;
124
125                 if (jiffies_left <= 0)
126                         return -ETIMEDOUT;
127         }
128 }
129 EXPORT_SYMBOL(lnet_sock_read);
130
131 int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
132 {
133         struct net_device *dev;
134         struct in_device *in_dev;
135         int err;
136         DECLARE_CONST_IN_IFADDR(ifa);
137
138         rcu_read_lock();
139         dev = dev_get_by_index_rcu(ns, interface);
140         err = -EINVAL;
141         if (!dev || !(dev->flags & IFF_UP))
142                 goto out;
143         in_dev = __in_dev_get_rcu(dev);
144         if (!in_dev)
145                 goto out;
146         err = -ENOENT;
147         in_dev_for_each_ifa_rcu(ifa, in_dev) {
148                 if (err ||
149                     ((dst_ipaddr ^ ntohl(ifa->ifa_local))
150                      & ntohl(ifa->ifa_mask)) == 0) {
151                         /* This address at least as good as what we
152                          * already have
153                          */
154                         *ret = ntohl(ifa->ifa_local);
155                         err = 0;
156                 }
157         }
158         endfor_ifa(in_dev);
159 out:
160         rcu_read_unlock();
161         return err;
162 }
163 EXPORT_SYMBOL(choose_ipv4_src);
164
165 static struct socket *
166 lnet_sock_create(int interface, struct sockaddr *remaddr,
167                  int local_port, struct net *ns)
168 {
169         struct socket *sock;
170         int rc;
171         int family;
172
173         family = AF_INET6;
174         if (remaddr)
175                 family = remaddr->sa_family;
176 retry:
177 #ifdef HAVE_SOCK_CREATE_KERN_USE_NET
178         rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
179 #else
180         rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
181 #endif
182         if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
183                 family = AF_INET;
184                 goto retry;
185         }
186
187         if (rc) {
188                 CERROR("Can't create socket: %d\n", rc);
189                 return ERR_PTR(rc);
190         }
191
192         sock->sk->sk_reuseport = 1;
193
194         if (interface >= 0 || local_port != 0) {
195                 struct sockaddr_storage locaddr = {};
196
197                 switch (family) {
198                 case AF_INET: {
199                         struct sockaddr_in *sin = (void *)&locaddr;
200
201                         sin->sin_family = AF_INET;
202                         sin->sin_addr.s_addr = INADDR_ANY;
203                         if (interface >= 0 && remaddr) {
204                                 struct sockaddr_in *rem = (void *)remaddr;
205                                 __u32 ip;
206
207                                 rc = choose_ipv4_src(&ip,
208                                                      interface,
209                                                      ntohl(rem->sin_addr.s_addr),
210                                                      ns);
211                                 if (rc)
212                                         goto failed;
213                                 sin->sin_addr.s_addr = htonl(ip);
214                         }
215                         sin->sin_port = htons(local_port);
216                         break;
217                 }
218 #if IS_ENABLED(CONFIG_IPV6)
219                 case AF_INET6: {
220                         struct sockaddr_in6 *sin6 = (void *)&locaddr;
221                         int val = 0;
222
223                         sin6->sin6_family = AF_INET6;
224                         sin6->sin6_addr = in6addr_any;
225
226                         /* Make sure we get both IPv4 and IPv6 connections.
227                          * This is the default, but it can be overridden so we
228                          * force it back.
229                          */
230 #ifdef HAVE_KERNEL_SETSOCKOPT
231                         kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
232                                           (char *) &val, sizeof(val));
233 #elif defined(_LINUX_SOCKPTR_H)
234                         /* sockptr_t was introduced around
235                          * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
236                          * kernel address to be passed to ->setsockopt
237                          */
238                         if (ipv6_only_sock(sock->sk)) {
239                                 sockptr_t optval = KERNEL_SOCKPTR(&val);
240
241                                 sock->ops->setsockopt(sock,
242                                                       IPPROTO_IPV6, IPV6_V6ONLY,
243                                                       optval, sizeof(val));
244                         }
245 #else
246                         /* From v5.7-rc6-2614-g5a892ff2facb when
247                          * kernel_setsockopt() was removed until
248                          * sockptr_t (above) there is no clean way to
249                          * pass kernel address to setsockopt.  We could
250                          * use get_fs()/set_fs(), but in this particular
251                          * situation there is an easier way.  It depends
252                          * on the fact that at least for these few
253                          * kernels a NULL address to ipv6_setsockopt()
254                          * is treated like the address of a zero.
255                          */
256                         if (ipv6_only_sock(sock->sk) && !val) {
257                                 void *optval = NULL;
258
259                                 sock->ops->setsockopt(sock,
260                                                       IPPROTO_IPV6, IPV6_V6ONLY,
261                                                       optval, sizeof(val));
262                         }
263 #endif /* HAVE_KERNEL_SETSOCKOPT */
264
265                         if (interface >= 0 && remaddr) {
266                                 struct sockaddr_in6 *rem = (void *)remaddr;
267
268                                 ipv6_dev_get_saddr(ns,
269                                                    dev_get_by_index(ns,
270                                                                     interface),
271                                                    &rem->sin6_addr, 0,
272                                                    &sin6->sin6_addr);
273                         }
274                         sin6->sin6_port = htons(local_port);
275                         break;
276                 }
277 #endif /* IS_ENABLED(CONFIG_IPV6) */
278                 }
279                 rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
280                                  sizeof(locaddr));
281                 if (rc == -EADDRINUSE) {
282                         CDEBUG(D_NET, "Port %d already in use\n", local_port);
283                         goto failed;
284                 }
285                 if (rc != 0) {
286                         CERROR("Error trying to bind to %pISc/%d: rc = %d\n",
287                                &locaddr, local_port, rc);
288                         goto failed;
289                 }
290         }
291         return sock;
292
293 failed:
294         sock_release(sock);
295         return ERR_PTR(rc);
296 }
297
298 void
299 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
300 {
301         struct sock *sk = sock->sk;
302
303         if (txbufsize != 0) {
304                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
305                 sk->sk_sndbuf = txbufsize;
306                 sk->sk_write_space(sk);
307         }
308
309         if (rxbufsize != 0) {
310                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
311                 sk->sk_sndbuf = rxbufsize;
312         }
313 }
314 EXPORT_SYMBOL(lnet_sock_setbuf);
315
316 int
317 lnet_sock_getaddr(struct socket *sock, bool remote,
318                   struct sockaddr_storage *peer)
319 {
320         int rc;
321 #ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
322         int len = sizeof(*peer);
323 #endif
324
325         if (remote)
326                 rc = lnet_kernel_getpeername(sock,
327                                              (struct sockaddr *)peer, &len);
328         else
329                 rc = lnet_kernel_getsockname(sock,
330                                              (struct sockaddr *)peer, &len);
331         if (rc < 0) {
332                 CERROR("Error %d getting sock %s IP/port\n",
333                         rc, remote ? "peer" : "local");
334                 return rc;
335         }
336         if (peer->ss_family == AF_INET6) {
337                 struct sockaddr_in6 *in6 = (void *)peer;
338                 struct sockaddr_in *in = (void *)peer;
339                 short port = in6->sin6_port;
340
341                 if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
342                         /* Pretend it is a v4 socket */
343                         memset(in, 0, sizeof(*in));
344                         in->sin_family = AF_INET;
345                         in->sin_port = port;
346                         memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
347                 }
348         }
349         return 0;
350 }
351 EXPORT_SYMBOL(lnet_sock_getaddr);
352
353 void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
354 {
355         if (txbufsize != NULL)
356                 *txbufsize = sock->sk->sk_sndbuf;
357
358         if (rxbufsize != NULL)
359                 *rxbufsize = sock->sk->sk_rcvbuf;
360 }
361 EXPORT_SYMBOL(lnet_sock_getbuf);
362
363 struct socket *
364 lnet_sock_listen(int local_port, int backlog, struct net *ns)
365 {
366         struct socket *sock;
367         int rc;
368
369         sock = lnet_sock_create(-1, NULL, local_port, ns);
370         if (IS_ERR(sock)) {
371                 rc = PTR_ERR(sock);
372                 if (rc == -EADDRINUSE)
373                         CERROR("Can't create socket: port %d already in use\n",
374                                local_port);
375                 return ERR_PTR(rc);
376         }
377
378         rc = kernel_listen(sock, backlog);
379         if (rc == 0)
380                 return sock;
381
382         CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
383         sock_release(sock);
384         return ERR_PTR(rc);
385 }
386
387 struct socket *
388 lnet_sock_connect(int interface, int local_port,
389                   struct sockaddr *peeraddr,
390                   struct net *ns)
391 {
392         struct socket *sock;
393         int rc;
394
395         sock = lnet_sock_create(interface, peeraddr, local_port, ns);
396         if (IS_ERR(sock))
397                 return sock;
398
399         /* Avoid temporary address, they are bad for long-lived
400          * connections such as lustre mounts.
401          * RFC4941, section 3.6 suggests that:
402          *    Individual applications, which have specific
403          *    knowledge about the normal duration of connections,
404          *    MAY override this as appropriate.
405          */
406         if (peeraddr->sa_family == PF_INET6)
407                 ip6_sock_set_addr_preferences(sock->sk,
408                                               IPV6_PREFER_SRC_PUBLIC);
409
410         rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
411         if (rc == 0)
412                 return sock;
413
414         /* EADDRNOTAVAIL probably means we're already connected to the same
415          * peer/port on the same local port on a differently typed
416          * connection.  Let our caller retry with a different local
417          * port... */
418
419         CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
420                      "Error %d connecting %d -> %pIScp\n", rc,
421                      local_port, peeraddr);
422
423         sock_release(sock);
424         return ERR_PTR(rc);
425 }