Whamcloud - gitweb
New tag 2.15.63
[fs/lustre-release.git] / lnet / lnet / lib-socket.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
4  * Use is subject to license terms.
5  *
6  * Copyright (c) 2015, 2017, Intel Corporation.
7  */
8
9 /* This file is part of Lustre, http://www.lustre.org/ */
10
11 #define DEBUG_SUBSYSTEM S_LNET
12
13 #include <linux/if.h>
14 #include <linux/in.h>
15 #include <linux/net.h>
16 #include <net/addrconf.h>
17 #include <net/ipv6.h>
18 #include <linux/file.h>
19 #include <linux/pagemap.h>
20 /* For sys_open & sys_close */
21 #include <linux/syscalls.h>
22 #include <net/sock.h>
23 #include <linux/inetdevice.h>
24
25 #include <libcfs/linux/linux-time.h>
26 #include <libcfs/linux/linux-net.h>
27 #include <libcfs/libcfs.h>
28 #include <lnet/lib-lnet.h>
29
30 int
31 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
32 {
33         int rc;
34         long jiffies_left = cfs_time_seconds(timeout);
35         unsigned long then;
36
37         LASSERT(nob > 0);
38         /* Caller may pass a zero timeout if she thinks the socket buffer is
39          * empty enough to take the whole message immediately */
40
41         for (;;) {
42                 struct kvec  iov = {
43                         .iov_base = buffer,
44                         .iov_len  = nob
45                 };
46                 struct msghdr msg = {
47                         .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
48                 };
49
50                 if (timeout != 0) {
51                         struct sock *sk = sock->sk;
52
53                         /* Set send timeout to remaining time */
54                         lock_sock(sk);
55                         sk->sk_sndtimeo = jiffies_left;
56                         release_sock(sk);
57                 }
58
59                 then = jiffies;
60                 rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
61                 jiffies_left -= jiffies - then;
62
63                 if (rc == nob)
64                         return 0;
65
66                 if (rc < 0)
67                         return rc;
68
69                 if (rc == 0) {
70                         CERROR("Unexpected zero rc\n");
71                         return -ECONNABORTED;
72                 }
73
74                 if (jiffies_left <= 0)
75                         return -EAGAIN;
76
77                 buffer = ((char *)buffer) + rc;
78                 nob -= rc;
79         }
80         return 0;
81 }
82 EXPORT_SYMBOL(lnet_sock_write);
83
84 int
85 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
86 {
87         int rc;
88         long jiffies_left = cfs_time_seconds(timeout);
89         unsigned long then;
90
91         LASSERT(nob > 0);
92         LASSERT(jiffies_left > 0);
93
94         for (;;) {
95                 struct kvec  iov = {
96                         .iov_base = buffer,
97                         .iov_len  = nob
98                 };
99                 struct msghdr msg = {
100                         .msg_flags      = 0
101                 };
102                 struct sock *sk = sock->sk;
103
104                 /* Set receive timeout to remaining time */
105                 lock_sock(sk);
106                 sk->sk_rcvtimeo = jiffies_left;
107                 release_sock(sk);
108
109                 then = jiffies;
110                 rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
111                 jiffies_left -= jiffies - then;
112
113                 if (rc < 0)
114                         return rc;
115
116                 if (rc == 0)
117                         return -ECONNRESET;
118
119                 buffer = ((char *)buffer) + rc;
120                 nob -= rc;
121
122                 if (nob == 0)
123                         return 0;
124
125                 if (jiffies_left <= 0)
126                         return -ETIMEDOUT;
127         }
128 }
129 EXPORT_SYMBOL(lnet_sock_read);
130
131 int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
132 {
133         struct net_device *dev;
134         struct in_device *in_dev;
135         int err;
136         DECLARE_CONST_IN_IFADDR(ifa);
137
138         rcu_read_lock();
139         dev = dev_get_by_index_rcu(ns, interface);
140         err = -EINVAL;
141         if (!dev || !(dev->flags & IFF_UP))
142                 goto out;
143         in_dev = __in_dev_get_rcu(dev);
144         if (!in_dev)
145                 goto out;
146         err = -ENOENT;
147         in_dev_for_each_ifa_rcu(ifa, in_dev) {
148                 if (err ||
149                     ((dst_ipaddr ^ ntohl(ifa->ifa_local))
150                      & ntohl(ifa->ifa_mask)) == 0) {
151                         /* This address at least as good as what we
152                          * already have
153                          */
154                         *ret = ntohl(ifa->ifa_local);
155                         err = 0;
156                 }
157         }
158         endfor_ifa(in_dev);
159 out:
160         rcu_read_unlock();
161         return err;
162 }
163 EXPORT_SYMBOL(choose_ipv4_src);
164
165 static struct socket *
166 lnet_sock_create(int interface, struct sockaddr *remaddr,
167                  int local_port, struct net *ns)
168 {
169         struct socket *sock;
170         int rc;
171         int family;
172
173         family = AF_INET6;
174         if (remaddr)
175                 family = remaddr->sa_family;
176 retry:
177 #ifdef HAVE_SOCK_CREATE_KERN_USE_NET
178         rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
179 #else
180         rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
181 #endif
182         if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
183                 family = AF_INET;
184                 goto retry;
185         }
186
187         if (rc) {
188                 CERROR("Can't create socket: %d\n", rc);
189                 return ERR_PTR(rc);
190         }
191
192         sock->sk->sk_reuseport = 1;
193
194         if (interface >= 0 || local_port != 0) {
195                 struct sockaddr_storage locaddr = {};
196
197                 switch (family) {
198                 case AF_INET: {
199                         struct sockaddr_in *sin = (void *)&locaddr;
200
201                         sin->sin_family = AF_INET;
202                         sin->sin_addr.s_addr = INADDR_ANY;
203                         if (interface >= 0 && remaddr) {
204                                 struct sockaddr_in *rem = (void *)remaddr;
205                                 __u32 ip;
206
207                                 rc = choose_ipv4_src(&ip,
208                                                      interface,
209                                                      ntohl(rem->sin_addr.s_addr),
210                                                      ns);
211                                 if (rc)
212                                         goto failed;
213                                 sin->sin_addr.s_addr = htonl(ip);
214                         }
215                         sin->sin_port = htons(local_port);
216                         break;
217                 }
218 #if IS_ENABLED(CONFIG_IPV6)
219                 case AF_INET6: {
220                         struct sockaddr_in6 *sin6 = (void *)&locaddr;
221                         int val = 0;
222
223                         sin6->sin6_family = AF_INET6;
224                         sin6->sin6_addr = in6addr_any;
225
226                         /* Make sure we get both IPv4 and IPv6 connections.
227                          * This is the default, but it can be overridden so we
228                          * force it back.
229                          */
230 #ifdef HAVE_KERNEL_SETSOCKOPT
231                         kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
232                                           (char *) &val, sizeof(val));
233 #elif defined(_LINUX_SOCKPTR_H)
234                         /* sockptr_t was introduced around
235                          * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
236                          * kernel address to be passed to ->setsockopt
237                          */
238                         if (ipv6_only_sock(sock->sk)) {
239                                 sockptr_t optval = KERNEL_SOCKPTR(&val);
240
241                                 sock->ops->setsockopt(sock,
242                                                       IPPROTO_IPV6, IPV6_V6ONLY,
243                                                       optval, sizeof(val));
244                         }
245 #else
246                         /* From v5.7-rc6-2614-g5a892ff2facb when
247                          * kernel_setsockopt() was removed until
248                          * sockptr_t (above) there is no clean way to
249                          * pass kernel address to setsockopt.  We could
250                          * use get_fs()/set_fs(), but in this particular
251                          * situation there is an easier way.  It depends
252                          * on the fact that at least for these few
253                          * kernels a NULL address to ipv6_setsockopt()
254                          * is treated like the address of a zero.
255                          */
256                         if (ipv6_only_sock(sock->sk) && !val) {
257                                 void *optval = NULL;
258
259                                 sock->ops->setsockopt(sock,
260                                                       IPPROTO_IPV6, IPV6_V6ONLY,
261                                                       optval, sizeof(val));
262                         }
263 #endif /* HAVE_KERNEL_SETSOCKOPT */
264
265                         if (interface >= 0 && remaddr) {
266                                 struct sockaddr_in6 *rem = (void *)remaddr;
267
268                                 ipv6_dev_get_saddr(ns,
269                                                    dev_get_by_index(ns,
270                                                                     interface),
271                                                    &rem->sin6_addr, 0,
272                                                    &sin6->sin6_addr);
273                         }
274                         sin6->sin6_port = htons(local_port);
275                         break;
276                 }
277 #endif /* IS_ENABLED(CONFIG_IPV6) */
278                 }
279                 rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
280                                  sizeof(locaddr));
281                 if (rc == -EADDRINUSE) {
282                         CDEBUG(D_NET, "Port %d already in use\n", local_port);
283                         goto failed;
284                 }
285                 if (rc != 0) {
286                         CERROR("Error trying to bind to %pISc/%d: rc = %d\n",
287                                &locaddr, local_port, rc);
288                         goto failed;
289                 }
290         }
291         return sock;
292
293 failed:
294         sock_release(sock);
295         return ERR_PTR(rc);
296 }
297
298 void
299 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
300 {
301         struct sock *sk = sock->sk;
302
303         if (txbufsize != 0) {
304                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
305                 sk->sk_sndbuf = txbufsize;
306                 sk->sk_write_space(sk);
307         }
308
309         if (rxbufsize != 0) {
310                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
311                 sk->sk_sndbuf = rxbufsize;
312         }
313 }
314 EXPORT_SYMBOL(lnet_sock_setbuf);
315
316 int
317 lnet_sock_getaddr(struct socket *sock, bool remote,
318                   struct sockaddr_storage *peer)
319 {
320         int rc;
321 #ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
322         int len = sizeof(*peer);
323 #endif
324
325         if (remote)
326                 rc = lnet_kernel_getpeername(sock,
327                                              (struct sockaddr *)peer, &len);
328         else
329                 rc = lnet_kernel_getsockname(sock,
330                                              (struct sockaddr *)peer, &len);
331         if (rc < 0) {
332                 CERROR("Error %d getting sock %s IP/port\n",
333                         rc, remote ? "peer" : "local");
334                 return rc;
335         }
336         if (peer->ss_family == AF_INET6) {
337                 struct sockaddr_in6 *in6 = (void *)peer;
338                 struct sockaddr_in *in = (void *)peer;
339                 short port = in6->sin6_port;
340
341                 if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
342                         /* Pretend it is a v4 socket */
343                         memset(in, 0, sizeof(*in));
344                         in->sin_family = AF_INET;
345                         in->sin_port = port;
346                         memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
347                 }
348         }
349         return 0;
350 }
351 EXPORT_SYMBOL(lnet_sock_getaddr);
352
353 void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
354 {
355         if (txbufsize != NULL)
356                 *txbufsize = sock->sk->sk_sndbuf;
357
358         if (rxbufsize != NULL)
359                 *rxbufsize = sock->sk->sk_rcvbuf;
360 }
361 EXPORT_SYMBOL(lnet_sock_getbuf);
362
363 struct socket *
364 lnet_sock_listen(int local_port, int backlog, struct net *ns)
365 {
366         struct socket *sock;
367         int rc;
368
369         sock = lnet_sock_create(-1, NULL, local_port, ns);
370         if (IS_ERR(sock)) {
371                 rc = PTR_ERR(sock);
372                 if (rc == -EADDRINUSE)
373                         CERROR("Can't create socket: port %d already in use\n",
374                                local_port);
375                 return ERR_PTR(rc);
376         }
377
378         rc = kernel_listen(sock, backlog);
379         if (rc == 0)
380                 return sock;
381
382         CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
383         sock_release(sock);
384         return ERR_PTR(rc);
385 }
386
387 struct socket *
388 lnet_sock_connect(int interface, int local_port,
389                   struct sockaddr *peeraddr,
390                   struct net *ns)
391 {
392         struct socket *sock;
393         int rc;
394
395         sock = lnet_sock_create(interface, peeraddr, local_port, ns);
396         if (IS_ERR(sock))
397                 return sock;
398
399         /* Avoid temporary address, they are bad for long-lived
400          * connections such as lustre mounts.
401          * RFC4941, section 3.6 suggests that:
402          *    Individual applications, which have specific
403          *    knowledge about the normal duration of connections,
404          *    MAY override this as appropriate.
405          */
406         if (peeraddr->sa_family == PF_INET6)
407                 ip6_sock_set_addr_preferences(sock->sk,
408                                               IPV6_PREFER_SRC_PUBLIC);
409
410         rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
411         if (rc == 0)
412                 return sock;
413
414         /* EADDRNOTAVAIL probably means we're already connected to the same
415          * peer/port on the same local port on a differently typed
416          * connection.  Let our caller retry with a different local
417          * port... */
418
419         CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
420                      "Error %d connecting %d -> %pIScp\n", rc,
421                      local_port, peeraddr);
422
423         sock_release(sock);
424         return ERR_PTR(rc);
425 }
426
427 static int lnet_inet4_enumerate(struct net_device *dev, int flags,
428                                 int *nalloc, int nip, int cpt,
429                                 struct lnet_inetdev **dev_list)
430 {
431         struct lnet_inetdev *ifaces = *dev_list;
432         struct in_device *in_dev;
433         DECLARE_CONST_IN_IFADDR(ifa);
434
435         in_dev = __in_dev_get_rtnl(dev);
436         if (!in_dev) {
437                 CWARN("lnet: Interface %s has no IPv4 status.\n",
438                       dev->name);
439                 return nip;
440         }
441
442         in_dev_for_each_ifa_rtnl(ifa, in_dev) {
443                 if (nip >= *nalloc) {
444                         struct lnet_inetdev *tmp;
445
446                         *nalloc += LNET_INTERFACES_NUM;
447                         tmp = krealloc(ifaces, *nalloc * sizeof(*tmp),
448                                        GFP_KERNEL);
449                         if (!tmp) {
450                                 kfree(ifaces);
451                                 ifaces = NULL;
452                                 return -ENOMEM;
453                         }
454                         ifaces = tmp;
455                 }
456
457                 ifaces[nip].li_cpt = cpt;
458                 ifaces[nip].li_iff_master = !!(flags & IFF_MASTER);
459                 ifaces[nip].li_size = sizeof(ifa->ifa_local);
460                 ifaces[nip].li_index = dev->ifindex;
461                 ifaces[nip].li_ipaddr = ifa->ifa_local;
462                 ifaces[nip].li_netmask = ntohl(ifa->ifa_mask);
463                 strscpy(ifaces[nip].li_name, ifa->ifa_label,
464                        sizeof(ifaces[nip].li_name));
465                 nip++;
466         }
467         endfor_ifa(in_dev);
468
469         *dev_list = ifaces;
470
471         return nip;
472 }
473
474 static int lnet_inet6_enumerate(struct net_device *dev, int flags,
475                                 int *nalloc, int nip, int cpt,
476                                 struct lnet_inetdev **dev_list)
477 {
478 #if IS_ENABLED(CONFIG_IPV6)
479         struct lnet_inetdev *ifaces = *dev_list;
480         const struct inet6_ifaddr *ifa6;
481         struct inet6_dev *in6_dev;
482
483         in6_dev = __in6_dev_get(dev);
484         if (!in6_dev) {
485                 CWARN("lnet: Interface %s has no IPv6 status.\n",
486                       dev->name);
487                 return nip;
488         }
489
490         list_for_each_entry_rcu(ifa6, &in6_dev->addr_list, if_list) {
491                 if (ifa6->flags & IFA_F_TEMPORARY)
492                         continue;
493
494                 if (ipv6_addr_type(&ifa6->addr) & IPV6_ADDR_LINKLOCAL)
495                         continue;
496
497                 if (nip >= *nalloc) {
498                         struct lnet_inetdev *tmp;
499
500                         *nalloc += LNET_INTERFACES_NUM;
501                         tmp = krealloc(ifaces, *nalloc * sizeof(*tmp),
502                                        GFP_KERNEL);
503                         if (!tmp) {
504                                 kfree(ifaces);
505                                 ifaces = NULL;
506                                 return -ENOMEM;
507                         }
508                         ifaces = tmp;
509                 }
510
511                 ifaces[nip].li_cpt = cpt;
512                 ifaces[nip].li_iff_master = !!(flags & IFF_MASTER);
513                 ifaces[nip].li_size = sizeof(struct in6_addr);
514                 ifaces[nip].li_index = dev->ifindex;
515                 memcpy(ifaces[nip].li_ipv6addr,
516                        &ifa6->addr, sizeof(struct in6_addr));
517                 strscpy(ifaces[nip].li_name, dev->name,
518                         sizeof(ifaces[nip].li_name));
519                 nip++;
520                 /* As different IPv6 addresses don't have unique
521                  * labels, it is safest just to use the first
522                  * and ignore the rest.
523                  */
524                 break;
525         }
526
527         *dev_list = ifaces;
528 #endif /* IS_ENABLED(CONFIG_IPV6) */
529         return nip;
530 }
531
532 int lnet_inet_enumerate(struct lnet_inetdev **dev_list, struct net *ns,
533                         bool v6_first)
534 {
535         struct lnet_inetdev *ifaces = NULL;
536         struct net_device *dev;
537         int nalloc = 0;
538         int nip = 0;
539
540         rtnl_lock();
541         for_each_netdev(ns, dev) {
542                 int flags = dev_get_flags(dev);
543                 int node_id, cpt;
544                 int count;
545
546                 if (flags & IFF_LOOPBACK) /* skip the loopback IF */
547                         continue;
548
549                 if (!(flags & IFF_UP)) {
550                         CWARN("lnet: Ignoring interface %s: it's down\n",
551                               dev->name);
552                         continue;
553                 }
554
555                 node_id = dev_to_node(&dev->dev);
556                 cpt = cfs_cpt_of_node(lnet_cpt_table(), node_id);
557
558                 if (v6_first) {
559                         count = lnet_inet6_enumerate(dev, flags, &nalloc, nip,
560                                                      cpt, &ifaces);
561                         if (count < 0)
562                                 CWARN("lnet: No IPv6 addresses for interface %s.\n",
563                                       dev->name);
564                         else
565                                 nip = count;
566
567                         count = lnet_inet4_enumerate(dev, flags, &nalloc, nip,
568                                                      cpt, &ifaces);
569                         if (count < 0)
570                                 CWARN("lnet: No IPv4 addresses for interface %s.\n",
571                                       dev->name);
572                         else
573                                 nip = count;
574                 } else {
575                         count = lnet_inet4_enumerate(dev, flags, &nalloc, nip,
576                                                      cpt, &ifaces);
577                         if (count < 0)
578                                 CWARN("lnet: No IPv4 addresses for interface %s.\n",
579                                       dev->name);
580                         else
581                                 nip = count;
582
583                         count = lnet_inet6_enumerate(dev, flags, &nalloc, nip,
584                                                      cpt, &ifaces);
585                         if (count < 0)
586                                 CWARN("lnet: No IPv6 addresses for interface %s.\n",
587                                       dev->name);
588                         else
589                                 nip = count;
590                 }
591         }
592         rtnl_unlock();
593
594         if (nip == 0) {
595                 CERROR("lnet: Can't find any usable interfaces, rc = -ENOENT\n");
596                 nip = -ENOENT;
597         }
598
599         *dev_list = ifaces;
600         return nip;
601 }
602 EXPORT_SYMBOL(lnet_inet_enumerate);