Whamcloud - gitweb
LU-14945 lnet: don't use hops to determine the route state
[fs/lustre-release.git] / lnet / lnet / lib-socket.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2015, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  */
31 #define DEBUG_SUBSYSTEM S_LNET
32
33 #include <linux/if.h>
34 #include <linux/in.h>
35 #include <linux/net.h>
36 #include <net/addrconf.h>
37 #include <net/ipv6.h>
38 #include <linux/file.h>
39 #include <linux/pagemap.h>
40 /* For sys_open & sys_close */
41 #include <linux/syscalls.h>
42 #include <net/sock.h>
43 #include <linux/inetdevice.h>
44
45 #include <libcfs/linux/linux-time.h>
46 #include <libcfs/linux/linux-net.h>
47 #include <libcfs/libcfs.h>
48 #include <lnet/lib-lnet.h>
49
50 int
51 lnet_sock_write(struct socket *sock, void *buffer, int nob, int timeout)
52 {
53         int rc;
54         long jiffies_left = cfs_time_seconds(timeout);
55         unsigned long then;
56
57         LASSERT(nob > 0);
58         /* Caller may pass a zero timeout if she thinks the socket buffer is
59          * empty enough to take the whole message immediately */
60
61         for (;;) {
62                 struct kvec  iov = {
63                         .iov_base = buffer,
64                         .iov_len  = nob
65                 };
66                 struct msghdr msg = {
67                         .msg_flags      = (timeout == 0) ? MSG_DONTWAIT : 0
68                 };
69
70                 if (timeout != 0) {
71                         struct sock *sk = sock->sk;
72
73                         /* Set send timeout to remaining time */
74                         lock_sock(sk);
75                         sk->sk_sndtimeo = jiffies_left;
76                         release_sock(sk);
77                 }
78
79                 then = jiffies;
80                 rc = kernel_sendmsg(sock, &msg, &iov, 1, nob);
81                 jiffies_left -= jiffies - then;
82
83                 if (rc == nob)
84                         return 0;
85
86                 if (rc < 0)
87                         return rc;
88
89                 if (rc == 0) {
90                         CERROR("Unexpected zero rc\n");
91                         return -ECONNABORTED;
92                 }
93
94                 if (jiffies_left <= 0)
95                         return -EAGAIN;
96
97                 buffer = ((char *)buffer) + rc;
98                 nob -= rc;
99         }
100         return 0;
101 }
102 EXPORT_SYMBOL(lnet_sock_write);
103
104 int
105 lnet_sock_read(struct socket *sock, void *buffer, int nob, int timeout)
106 {
107         int rc;
108         long jiffies_left = cfs_time_seconds(timeout);
109         unsigned long then;
110
111         LASSERT(nob > 0);
112         LASSERT(jiffies_left > 0);
113
114         for (;;) {
115                 struct kvec  iov = {
116                         .iov_base = buffer,
117                         .iov_len  = nob
118                 };
119                 struct msghdr msg = {
120                         .msg_flags      = 0
121                 };
122                 struct sock *sk = sock->sk;
123
124                 /* Set receive timeout to remaining time */
125                 lock_sock(sk);
126                 sk->sk_rcvtimeo = jiffies_left;
127                 release_sock(sk);
128
129                 then = jiffies;
130                 rc = kernel_recvmsg(sock, &msg, &iov, 1, nob, 0);
131                 jiffies_left -= jiffies - then;
132
133                 if (rc < 0)
134                         return rc;
135
136                 if (rc == 0)
137                         return -ECONNRESET;
138
139                 buffer = ((char *)buffer) + rc;
140                 nob -= rc;
141
142                 if (nob == 0)
143                         return 0;
144
145                 if (jiffies_left <= 0)
146                         return -ETIMEDOUT;
147         }
148 }
149 EXPORT_SYMBOL(lnet_sock_read);
150
151 int choose_ipv4_src(__u32 *ret, int interface, __u32 dst_ipaddr, struct net *ns)
152 {
153         struct net_device *dev;
154         struct in_device *in_dev;
155         int err;
156         DECLARE_CONST_IN_IFADDR(ifa);
157
158         rcu_read_lock();
159         dev = dev_get_by_index_rcu(ns, interface);
160         err = -EINVAL;
161         if (!dev || !(dev->flags & IFF_UP))
162                 goto out;
163         in_dev = __in_dev_get_rcu(dev);
164         if (!in_dev)
165                 goto out;
166         err = -ENOENT;
167         in_dev_for_each_ifa_rcu(ifa, in_dev) {
168                 if (err ||
169                     ((dst_ipaddr ^ ntohl(ifa->ifa_local))
170                      & ntohl(ifa->ifa_mask)) == 0) {
171                         /* This address at least as good as what we
172                          * already have
173                          */
174                         *ret = ntohl(ifa->ifa_local);
175                         err = 0;
176                 }
177         }
178         endfor_ifa(in_dev);
179 out:
180         rcu_read_unlock();
181         return err;
182 }
183 EXPORT_SYMBOL(choose_ipv4_src);
184
185 static struct socket *
186 lnet_sock_create(int interface, struct sockaddr *remaddr,
187                  int local_port, struct net *ns)
188 {
189         struct socket *sock;
190         int rc;
191         int family;
192
193         family = AF_INET6;
194         if (remaddr)
195                 family = remaddr->sa_family;
196 retry:
197 #ifdef HAVE_SOCK_CREATE_KERN_USE_NET
198         rc = sock_create_kern(ns, family, SOCK_STREAM, 0, &sock);
199 #else
200         rc = sock_create_kern(family, SOCK_STREAM, 0, &sock);
201 #endif
202         if (rc == -EAFNOSUPPORT && family == AF_INET6 && !remaddr) {
203                 family = AF_INET;
204                 goto retry;
205         }
206
207         if (rc) {
208                 CERROR("Can't create socket: %d\n", rc);
209                 return ERR_PTR(rc);
210         }
211
212         sock->sk->sk_reuseport = 1;
213
214         if (interface >= 0 || local_port != 0) {
215                 struct sockaddr_storage locaddr = {};
216
217                 switch (family) {
218                 case AF_INET: {
219                         struct sockaddr_in *sin = (void *)&locaddr;
220
221                         sin->sin_family = AF_INET;
222                         sin->sin_addr.s_addr = INADDR_ANY;
223                         if (interface >= 0 && remaddr) {
224                                 struct sockaddr_in *rem = (void *)remaddr;
225                                 __u32 ip;
226
227                                 rc = choose_ipv4_src(&ip,
228                                                      interface,
229                                                      ntohl(rem->sin_addr.s_addr),
230                                                      ns);
231                                 if (rc)
232                                         goto failed;
233                                 sin->sin_addr.s_addr = htonl(ip);
234                         }
235                         sin->sin_port = htons(local_port);
236                         break;
237                 }
238 #if IS_ENABLED(CONFIG_IPV6)
239                 case AF_INET6: {
240                         struct sockaddr_in6 *sin6 = (void *)&locaddr;
241                         int val = 0;
242
243                         sin6->sin6_family = AF_INET6;
244                         sin6->sin6_addr = in6addr_any;
245
246                         /* Make sure we get both IPv4 and IPv6 connections.
247                          * This is the default, but it can be overridden so we
248                          * force it back.
249                          */
250 #ifdef HAVE_KERNEL_SETSOCKOPT
251                         kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
252                                           (char *) &val, sizeof(val));
253 #elif defined(_LINUX_SOCKPTR_H)
254                         /* sockptr_t was introduced around
255                          * v5.8-rc4-1952-ga7b75c5a8c41 and allows a
256                          * kernel address to be passed to ->setsockopt
257                          */
258                         if (ipv6_only_sock(sock->sk)) {
259                                 sockptr_t optval = KERNEL_SOCKPTR(&val);
260
261                                 sock->ops->setsockopt(sock,
262                                                       IPPROTO_IPV6, IPV6_V6ONLY,
263                                                       optval, sizeof(val));
264                         }
265 #else
266                         /* From v5.7-rc6-2614-g5a892ff2facb when
267                          * kernel_setsockopt() was removed until
268                          * sockptr_t (above) there is no clean way to
269                          * pass kernel address to setsockopt.  We could
270                          * use get_fs()/set_fs(), but in this particular
271                          * situation there is an easier way.  It depends
272                          * on the fact that at least for these few
273                          * kernels a NULL address to ipv6_setsockopt()
274                          * is treated like the address of a zero.
275                          */
276                         if (ipv6_only_sock(sock->sk) && !val) {
277                                 void *optval = NULL;
278
279                                 sock->ops->setsockopt(sock,
280                                                       IPPROTO_IPV6, IPV6_V6ONLY,
281                                                       optval, sizeof(val));
282                         }
283 #endif /* HAVE_KERNEL_SETSOCKOPT */
284
285                         if (interface >= 0 && remaddr) {
286                                 struct sockaddr_in6 *rem = (void *)remaddr;
287
288                                 ipv6_dev_get_saddr(ns,
289                                                    dev_get_by_index(ns,
290                                                                     interface),
291                                                    &rem->sin6_addr, 0,
292                                                    &sin6->sin6_addr);
293                         }
294                         sin6->sin6_port = htons(local_port);
295                         break;
296                 }
297 #endif /* IS_ENABLED(CONFIG_IPV6) */
298                 }
299                 rc = kernel_bind(sock, (struct sockaddr *)&locaddr,
300                                  sizeof(locaddr));
301                 if (rc == -EADDRINUSE) {
302                         CDEBUG(D_NET, "Port %d already in use\n", local_port);
303                         goto failed;
304                 }
305                 if (rc != 0) {
306                         CERROR("Error trying to bind to port %d: %d\n",
307                                local_port, rc);
308                         goto failed;
309                 }
310         }
311         return sock;
312
313 failed:
314         sock_release(sock);
315         return ERR_PTR(rc);
316 }
317
318 void
319 lnet_sock_setbuf(struct socket *sock, int txbufsize, int rxbufsize)
320 {
321         struct sock *sk = sock->sk;
322
323         if (txbufsize != 0) {
324                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
325                 sk->sk_sndbuf = txbufsize;
326                 sk->sk_write_space(sk);
327         }
328
329         if (rxbufsize != 0) {
330                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
331                 sk->sk_sndbuf = rxbufsize;
332         }
333 }
334 EXPORT_SYMBOL(lnet_sock_setbuf);
335
336 int
337 lnet_sock_getaddr(struct socket *sock, bool remote,
338                   struct sockaddr_storage *peer)
339 {
340         int rc;
341 #ifndef HAVE_KERN_SOCK_GETNAME_2ARGS
342         int len = sizeof(*peer);
343 #endif
344
345         if (remote)
346                 rc = lnet_kernel_getpeername(sock,
347                                              (struct sockaddr *)peer, &len);
348         else
349                 rc = lnet_kernel_getsockname(sock,
350                                              (struct sockaddr *)peer, &len);
351         if (rc < 0) {
352                 CERROR("Error %d getting sock %s IP/port\n",
353                         rc, remote ? "peer" : "local");
354                 return rc;
355         }
356         if (peer->ss_family == AF_INET6) {
357                 struct sockaddr_in6 *in6 = (void *)peer;
358                 struct sockaddr_in *in = (void *)peer;
359                 short port = in6->sin6_port;
360
361                 if (ipv6_addr_v4mapped(&in6->sin6_addr)) {
362                         /* Pretend it is a v4 socket */
363                         memset(in, 0, sizeof(*in));
364                         in->sin_family = AF_INET;
365                         in->sin_port = port;
366                         memcpy(&in->sin_addr, &in6->sin6_addr.s6_addr32[3], 4);
367                 }
368         }
369         return 0;
370 }
371 EXPORT_SYMBOL(lnet_sock_getaddr);
372
373 void lnet_sock_getbuf(struct socket *sock, int *txbufsize, int *rxbufsize)
374 {
375         if (txbufsize != NULL)
376                 *txbufsize = sock->sk->sk_sndbuf;
377
378         if (rxbufsize != NULL)
379                 *rxbufsize = sock->sk->sk_rcvbuf;
380 }
381 EXPORT_SYMBOL(lnet_sock_getbuf);
382
383 struct socket *
384 lnet_sock_listen(int local_port, int backlog, struct net *ns)
385 {
386         struct socket *sock;
387         int rc;
388
389         sock = lnet_sock_create(-1, NULL, local_port, ns);
390         if (IS_ERR(sock)) {
391                 rc = PTR_ERR(sock);
392                 if (rc == -EADDRINUSE)
393                         CERROR("Can't create socket: port %d already in use\n",
394                                local_port);
395                 return ERR_PTR(rc);
396         }
397
398         rc = kernel_listen(sock, backlog);
399         if (rc == 0)
400                 return sock;
401
402         CERROR("Can't set listen backlog %d: %d\n", backlog, rc);
403         sock_release(sock);
404         return ERR_PTR(rc);
405 }
406
407 struct socket *
408 lnet_sock_connect(int interface, int local_port,
409                   struct sockaddr *peeraddr,
410                   struct net *ns)
411 {
412         struct socket *sock;
413         int rc;
414
415         sock = lnet_sock_create(interface, peeraddr, local_port, ns);
416         if (IS_ERR(sock))
417                 return sock;
418
419         rc = kernel_connect(sock, peeraddr, sizeof(struct sockaddr_in6), 0);
420         if (rc == 0)
421                 return sock;
422
423         /* EADDRNOTAVAIL probably means we're already connected to the same
424          * peer/port on the same local port on a differently typed
425          * connection.  Let our caller retry with a different local
426          * port... */
427
428         CDEBUG_LIMIT(rc == -EADDRNOTAVAIL ? D_NET : D_NETERROR,
429                      "Error %d connecting %d -> %pISp\n", rc,
430                      local_port, peeraddr);
431
432         sock_release(sock);
433         return ERR_PTR(rc);
434 }