Whamcloud - gitweb
New RC 2.16.0-RC5
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd_modparams.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 /* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
4  *
5  * Copyright (c) 2011, 2012, Intel Corporation.
6  */
7
8 /* This file is part of Lustre, http://www.lustre.org/
9  *
10  * Author: Eric Barton <eric@bartonsoftware.com>
11  */
12
13 #include "socklnd.h"
14
15 #include <linux/kvm_host.h>
16 #if defined(__x86_64__) || defined(__i386__)
17 #include <asm/hypervisor.h>
18 #endif
19 #ifdef HAVE_ETHTOOL_LINK_SETTINGS
20 #include <linux/inetdevice.h>
21 #include <linux/ethtool.h>
22 #include <net/addrconf.h>
23 #endif
24
/* Tunables API version; written into lnd_version by ksocknal_tunables_init()
 * and ksocknal_tunables_setup().
 */
#define CURRENT_LND_VERSION 1

/* Dead socket timeout in seconds.  No initializer, so it starts at 0.
 * Mode 0644: owner-writable after module load.
 */
static int sock_timeout;
module_param(sock_timeout, int, 0644);
MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");

/* Fallback for lct_max_tx_credits when the net tunable is left at -1
 * (see ksocknal_tunables_setup()).  Mode 0444: read-only.
 */
static int credits = DEFAULT_CREDITS;
module_param(credits, int, 0444);
MODULE_PARM_DESC(credits, "# concurrent sends");

/* Fallback for lct_peer_tx_credits when the net tunable is left at -1;
 * ksocknal_tunables_setup() limits the result to lct_max_tx_credits.
 */
static int peer_credits = DEFAULT_PEER_CREDITS;
module_param(peer_credits, int, 0444);
MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");

/* Fallback for lct_peer_rtr_credits when the net tunable is left at -1.
 * No initializer, so it starts at 0.
 */
static int peer_buffer_credits;
module_param(peer_buffer_credits, int, 0444);
MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");

/* Fallback for lct_peer_timeout when the net tunable is left at -1. */
static int peer_timeout = DEFAULT_PEER_TIMEOUT;
module_param(peer_timeout, int, 0444);
MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
46
47 /* Number of daemons in each thread pool which is percpt,
48  * we will estimate reasonable value based on CPUs if it's not set. */
49 static unsigned int nscheds;
50 module_param(nscheds, int, 0444);
51 MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
52
/* Number of connection daemons started initially.  Mode 0444: read-only. */
static int nconnds = 4;
module_param(nconnds, int, 0444);
MODULE_PARM_DESC(nconnds, "# connection daemons while starting");

/* Upper bound on the number of connection daemons.  Mode 0444: read-only. */
static int nconnds_max = 64;
module_param(nconnds_max, int, 0444);
MODULE_PARM_DESC(nconnds_max, "max # connection daemons");

/* Shortest interval between connection retries, in milliseconds. */
static int min_reconnectms = 1000;
module_param(min_reconnectms, int, 0644);
MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");

/* Longest interval between connection retries, in milliseconds. */
static int max_reconnectms = 60000;
module_param(max_reconnectms, int, 0644);
MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");

/* Non-zero requests eager TCP acks.  No initializer, so it starts at 0. */
static int eager_ack;
module_param(eager_ack, int, 0644);
MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");

/* Non-zero (the default) uses separate sockets for bulk traffic. */
static int typed_conns = 1;
module_param(typed_conns, int, 0444);
MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");

/* Threshold from which a message counts as "large"; defaults to 1024.
 * NOTE(review): the unit is presumably bytes - confirm against the users
 * of ksnd_min_bulk.
 */
static int min_bulk = (1<<10);
module_param(min_bulk, int, 0644);
MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
80
/* Shared default for both socket buffer sizes; 0 selects the system
 * default, as stated in the parameter descriptions below.
 */
# define DEFAULT_BUFFER_SIZE 0
static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
module_param(tx_buffer_size, int, 0644);
MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");

static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
module_param(rx_buffer_size, int, 0644);
MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");

/* Nagle is off by default; non-zero enables it. */
static int nagle = 0;
module_param(nagle, int, 0644);
MODULE_PARM_DESC(nagle, "enable NAGLE?");

/* Round robin over multiple interfaces; on by default. */
static int round_robin = 1;
module_param(round_robin, int, 0644);
MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");

/* Seconds before a keepalive is sent. */
static int keepalive = 30;
module_param(keepalive, int, 0644);
MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");

/* Idle seconds before the first keepalive probe. */
static int keepalive_idle = 30;
module_param(keepalive_idle, int, 0644);
MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");

/* Number of missed probes after which the connection is considered dead. */
#define DEFAULT_KEEPALIVE_COUNT  5
static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
module_param(keepalive_count, int, 0644);
MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");

/* Seconds between consecutive keepalive probes. */
static int keepalive_intvl = 5;
module_param(keepalive_intvl, int, 0644);
MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");

/* Checksumming is off by default; non-zero enables it. */
static int enable_csum = 0;
module_param(enable_csum, int, 0644);
MODULE_PARM_DESC(enable_csum, "enable check sum");

/* Fault injection knob: non-zero injects a checksum error. */
static int inject_csum_error = 0;
module_param(inject_csum_error, int, 0644);
MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");

/* Still accepted, but ksocknal_tunables_init() only logs a warning that
 * irq_affinity has been removed from socklnd when this is non-zero.
 */
static int enable_irq_affinity = 0;
module_param(enable_irq_affinity, int, 0644);
MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");

/* On by default: always send ZC-ACK on a non-blocking connection. */
static int nonblk_zcack = 1;
module_param(nonblk_zcack, int, 0644);
MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
130
131 static unsigned int zc_min_payload = (16 << 10);
132 module_param(zc_min_payload, int, 0644);
133 MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
134
135 static unsigned int zc_recv = 0;
136 module_param(zc_recv, int, 0644);
137 MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
138
139 static unsigned int zc_recv_min_nfrags = 16;
140 module_param(zc_recv_min_nfrags, int, 0644);
141 MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
142
/* Default number of connections per peer, copied into
 * ksock_default_tunables by ksocknal_tunables_init(), which also warns
 * when the value exceeds (1 << SOCKNAL_CONN_COUNT_MAX_BITS) - 1.
 * When a NI ends up with lnd_conns_per_peer == 0,
 * ksocknal_tunables_setup() derives a value from the interface speed.
 */
static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
module_param(conns_per_peer, uint, 0644);
MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");

/* By default skip_mr_route_setup is 0 (do not skip) */
static unsigned int skip_mr_route_setup;
module_param(skip_mr_route_setup, uint, 0444);
MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");

/* TCP backoff parameters (seconds); only built when SOCKNAL_BACKOFF is
 * defined.
 */
#ifdef SOCKNAL_BACKOFF
static int backoff_init = 3;
module_param(backoff_init, int, 0644);
MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff");

static int backoff_max = 3;
module_param(backoff_max, int, 0644);
MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff");
#endif

/* Protocol version override; only built when SOCKNAL_VERSION_DEBUG is
 * non-zero.
 */
#if SOCKNAL_VERSION_DEBUG
static int protocol = 3;
module_param(protocol, int, 0644);
MODULE_PARM_DESC(protocol, "protocol version");
#endif
167
/* Type of service; -1 (the default) disables it.  Writes go through
 * param_set_tos(), which only accepts values in the range [-1, 0xff].
 */
static int tos = -1;
/* Forward declaration: the setter is defined further down in this file. */
static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp);
#ifdef HAVE_KERNEL_PARAM_OPS
/* Custom parameter type "tos": validated set, plain integer get. */
static const struct kernel_param_ops param_ops_tos = {
        .set = param_set_tos,
        .get = param_get_int,
};

/* module_param(tos, tos, ...) looks up param_ops_tos and param_check_tos
 * by pasting the type name, so the type check is mapped onto int here.
 */
#define param_check_tos(name, p) \
        __param_check(name, p, int)
module_param(tos, tos, 0444);
#else
/* Kernels without kernel_param_ops: register the set/get pair directly. */
module_param_call(tos, param_set_tos, param_get_int, &tos, 0444);
#endif
MODULE_PARM_DESC(tos, "Set the type of service (=-1 to disable)");
183
184 static inline bool is_native_host(void)
185 {
186 #ifdef HAVE_HYPERVISOR_IS_TYPE
187         return hypervisor_is_type(X86_HYPER_NATIVE);
188 #elif defined(__x86_64__) || defined(__i386__)
189         return x86_hyper == NULL;
190 #else
191         return true;
192 #endif
193 }
194
/* Table of pointers to the module parameters above; filled in by
 * ksocknal_tunables_init().
 */
struct ksock_tunables ksocknal_tunables;
/* Default per-NI socklnd tunables; copied into a NI by
 * ksocknal_tunables_setup() when the NI has no tunables of its own.
 */
struct lnet_ioctl_config_socklnd_tunables ksock_default_tunables;
197
198 static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp)
199 {
200         int rc, t;
201
202         if (!val)
203                 return -EINVAL;
204
205         rc = kstrtoint(val, 0, &t);
206         if (rc)
207                 return rc;
208
209         if (t < -1 || t > 0xff)
210                 return -ERANGE;
211
212         *((int *)kp->arg) = t;
213
214         return 0;
215 }
216
217 #ifdef HAVE_ETHTOOL_LINK_SETTINGS
/* Look up the link speed of the network device backing @ni.
 *
 * Walks the devices of ni->ni_net_ns under the RTNL lock, skipping
 * loopback and down interfaces, until one matches ni->ni_interface, then
 * asks ethtool for its link settings.
 *
 * Returns:
 *   0    when @ni has no net namespace or no interface assigned,
 *   -1   when no matching device was found,
 *   the value of cmd.base.speed when the link settings could be read
 *        (NOTE(review): presumably Mbps, as ksocklnd_speed2cpp() treats
 *        it that way - confirm), or
 *   the error returned by __ethtool_get_link_ksettings() otherwise.
 */
static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni)
{
        struct net_device *dev;
        int intf_idx = -1;
        int ret = -1;

        DECLARE_CONST_IN_IFADDR(ifa);

        /* check if ni has interface assigned */
        if (!ni->ni_net_ns || !ni->ni_interface)
                return 0;

        rtnl_lock();
        for_each_netdev(ni->ni_net_ns, dev) {
                int flags = dev_get_flags(dev);
                struct in_device *in_dev;

                if (flags & IFF_LOOPBACK) /* skip the loopback IF */
                        continue;

                if (!(flags & IFF_UP))
                        continue;

                in_dev = __in_dev_get_rtnl(dev);
                if (in_dev) {
                        /* IPv4: match on the address label.  There is no
                         * early exit, so every address of the device is
                         * visited.
                         */
                        in_dev_for_each_ifa_rtnl(ifa, in_dev) {
                                if (strcmp(ifa->ifa_label, ni->ni_interface) == 0)
                                        intf_idx = dev->ifindex;
                        }
                        endfor_ifa(in_dev);
                } else {
                        /* Only reached for devices without IPv4 state.
                         * Without CONFIG_IPV6 this branch is just the
                         * "continue" below.
                         */
#if IS_ENABLED(CONFIG_IPV6)
                        struct inet6_dev *in6_dev = __in6_dev_get(dev);

                        if (in6_dev) {
                                const struct inet6_ifaddr *ifa6;

                                list_for_each_entry_rcu(ifa6,
                                                        &in6_dev->addr_list,
                                                        if_list) {
                                        if (ifa6->flags & IFA_F_TEMPORARY)
                                                continue;

                                        /* As different IPv6 addresses don't
                                         * have unique labels, it is safest
                                         * just to use the first and ignore
                                         * the rest.
                                         */
                                        if (strcmp(dev->name,
                                                   ni->ni_interface) == 0) {
                                                intf_idx = dev->ifindex;
                                                break;
                                        }
                                }
                        } else {
#endif
                                continue;
#if IS_ENABLED(CONFIG_IPV6)
                        }
#endif
                }

                /* Stop at the first matching device; "dev" stays valid
                 * for the ethtool query below.
                 */
                if (intf_idx >= 0)
                        break;
        }
        if (intf_idx >= 0) {
                struct ethtool_link_ksettings cmd;
                int ethtool_ret;

                /* Some devices may not be providing link settings */
                ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd);
                if (!ethtool_ret)
                        ret = cmd.base.speed;
                else
                        ret = ethtool_ret;
        }
        rtnl_unlock();

        return ret;
}
298
/* Pick a heuristically optimal conns_per_peer value for an ethernet
 * interface speed given in Mbps.
 *
 * Speeds below 1Gbps are treated as 1Gbps so that ilog2() is never
 * called with 0.
 */
static int ksocklnd_speed2cpp(int speed)
{
        int gbps = (speed < 1000) ? 1 : speed / 1000;

        return 1 + ilog2(gbps) / 2;
}
310 #endif
311
/* Choose the number of connections per peer for @ni.
 *
 * With ethtool link settings support, the value is derived from the
 * interface speed whenever a positive speed was obtained.  In every
 * other case the result is 1.
 */
static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni)
{
#ifdef HAVE_ETHTOOL_LINK_SETTINGS
        int link_speed = ksocklnd_ni_get_eth_intf_speed(ni);

        if (ni->ni_interface)
                CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface,
                       link_speed);

        if (link_speed > 0)
                return ksocklnd_speed2cpp(link_speed);
#endif
        return 1;
}
326
327 int ksocknal_tunables_init(void)
328 {
329         ksock_default_tunables.lnd_version = CURRENT_LND_VERSION;
330         ksock_default_tunables.lnd_conns_per_peer = conns_per_peer;
331         ksock_default_tunables.lnd_tos = tos;
332
333         /* initialize ksocknal_tunables structure */
334         ksocknal_tunables.ksnd_timeout            = &sock_timeout;
335         ksocknal_tunables.ksnd_nscheds            = &nscheds;
336         ksocknal_tunables.ksnd_nconnds            = &nconnds;
337         ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
338         ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
339         ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
340         ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
341         ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
342         ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
343         ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
344         ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
345         ksocknal_tunables.ksnd_nagle              = &nagle;
346         ksocknal_tunables.ksnd_round_robin        = &round_robin;
347         ksocknal_tunables.ksnd_keepalive          = &keepalive;
348         ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
349         ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
350         ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
351         ksocknal_tunables.ksnd_credits            = &credits;
352         ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
353         ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
354         ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
355         ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
356         ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
357         ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
358         ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
359         ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
360         ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
361         if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
362                 CWARN("socklnd conns_per_peer is capped at %u.\n",
363                       (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
364         }
365         ksocknal_tunables.ksnd_conns_per_peer     = &conns_per_peer;
366
367         if (enable_irq_affinity) {
368                 CWARN("irq_affinity is removed from socklnd because modern "
369                       "computer always has fast CPUs and more cores than "
370                       "# NICs, although you still can set irq_affinity by "
371                       "another way, please check manual for details.\n");
372         }
373         ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
374
375 #ifdef SOCKNAL_BACKOFF
376         ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
377         ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
378 #endif
379
380 #if SOCKNAL_VERSION_DEBUG
381         ksocknal_tunables.ksnd_protocol           = &protocol;
382 #endif
383
384         if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
385                 *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
386
387         /* When on a hypervisor set the minimum zero copy size
388          * above the maximum payload size
389          */
390         if (!is_native_host())
391                 *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
392
393         return 0;
394 }
395
396 void ksocknal_tunables_setup(struct lnet_ni *ni)
397 {
398         struct lnet_ioctl_config_socklnd_tunables *tunables;
399         struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
400
401         /* If no tunables specified, setup default tunables */
402         if (!ni->ni_lnd_tunables_set)
403                 memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock,
404                        &ksock_default_tunables, sizeof(*tunables));
405
406         tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock;
407
408         /* Current API version */
409         tunables->lnd_version = CURRENT_LND_VERSION;
410
411         net_tunables = &ni->ni_net->net_tunables;
412
413         if (net_tunables->lct_peer_timeout == -1)
414                 net_tunables->lct_peer_timeout =
415                         *ksocknal_tunables.ksnd_peertimeout;
416
417         if (net_tunables->lct_max_tx_credits == -1)
418                 net_tunables->lct_max_tx_credits =
419                         *ksocknal_tunables.ksnd_credits;
420
421         if (net_tunables->lct_peer_tx_credits == -1)
422                 net_tunables->lct_peer_tx_credits =
423                         *ksocknal_tunables.ksnd_peertxcredits;
424
425         if (net_tunables->lct_peer_tx_credits >
426             net_tunables->lct_max_tx_credits)
427                 net_tunables->lct_peer_tx_credits =
428                         net_tunables->lct_max_tx_credits;
429
430         if (net_tunables->lct_peer_rtr_credits == -1)
431                 net_tunables->lct_peer_rtr_credits =
432                         *ksocknal_tunables.ksnd_peerrtrcredits;
433
434         if (!tunables->lnd_conns_per_peer)
435                 tunables->lnd_conns_per_peer =
436                         ksocklnd_lookup_conns_per_peer(ni);
437
438         if (tunables->lnd_tos < 0)
439                 tunables->lnd_tos = tos;
440
441         tunables->lnd_timeout = ksocknal_timeout();
442 }