Whamcloud - gitweb
LU-14662 lnet: set eth routes needed for multi rail
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd_modparams.c
1 /*
2  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
3  *
4  * Copyright (c) 2011, 2012, Intel Corporation.
5  *
6  *   Author: Eric Barton <eric@bartonsoftware.com>
7  *
8  *   Portals is free software; you can redistribute it and/or
9  *   modify it under the terms of version 2 of the GNU General Public
10  *   License as published by the Free Software Foundation.
11  *
12  *   Portals is distributed in the hope that it will be useful,
13  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
14  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  *   GNU General Public License for more details.
16  *
17  *   You should have received a copy of the GNU General Public License
18  *   along with Portals; if not, write to the Free Software
19  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20  */
21
22 #include "socklnd.h"
23
24 #include <linux/kvm_host.h>
25 #if defined(__x86_64__) || defined(__i386__)
26 #include <asm/hypervisor.h>
27 #endif
28 #ifdef HAVE_ETHTOOL_LINK_SETTINGS
29 #include <linux/inetdevice.h>
30 #include <linux/ethtool.h>
31 #endif
32
33 #define CURRENT_LND_VERSION 1
34
35 static int sock_timeout;
36 module_param(sock_timeout, int, 0644);
37 MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
38
39 static int credits = DEFAULT_CREDITS;
40 module_param(credits, int, 0444);
41 MODULE_PARM_DESC(credits, "# concurrent sends");
42
43 static int peer_credits = DEFAULT_PEER_CREDITS;
44 module_param(peer_credits, int, 0444);
45 MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
46
47 static int peer_buffer_credits;
48 module_param(peer_buffer_credits, int, 0444);
49 MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
50
51 static int peer_timeout = DEFAULT_PEER_TIMEOUT;
52 module_param(peer_timeout, int, 0444);
53 MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
54
55 /* Number of daemons in each thread pool which is percpt,
56  * we will estimate reasonable value based on CPUs if it's not set. */
57 static unsigned int nscheds;
58 module_param(nscheds, int, 0444);
59 MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
60
61 static int nconnds = 4;
62 module_param(nconnds, int, 0444);
63 MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
64
65 static int nconnds_max = 64;
66 module_param(nconnds_max, int, 0444);
67 MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
68
69 static int min_reconnectms = 1000;
70 module_param(min_reconnectms, int, 0644);
71 MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
72
73 static int max_reconnectms = 60000;
74 module_param(max_reconnectms, int, 0644);
75 MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
76
77 static int eager_ack;
78 module_param(eager_ack, int, 0644);
79 MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
80
81 static int typed_conns = 1;
82 module_param(typed_conns, int, 0444);
83 MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
84
85 static int min_bulk = (1<<10);
86 module_param(min_bulk, int, 0644);
87 MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
88
89 # define DEFAULT_BUFFER_SIZE 0
90 static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
91 module_param(tx_buffer_size, int, 0644);
92 MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
93
94 static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
95 module_param(rx_buffer_size, int, 0644);
96 MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
97
98 static int nagle = 0;
99 module_param(nagle, int, 0644);
100 MODULE_PARM_DESC(nagle, "enable NAGLE?");
101
102 static int round_robin = 1;
103 module_param(round_robin, int, 0644);
104 MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
105
106 static int keepalive = 30;
107 module_param(keepalive, int, 0644);
108 MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
109
110 static int keepalive_idle = 30;
111 module_param(keepalive_idle, int, 0644);
112 MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
113
114 #define DEFAULT_KEEPALIVE_COUNT  5
115 static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
116 module_param(keepalive_count, int, 0644);
117 MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
118
119 static int keepalive_intvl = 5;
120 module_param(keepalive_intvl, int, 0644);
121 MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
122
123 static int enable_csum = 0;
124 module_param(enable_csum, int, 0644);
125 MODULE_PARM_DESC(enable_csum, "enable check sum");
126
127 static int inject_csum_error = 0;
128 module_param(inject_csum_error, int, 0644);
129 MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
130
131 static int enable_irq_affinity = 0;
132 module_param(enable_irq_affinity, int, 0644);
133 MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");
134
135 static int nonblk_zcack = 1;
136 module_param(nonblk_zcack, int, 0644);
137 MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
138
139 static unsigned int zc_min_payload = (16 << 10);
140 module_param(zc_min_payload, int, 0644);
141 MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
142
143 static unsigned int zc_recv = 0;
144 module_param(zc_recv, int, 0644);
145 MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
146
147 static unsigned int zc_recv_min_nfrags = 16;
148 module_param(zc_recv_min_nfrags, int, 0644);
149 MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
150
151 static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
152 module_param(conns_per_peer, uint, 0644);
153 MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
154
155 /* By default skip_mr_route_setup is 0 (do not skip) */
156 static unsigned int skip_mr_route_setup;
157 module_param(skip_mr_route_setup, uint, 0444);
158 MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");
159
160 #ifdef SOCKNAL_BACKOFF
161 static int backoff_init = 3;
162 module_param(backoff_init, int, 0644);
163 MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff");
164
165 static int backoff_max = 3;
166 module_param(backoff_max, int, 0644);
167 MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff");
168 #endif
169
170 #if SOCKNAL_VERSION_DEBUG
171 static int protocol = 3;
172 module_param(protocol, int, 0644);
173 MODULE_PARM_DESC(protocol, "protocol version");
174 #endif
175
176 static inline bool is_native_host(void)
177 {
178 #ifdef HAVE_HYPERVISOR_IS_TYPE
179         return hypervisor_is_type(X86_HYPER_NATIVE);
180 #elif defined(__x86_64__) || defined(__i386__)
181         return x86_hyper == NULL;
182 #else
183         return true;
184 #endif
185 }
186
187 struct ksock_tunables ksocknal_tunables;
188 static struct lnet_ioctl_config_socklnd_tunables default_tunables;
189
190 #ifdef HAVE_ETHTOOL_LINK_SETTINGS
191 static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni)
192 {
193         struct net_device *dev;
194         int intf_idx = -1;
195         int ret = -1;
196
197         DECLARE_CONST_IN_IFADDR(ifa);
198
199         /* check if ni has interface assigned */
200         if (!ni->ni_net_ns || !ni->ni_interface)
201                 return 0;
202
203         rtnl_lock();
204         for_each_netdev(ni->ni_net_ns, dev) {
205                 int flags = dev_get_flags(dev);
206                 struct in_device *in_dev;
207
208                 if (flags & IFF_LOOPBACK) /* skip the loopback IF */
209                         continue;
210
211                 if (!(flags & IFF_UP))
212                         continue;
213
214                 in_dev = __in_dev_get_rcu(dev);
215                 if (!in_dev)
216                         continue;
217
218                 in_dev_for_each_ifa_rcu(ifa, in_dev) {
219                         if (strcmp(ifa->ifa_label, ni->ni_interface) == 0)
220                                 intf_idx = dev->ifindex;
221                 }
222                 endfor_ifa(in_dev);
223
224                 if (intf_idx >= 0)
225                         break;
226         }
227         if (intf_idx >= 0) {
228                 struct ethtool_link_ksettings cmd;
229                 int ethtool_ret;
230
231                 /* Some devices may not be providing link settings */
232                 ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd);
233                 if (!ethtool_ret)
234                         ret = cmd.base.speed;
235                 else
236                         ret = ethtool_ret;
237         }
238         rtnl_unlock();
239
240         return ret;
241 }
242
/* Map an ethernet interface speed (Mbps) to a heuristically optimal
 * conns_per_peer value: ilog2(Gbps) / 2 + 1.
 *
 * Speeds below 1Gbps are treated as 1Gbps so that ilog2() is never
 * called with 0.
 */
static int ksocklnd_speed2cpp(int speed)
{
	int gbps = (speed < 1000 ? 1000 : speed) / 1000;

	return 1 + ilog2(gbps) / 2;
}
254 #endif
255
/* Choose a conns_per_peer value for @ni.
 *
 * When ethtool link settings are available the value is derived from the
 * interface speed, provided a positive speed was obtained.  In every other
 * case the result is 1.
 */
static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni)
{
#ifdef HAVE_ETHTOOL_LINK_SETTINGS
	int speed = ksocklnd_ni_get_eth_intf_speed(ni);

	if (ni->ni_interface)
		CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface, speed);

	/* zero or negative means the speed is unknown or the query failed */
	if (speed > 0)
		return ksocklnd_speed2cpp(speed);
#endif
	return 1;
}
270
271 int ksocknal_tunables_init(void)
272 {
273         default_tunables.lnd_version = CURRENT_LND_VERSION;
274         default_tunables.lnd_conns_per_peer = conns_per_peer;
275
276         /* initialize ksocknal_tunables structure */
277         ksocknal_tunables.ksnd_timeout            = &sock_timeout;
278         ksocknal_tunables.ksnd_nscheds            = &nscheds;
279         ksocknal_tunables.ksnd_nconnds            = &nconnds;
280         ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
281         ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
282         ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
283         ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
284         ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
285         ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
286         ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
287         ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
288         ksocknal_tunables.ksnd_nagle              = &nagle;
289         ksocknal_tunables.ksnd_round_robin        = &round_robin;
290         ksocknal_tunables.ksnd_keepalive          = &keepalive;
291         ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
292         ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
293         ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
294         ksocknal_tunables.ksnd_credits            = &credits;
295         ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
296         ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
297         ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
298         ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
299         ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
300         ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
301         ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
302         ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
303         ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
304         if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
305                 CWARN("socklnd conns_per_peer is capped at %u.\n",
306                       (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
307         }
308         ksocknal_tunables.ksnd_conns_per_peer     = &conns_per_peer;
309
310         if (enable_irq_affinity) {
311                 CWARN("irq_affinity is removed from socklnd because modern "
312                       "computer always has fast CPUs and more cores than "
313                       "# NICs, although you still can set irq_affinity by "
314                       "another way, please check manual for details.\n");
315         }
316         ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
317
318 #ifdef SOCKNAL_BACKOFF
319         ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
320         ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
321 #endif
322
323 #if SOCKNAL_VERSION_DEBUG
324         ksocknal_tunables.ksnd_protocol           = &protocol;
325 #endif
326
327         if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
328                 *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
329
330         /* When on a hypervisor set the minimum zero copy size
331          * above the maximum payload size
332          */
333         if (!is_native_host())
334                 *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
335
336         return 0;
337 }
338
339 void ksocknal_tunables_setup(struct lnet_ni *ni)
340 {
341         struct lnet_ioctl_config_socklnd_tunables *tunables;
342         struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
343
344         /* If no tunables specified, setup default tunables */
345         if (!ni->ni_lnd_tunables_set)
346                 memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock,
347                        &default_tunables, sizeof(*tunables));
348
349         tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock;
350
351         /* Current API version */
352         tunables->lnd_version = CURRENT_LND_VERSION;
353
354         net_tunables = &ni->ni_net->net_tunables;
355
356         if (net_tunables->lct_peer_timeout == -1)
357                 net_tunables->lct_peer_timeout =
358                         *ksocknal_tunables.ksnd_peertimeout;
359
360         if (net_tunables->lct_max_tx_credits == -1)
361                 net_tunables->lct_max_tx_credits =
362                         *ksocknal_tunables.ksnd_credits;
363
364         if (net_tunables->lct_peer_tx_credits == -1)
365                 net_tunables->lct_peer_tx_credits =
366                         *ksocknal_tunables.ksnd_peertxcredits;
367
368         if (net_tunables->lct_peer_tx_credits >
369             net_tunables->lct_max_tx_credits)
370                 net_tunables->lct_peer_tx_credits =
371                         net_tunables->lct_max_tx_credits;
372
373         if (net_tunables->lct_peer_rtr_credits == -1)
374                 net_tunables->lct_peer_rtr_credits =
375                         *ksocknal_tunables.ksnd_peerrtrcredits;
376
377         if (!tunables->lnd_conns_per_peer)
378                 tunables->lnd_conns_per_peer =
379                         ksocklnd_lookup_conns_per_peer(ni);
380 }