2 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
4 * Copyright (c) 2011, 2012, Intel Corporation.
6 * Author: Eric Barton <eric@bartonsoftware.com>
8 * Portals is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Portals is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Portals; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include <linux/kvm_host.h>
25 #if defined(__x86_64__) || defined(__i386__)
26 #include <asm/hypervisor.h>
29 #define CURRENT_LND_VERSION 1
31 static int sock_timeout;
32 module_param(sock_timeout, int, 0644);
33 MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
35 static int credits = DEFAULT_CREDITS;
36 module_param(credits, int, 0444);
37 MODULE_PARM_DESC(credits, "# concurrent sends");
39 static int peer_credits = DEFAULT_PEER_CREDITS;
40 module_param(peer_credits, int, 0444);
41 MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
43 static int peer_buffer_credits;
44 module_param(peer_buffer_credits, int, 0444);
45 MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
47 static int peer_timeout = DEFAULT_PEER_TIMEOUT;
48 module_param(peer_timeout, int, 0444);
49 MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
51 /* Number of daemons in each thread pool which is percpt,
52 * we will estimate reasonable value based on CPUs if it's not set. */
53 static unsigned int nscheds;
54 module_param(nscheds, int, 0444);
55 MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
57 static int nconnds = 4;
58 module_param(nconnds, int, 0444);
59 MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
61 static int nconnds_max = 64;
62 module_param(nconnds_max, int, 0444);
63 MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
65 static int min_reconnectms = 1000;
66 module_param(min_reconnectms, int, 0644);
67 MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
69 static int max_reconnectms = 60000;
70 module_param(max_reconnectms, int, 0644);
71 MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
74 module_param(eager_ack, int, 0644);
75 MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
77 static int typed_conns = 1;
78 module_param(typed_conns, int, 0444);
79 MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
81 static int min_bulk = (1<<10);
82 module_param(min_bulk, int, 0644);
83 MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
85 # define DEFAULT_BUFFER_SIZE 0
86 static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
87 module_param(tx_buffer_size, int, 0644);
88 MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
90 static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
91 module_param(rx_buffer_size, int, 0644);
92 MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
95 module_param(nagle, int, 0644);
96 MODULE_PARM_DESC(nagle, "enable NAGLE?");
98 static int round_robin = 1;
99 module_param(round_robin, int, 0644);
100 MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
102 static int keepalive = 30;
103 module_param(keepalive, int, 0644);
104 MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
106 static int keepalive_idle = 30;
107 module_param(keepalive_idle, int, 0644);
108 MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
110 #define DEFAULT_KEEPALIVE_COUNT 5
111 static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
112 module_param(keepalive_count, int, 0644);
113 MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
115 static int keepalive_intvl = 5;
116 module_param(keepalive_intvl, int, 0644);
117 MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
119 static int enable_csum = 0;
120 module_param(enable_csum, int, 0644);
121 MODULE_PARM_DESC(enable_csum, "enable check sum");
123 static int inject_csum_error = 0;
124 module_param(inject_csum_error, int, 0644);
125 MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
127 static int enable_irq_affinity = 0;
128 module_param(enable_irq_affinity, int, 0644);
129 MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");
131 static int nonblk_zcack = 1;
132 module_param(nonblk_zcack, int, 0644);
133 MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
135 static unsigned int zc_min_payload = (16 << 10);
136 module_param(zc_min_payload, int, 0644);
137 MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
139 static unsigned int zc_recv = 0;
140 module_param(zc_recv, int, 0644);
141 MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
143 static unsigned int zc_recv_min_nfrags = 16;
144 module_param(zc_recv_min_nfrags, int, 0644);
145 MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
147 static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
148 module_param(conns_per_peer, uint, 0644);
149 MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
#ifdef SOCKNAL_BACKOFF
/* TCP backoff bounds, in seconds; only built when SOCKNAL_BACKOFF is set. */
static int backoff_init = 3;
module_param(backoff_init, int, 0644);
MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff");

static int backoff_max = 3;
module_param(backoff_max, int, 0644);
MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff");
#endif /* SOCKNAL_BACKOFF */
#if SOCKNAL_VERSION_DEBUG
/* Protocol version override; only built when SOCKNAL_VERSION_DEBUG is set. */
static int protocol = 3;
module_param(protocol, int, 0644);
MODULE_PARM_DESC(protocol, "protocol version");
#endif /* SOCKNAL_VERSION_DEBUG */
167 static inline bool is_native_host(void)
169 #ifdef HAVE_HYPERVISOR_IS_TYPE
170 return hypervisor_is_type(X86_HYPER_NATIVE);
171 #elif defined(__x86_64__) || defined(__i386__)
172 return x86_hyper == NULL;
178 struct ksock_tunables ksocknal_tunables;
179 static struct lnet_ioctl_config_socklnd_tunables default_tunables;
181 int ksocknal_tunables_init(void)
183 default_tunables.lnd_version = CURRENT_LND_VERSION;
184 default_tunables.lnd_conns_per_peer = conns_per_peer;
186 /* initialize ksocknal_tunables structure */
187 ksocknal_tunables.ksnd_timeout = &sock_timeout;
188 ksocknal_tunables.ksnd_nscheds = &nscheds;
189 ksocknal_tunables.ksnd_nconnds = &nconnds;
190 ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
191 ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
192 ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
193 ksocknal_tunables.ksnd_eager_ack = &eager_ack;
194 ksocknal_tunables.ksnd_typed_conns = &typed_conns;
195 ksocknal_tunables.ksnd_min_bulk = &min_bulk;
196 ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
197 ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
198 ksocknal_tunables.ksnd_nagle = &nagle;
199 ksocknal_tunables.ksnd_round_robin = &round_robin;
200 ksocknal_tunables.ksnd_keepalive = &keepalive;
201 ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
202 ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
203 ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
204 ksocknal_tunables.ksnd_credits = &credits;
205 ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
206 ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
207 ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
208 ksocknal_tunables.ksnd_enable_csum = &enable_csum;
209 ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
210 ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
211 ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
212 ksocknal_tunables.ksnd_zc_recv = &zc_recv;
213 ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
214 if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
215 CWARN("socklnd conns_per_peer is capped at %u.\n",
216 (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
218 ksocknal_tunables.ksnd_conns_per_peer = &conns_per_peer;
220 if (enable_irq_affinity) {
221 CWARN("irq_affinity is removed from socklnd because modern "
222 "computer always has fast CPUs and more cores than "
223 "# NICs, although you still can set irq_affinity by "
224 "another way, please check manual for details.\n");
226 ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
228 #ifdef SOCKNAL_BACKOFF
229 ksocknal_tunables.ksnd_backoff_init = &backoff_init;
230 ksocknal_tunables.ksnd_backoff_max = &backoff_max;
233 #if SOCKNAL_VERSION_DEBUG
234 ksocknal_tunables.ksnd_protocol = &protocol;
237 if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
238 *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
240 /* When on a hypervisor set the minimum zero copy size
241 * above the maximum payload size
243 if (!is_native_host())
244 *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
249 void ksocknal_tunables_setup(struct lnet_ni *ni)
251 struct lnet_ioctl_config_socklnd_tunables *tunables;
252 struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
254 /* If no tunables specified, setup default tunables */
255 if (!ni->ni_lnd_tunables_set)
256 memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock,
257 &default_tunables, sizeof(*tunables));
259 tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock;
261 /* Current API version */
262 tunables->lnd_version = CURRENT_LND_VERSION;
264 net_tunables = &ni->ni_net->net_tunables;
266 if (net_tunables->lct_peer_timeout == -1)
267 net_tunables->lct_peer_timeout =
268 *ksocknal_tunables.ksnd_peertimeout;
270 if (net_tunables->lct_max_tx_credits == -1)
271 net_tunables->lct_max_tx_credits =
272 *ksocknal_tunables.ksnd_credits;
274 if (net_tunables->lct_peer_tx_credits == -1)
275 net_tunables->lct_peer_tx_credits =
276 *ksocknal_tunables.ksnd_peertxcredits;
278 if (net_tunables->lct_peer_tx_credits >
279 net_tunables->lct_max_tx_credits)
280 net_tunables->lct_peer_tx_credits =
281 net_tunables->lct_max_tx_credits;
283 if (net_tunables->lct_peer_rtr_credits == -1)
284 net_tunables->lct_peer_rtr_credits =
285 *ksocknal_tunables.ksnd_peerrtrcredits;
287 if (!tunables->lnd_conns_per_peer)
288 tunables->lnd_conns_per_peer = (conns_per_peer) ?
289 conns_per_peer : DEFAULT_CONNS_PER_PEER;