2 * Copyright (C) 2004 Cluster File Systems, Inc.
4 * Copyright (C) 2009-2012 Cray, Inc.
6 * Derived from work by: Eric Barton <eric@bartonsoftware.com>
7 * Author: Nic Henke <nic@cray.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 static int credits = GNILND_DEFAULT_CREDITS;
29 module_param(credits, int, 0444);
30 MODULE_PARM_DESC(credits, "# concurrent sends");
32 static int eager_credits = 256 * 1024;
33 module_param(eager_credits, int, 0644);
34 MODULE_PARM_DESC(eager_credits, "# eager buffers");
36 static int peer_credits = 16;
37 module_param(peer_credits, int, 0444);
38 MODULE_PARM_DESC(peer_credits, "# LNet peer credits");
40 /* NB - we'll not actually limit sends to this, we just size the mailbox buffer
41 * such that at most we'll have concurrent_sends * max_immediate messages
43 static int concurrent_sends = 0;
44 module_param(concurrent_sends, int, 0444);
45 MODULE_PARM_DESC(concurrent_sends, "# concurrent HW sends to 1 peer");
47 /* default for 2k nodes @ 16 peer credits */
48 static int fma_cq_size = 32768;
49 module_param(fma_cq_size, int, 0444);
50 MODULE_PARM_DESC(fma_cq_size, "size of the completion queue");
52 static int timeout = GNILND_BASE_TIMEOUT;
53 /* can't change @ runtime because LNet gets NI data at startup from
55 module_param(timeout, int, 0444);
56 MODULE_PARM_DESC(timeout, "communications timeout (seconds)");
58 /* time to wait between datagram timeout and sending of next dgram */
59 static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
60 module_param(min_reconnect_interval, int, 0644);
61 MODULE_PARM_DESC(min_reconnect_interval, "minimum connection retry interval (seconds)");
63 /* if this goes longer than timeout, we'll timeout the TX before
65 static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
66 module_param(max_reconnect_interval, int, 0644);
67 MODULE_PARM_DESC(max_reconnect_interval, "maximum connection retry interval (seconds)");
69 static int max_immediate = 2048;
70 module_param(max_immediate, int, 0444);
71 MODULE_PARM_DESC(max_immediate, "immediate/RDMA breakpoint");
73 static int checksum = GNILND_CHECKSUM_DEFAULT;
74 module_param(checksum, int, 0644);
75 MODULE_PARM_DESC(checksum, "0: None, 1: headers, 2: short msg, 3: all traffic");
77 static int checksum_dump = 0;
78 module_param(checksum_dump, int, 0644);
79 MODULE_PARM_DESC(checksum_dump, "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
81 static int bte_put_dlvr_mode = GNILND_RDMA_DLVR_OPTION;
82 module_param(bte_put_dlvr_mode, int, 0644);
83 MODULE_PARM_DESC(bte_put_dlvr_mode, "Modify BTE Put Routing Option");
85 static int bte_get_dlvr_mode = GNILND_RDMA_DLVR_OPTION;
86 module_param(bte_get_dlvr_mode, int, 0644);
87 MODULE_PARM_DESC(bte_get_dlvr_mode, "Modify BTE Get Routing Option");
89 static int bte_relaxed_ordering = 1;
90 module_param(bte_relaxed_ordering, int, 0644);
91 MODULE_PARM_DESC(bte_relaxed_ordering, "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
94 static int ptag = GNI_PTAG_LND_KNC;
96 static int ptag = GNI_PTAG_LND;
98 module_param(ptag, int, 0444);
99 MODULE_PARM_DESC(ptag, "ptag for Gemini CDM");
101 static int pkey = GNI_JOB_CREATE_COOKIE(GNI_PKEY_LND, 0);
102 module_param(pkey, int, 0444);
103 MODULE_PARM_DESC(pkey, "pkey for CDM");
105 static int max_retransmits = 128;
106 module_param(max_retransmits, int, 0444);
107 MODULE_PARM_DESC(max_retransmits,
108 "max retransmits for FMA before entering delay queue");
110 static int nwildcard = 4;
111 module_param(nwildcard, int, 0444);
112 MODULE_PARM_DESC(nwildcard, "# wildcard datagrams to post per net (interface)");
114 static int nice = -20;
115 module_param(nice, int, 0444);
116 MODULE_PARM_DESC(nice, "nice value for kgnilnd threads, default -20");
118 static int rdmaq_intervals = 4;
119 module_param(rdmaq_intervals, int, 0644);
120 MODULE_PARM_DESC(rdmaq_intervals, "# intervals per second for rdmaq throttling, default 4, 0 to disable");
122 static int loops = 100;
123 module_param(loops, int, 0644);
124 MODULE_PARM_DESC(loops, "# of loops before scheduler is friendly, default 100");
126 static int hash_size = 503;
127 module_param(hash_size, int, 0444);
128 MODULE_PARM_DESC(hash_size, "prime number for peer/conn hash sizing, default 503");
130 static int peer_health = 0;
131 module_param(peer_health, int, 0444);
132 MODULE_PARM_DESC(peer_health, "Disable peer timeout for LNet peer health, default off, > 0 to enable");
134 static int peer_timeout = -1;
135 module_param(peer_timeout, int, 0444);
136 MODULE_PARM_DESC(peer_timeout, "Peer timeout used for peer_health, default based on gnilnd timeout, > -1 to manually set");
138 static int vmap_cksum = 0;
139 module_param(vmap_cksum, int, 0644);
140 MODULE_PARM_DESC(vmap_cksum, "use vmap for all kiov checksumming, default off");
142 static int mbox_per_block = GNILND_FMABLK;
143 module_param(mbox_per_block, int, 0644);
144 MODULE_PARM_DESC(mbox_per_block, "mailboxes per block");
146 static int nphys_mbox = 0;
147 module_param(nphys_mbox, int, 0444);
148 MODULE_PARM_DESC(nphys_mbox, "# mbox to preallocate from physical memory, default 0");
150 static int mbox_credits = GNILND_MBOX_CREDITS;
151 module_param(mbox_credits, int, 0644);
152 MODULE_PARM_DESC(mbox_credits, "number of credits per mailbox");
154 static int sched_threads = GNILND_SCHED_THREADS;
155 module_param(sched_threads, int, 0444);
156 MODULE_PARM_DESC(sched_threads, "number of threads for moving data");
158 static int net_hash_size = 11;
159 module_param(net_hash_size, int, 0444);
160 MODULE_PARM_DESC(net_hash_size, "prime number for net hash sizing, default 11");
162 static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
163 module_param(hardware_timeout, int, 0444);
164 MODULE_PARM_DESC(hardware_timeout, "maximum time for traffic to get from one node to another");
166 static int mdd_timeout = GNILND_MDD_TIMEOUT;
167 module_param(mdd_timeout, int, 0644);
168 MODULE_PARM_DESC(mdd_timeout, "maximum time (in minutes) for mdd to be held");
170 static int sched_timeout = GNILND_SCHED_TIMEOUT;
171 module_param(sched_timeout, int, 0644);
172 MODULE_PARM_DESC(sched_timeout, "scheduler aliveness in seconds max time");
174 static int sched_nice = GNILND_SCHED_NICE;
175 module_param(sched_nice, int, 0444);
176 MODULE_PARM_DESC(sched_nice, "scheduler's nice setting, default compute 0 service -20");
178 static int reverse_rdma = GNILND_REVERSE_RDMA;
179 module_param(reverse_rdma, int, 0644);
180 MODULE_PARM_DESC(reverse_rdma, "Normal 0: Reverse GET: 1 Reverse Put: 2 Reverse Both: 3");
182 static int dgram_timeout = GNILND_DGRAM_TIMEOUT;
183 module_param(dgram_timeout, int, 0644);
184 MODULE_PARM_DESC(dgram_timeout, "dgram thread aliveness seconds max time");
186 static int efault_lbug = 0;
187 module_param(efault_lbug, int, 0644);
188 MODULE_PARM_DESC(efault_lbug, "If a compute receives an EFAULT in a message should it LBUG. 0 off 1 on");
190 static int fast_reconn = GNILND_FAST_RECONNECT;
191 module_param(fast_reconn, int, 0644);
192 MODULE_PARM_DESC(fast_reconn, "fast reconnect on connection timeout");
194 static int max_conn_purg = GNILND_PURGATORY_MAX;
195 module_param(max_conn_purg, int, 0644);
196 MODULE_PARM_DESC(max_conn_purg, "Max number of connections per peer in purgatory");
198 static int thread_affinity = 0;
199 module_param(thread_affinity, int, 0444);
200 MODULE_PARM_DESC(thread_affinity, "scheduler thread affinity default 0 (disabled)");
202 static int thread_safe = GNILND_TS_ENABLE;
203 module_param(thread_safe, int, 0444);
204 MODULE_PARM_DESC(thread_safe, "Use kgni thread safe API if available");
206 static int reg_fail_timeout = GNILND_REGFAILTO_DISABLE;
207 module_param(reg_fail_timeout, int, 0644);
208 MODULE_PARM_DESC(reg_fail_timeout, "fmablk registration timeout LBUG");
210 static int to_reconn_disable;
211 module_param(to_reconn_disable, int, 0644);
212 MODULE_PARM_DESC(to_reconn_disable,
213 "Timed out connection waits for peer before reconnecting");
215 static int vzalloc_no_retry = GNILND_VZALLOC_RETRY;
216 module_param(vzalloc_no_retry, int, 0644);
217 MODULE_PARM_DESC(vzalloc_no_retry,
218 "Should we pass the no_retry flag to vmalloc 1: no_retry 0: normal");
220 kgn_tunables_t kgnilnd_tunables = {
221 .kgn_min_reconnect_interval = &min_reconnect_interval,
222 .kgn_max_reconnect_interval = &max_reconnect_interval,
223 .kgn_credits = &credits,
224 .kgn_peer_credits = &peer_credits,
225 .kgn_concurrent_sends = &concurrent_sends,
226 .kgn_fma_cq_size = &fma_cq_size,
227 .kgn_timeout = &timeout,
228 .kgn_max_immediate = &max_immediate,
229 .kgn_checksum = &checksum,
230 .kgn_checksum_dump = &checksum_dump,
231 .kgn_bte_put_dlvr_mode = &bte_put_dlvr_mode,
232 .kgn_bte_get_dlvr_mode = &bte_get_dlvr_mode,
233 .kgn_bte_relaxed_ordering = &bte_relaxed_ordering,
236 .kgn_max_retransmits = &max_retransmits,
237 .kgn_nwildcard = &nwildcard,
239 .kgn_rdmaq_intervals = &rdmaq_intervals,
241 .kgn_peer_hash_size = &hash_size,
242 .kgn_peer_health = &peer_health,
243 .kgn_peer_timeout = &peer_timeout,
244 .kgn_vmap_cksum = &vmap_cksum,
245 .kgn_mbox_per_block = &mbox_per_block,
246 .kgn_nphys_mbox = &nphys_mbox,
247 .kgn_mbox_credits = &mbox_credits,
248 .kgn_sched_threads = &sched_threads,
249 .kgn_net_hash_size = &net_hash_size,
250 .kgn_hardware_timeout = &hardware_timeout,
251 .kgn_mdd_timeout = &mdd_timeout,
252 .kgn_sched_timeout = &sched_timeout,
253 .kgn_sched_nice = &sched_nice,
254 .kgn_reverse_rdma = &reverse_rdma,
255 .kgn_dgram_timeout = &dgram_timeout,
256 .kgn_eager_credits = &eager_credits,
257 .kgn_fast_reconn = &fast_reconn,
258 .kgn_efault_lbug = &efault_lbug,
259 .kgn_thread_affinity = &thread_affinity,
260 .kgn_thread_safe = &thread_safe,
261 .kgn_reg_fail_timeout = ®_fail_timeout,
262 .kgn_to_reconn_disable = &to_reconn_disable,
263 .kgn_max_purgatory = &max_conn_purg,
264 .kgn_vzalloc_noretry = &vzalloc_no_retry
268 kgnilnd_tunables_init(void)
272 switch (*kgnilnd_tunables.kgn_checksum) {
274 CERROR("Invalid checksum module parameter: %d\n",
275 *kgnilnd_tunables.kgn_checksum);
278 case GNILND_CHECKSUM_OFF:
279 /* no checksumming */
281 case GNILND_CHECKSUM_SMSG_HEADER:
282 LCONSOLE_INFO("SMSG header only checksumming enabled\n");
284 case GNILND_CHECKSUM_SMSG:
285 LCONSOLE_INFO("SMSG checksumming enabled\n");
287 case GNILND_CHECKSUM_SMSG_BTE:
288 LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
292 if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
293 LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
294 *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
299 if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
300 *kgnilnd_tunables.kgn_mbox_per_block = 1;
303 if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
304 *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
305 } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
306 LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
307 *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);