2 * Copyright (C) 2004 Cluster File Systems, Inc.
4 * Copyright (C) 2009-2012 Cray, Inc.
6 * Derived from work by: Eric Barton <eric@bartonsoftware.com>
7 * Author: Nic Henke <nic@cray.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* Number of concurrent sends; default 256. Registered 0444, so it is
 * readable but not writable once the module is loaded. */
28 static int credits = 256;
29 module_param(credits, int, 0444);
30 MODULE_PARM_DESC(credits, "# concurrent sends");
/* Number of eager buffers; default 256 * 1024. Registered 0644, so the
 * owner (root) may rewrite it at runtime. */
32 static int eager_credits = 256 * 1024;
33 module_param(eager_credits, int, 0644);
34 MODULE_PARM_DESC(eager_credits, "# eager buffers");
/* LNet peer credits; default 16, 0444. kgnilnd_tunables_init() also uses
 * this value as the default and the ceiling for concurrent_sends. */
36 static int peer_credits = 16;
37 module_param(peer_credits, int, 0444);
38 MODULE_PARM_DESC(peer_credits, "# LNet peer credits");
40 /* NB - we'll not actually limit sends to this, we just size the mailbox buffer
41 * such that at most we'll have concurrent_sends * max_immediate messages */
/* Concurrent hardware sends to one peer, 0444. The default of 0 is a
 * placeholder: kgnilnd_tunables_init() replaces it with peer_credits. */
43 static int concurrent_sends = 0;
44 module_param(concurrent_sends, int, 0444);
45 MODULE_PARM_DESC(concurrent_sends, "# concurrent HW sends to 1 peer");
47 /* default for 2k nodes @ 16 peer credits */
48 static int fma_cq_size = 32768;
49 module_param(fma_cq_size, int, 0444);
50 MODULE_PARM_DESC(fma_cq_size, "size of the completion queue");
/* Communications timeout in seconds; default GNILND_BASE_TIMEOUT. Its
 * module_param() registration follows the comment below. */
52 static int timeout = GNILND_BASE_TIMEOUT;
53 /* can't change @ runtime because LNet gets NI data at startup from this value */
/* "timeout" is registered 0444: readable, not writable after load. */
55 module_param(timeout, int, 0444);
56 MODULE_PARM_DESC(timeout, "communications timeout (seconds)");
58 /* time to wait between datagram timeout and sending of next dgram */
/* Lower bound of the connection retry interval, in seconds; default
 * GNILND_MIN_RECONNECT_TO, 0644 (owner may change it at runtime). */
59 static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
60 module_param(min_reconnect_interval, int, 0644);
61 MODULE_PARM_DESC(min_reconnect_interval, "minimum connection retry interval (seconds)");
63 /* if this goes longer than timeout, we'll timeout the TX before the next dgram is sent */
/* Upper bound of the connection retry interval, in seconds; default
 * GNILND_MAX_RECONNECT_TO, 0644. */
65 static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
66 module_param(max_reconnect_interval, int, 0644);
67 MODULE_PARM_DESC(max_reconnect_interval, "maximum connection retry interval (seconds)");
/* Breakpoint between immediate and RDMA transfers; default 2048, 0444.
 * kgnilnd_tunables_init() logs an error when it exceeds
 * GNILND_MAX_IMMEDIATE. */
69 static int max_immediate = 2048;
70 module_param(max_immediate, int, 0444);
71 MODULE_PARM_DESC(max_immediate, "immediate/RDMA breakpoint");
/* Checksum mode: 0 none, 1 headers, 2 short messages, 3 all traffic.
 * Default GNILND_CHECKSUM_DEFAULT, 0644; the value is checked in
 * kgnilnd_tunables_init(). */
73 static int checksum = GNILND_CHECKSUM_DEFAULT;
74 module_param(checksum, int, 0644);
75 MODULE_PARM_DESC(checksum, "0: None, 1: headers, 2: short msg, 3: all traffic");
/* Checksum-failure diagnostics: 0 none, 1 dump the log on failure,
 * 2 payload data to the D_INFO log. Default 0, 0644. */
77 static int checksum_dump = 0;
78 module_param(checksum_dump, int, 0644);
79 MODULE_PARM_DESC(checksum_dump, "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
/* Delivery mode for BTE (RDMA) transfers; described as enabling hashing.
 * Default GNILND_RDMA_DLVR_OPTION, 0644. */
81 static int bte_dlvr_mode = GNILND_RDMA_DLVR_OPTION;
82 module_param(bte_dlvr_mode, int, 0644);
83 MODULE_PARM_DESC(bte_dlvr_mode, "enable hashing for BTE (RDMA) transfers");
/* Relaxed ordering (PASSPW) for BTE (RDMA) transfers; default 1, 0644. */
85 static int bte_relaxed_ordering = 1;
86 module_param(bte_relaxed_ordering, int, 0644);
87 MODULE_PARM_DESC(bte_relaxed_ordering, "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
/* NOTE(review): "ptag" is defined twice below. Presumably the two lines are
 * alternatives selected by a preprocessor conditional that is not visible
 * in this view; as written they are a redefinition. Confirm against the
 * original file before building. */
90 static int ptag = GNI_PTAG_LND_KNC;
92 static int ptag = GNI_PTAG_LND;
/* ptag for the Gemini CDM; registered 0444 (not writable after load). */
94 module_param(ptag, int, 0444);
95 MODULE_PARM_DESC(ptag, "ptag for Gemini CDM");
/* pkey for the CDM, built with GNI_JOB_CREATE_COOKIE(GNI_PKEY_LND, 0);
 * registered 0444. */
97 static int pkey = GNI_JOB_CREATE_COOKIE(GNI_PKEY_LND, 0);
98 module_param(pkey, int, 0444);
99 MODULE_PARM_DESC(pkey, "pkey for CDM");
/* Maximum retransmits for FMA; default 1024, 0444. */
101 static int max_retransmits = 1024;
102 module_param(max_retransmits, int, 0444);
103 MODULE_PARM_DESC(max_retransmits, "max retransmits for FMA");
/* Wildcard datagrams to post per net (interface); default 4, 0444. */
105 static int nwildcard = 4;
106 module_param(nwildcard, int, 0444);
107 MODULE_PARM_DESC(nwildcard, "# wildcard datagrams to post per net (interface)");
/* Nice value for kgnilnd threads; default -20, 0444. */
109 static int nice = -20;
110 module_param(nice, int, 0444);
111 MODULE_PARM_DESC(nice, "nice value for kgnilnd threads, default -20");
/* Intervals per second for rdmaq throttling; default 4, 0 disables the
 * throttling. 0644, so the owner may change it at runtime. */
113 static int rdmaq_intervals = 4;
114 module_param(rdmaq_intervals, int, 0644);
115 MODULE_PARM_DESC(rdmaq_intervals, "# intervals per second for rdmaq throttling, default 4, 0 to disable");
/* Number of loops before the scheduler "is friendly"; default 100, 0644. */
117 static int loops = 100;
118 module_param(loops, int, 0644);
119 MODULE_PARM_DESC(loops, "# of loops before scheduler is friendly, default 100");
/* Sizing of the peer/conn hash; described as a prime number, default 503,
 * 0444. Nothing in this file checks that the value is prime. */
121 static int hash_size = 503;
122 module_param(hash_size, int, 0444);
123 MODULE_PARM_DESC(hash_size, "prime number for peer/conn hash sizing, default 503");
/* LNet peer health switch; default 0 (off), values > 0 enable it. 0444. */
125 static int peer_health = 0;
126 module_param(peer_health, int, 0444);
127 MODULE_PARM_DESC(peer_health, "Disable peer timeout for LNet peer health, default off, > 0 to enable");
/* Peer timeout used with peer_health. The default of -1 means "derive it
 * from the gnilnd timeout"; a value > -1 sets it manually. 0444. */
129 static int peer_timeout = -1;
130 module_param(peer_timeout, int, 0444);
131 MODULE_PARM_DESC(peer_timeout, "Peer timeout used for peer_health, default based on gnilnd timeout, > -1 to manually set");
/* Use vmap for all kiov checksumming; default 0 (off), 0644. */
133 static int vmap_cksum = 0;
134 module_param(vmap_cksum, int, 0644);
135 MODULE_PARM_DESC(vmap_cksum, "use vmap for all kiov checksumming, default off");
/* Mailboxes per block; default GNILND_FMABLK, 0644.
 * kgnilnd_tunables_init() raises any value below 1 to 1. */
137 static int mbox_per_block = GNILND_FMABLK;
138 module_param(mbox_per_block, int, 0644);
139 MODULE_PARM_DESC(mbox_per_block, "mailboxes per block");
/* Number of mailboxes to preallocate from physical memory; default 0,
 * 0444. */
141 static int nphys_mbox = 0;
142 module_param(nphys_mbox, int, 0444);
143 MODULE_PARM_DESC(nphys_mbox, "# mbox to preallocate from physical memory, default 0");
/* Credits per mailbox; default GNILND_MBOX_CREDITS, 0644. */
145 static int mbox_credits = GNILND_MBOX_CREDITS;
146 module_param(mbox_credits, int, 0644);
147 MODULE_PARM_DESC(mbox_credits, "number of credits per mailbox");
/* Number of threads for moving data; default GNILND_SCHED_THREADS, 0444. */
149 static int sched_threads = GNILND_SCHED_THREADS;
150 module_param(sched_threads, int, 0444);
151 MODULE_PARM_DESC(sched_threads, "number of threads for moving data");
/* Sizing of the net hash; described as a prime number, default 11, 0444. */
153 static int net_hash_size = 11;
154 module_param(net_hash_size, int, 0444);
155 MODULE_PARM_DESC(net_hash_size, "prime number for net hash sizing, default 11");
/* Maximum time for traffic to get from one node to another; default
 * GNILND_HARDWARE_TIMEOUT, 0444. The unit is not stated in this file. */
157 static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
158 module_param(hardware_timeout, int, 0444);
159 MODULE_PARM_DESC(hardware_timeout, "maximum time for traffic to get from one node to another");
/* Maximum time, in minutes, for an mdd to be held; default
 * GNILND_MDD_TIMEOUT, 0644. */
161 static int mdd_timeout = GNILND_MDD_TIMEOUT;
162 module_param(mdd_timeout, int, 0644);
163 MODULE_PARM_DESC(mdd_timeout, "maximum time (in minutes) for mdd to be held");
/* Scheduler aliveness limit in seconds; default GNILND_SCHED_TIMEOUT,
 * 0644. */
165 static int sched_timeout = GNILND_SCHED_TIMEOUT;
166 module_param(sched_timeout, int, 0644);
167 MODULE_PARM_DESC(sched_timeout, "scheduler aliveness in seconds max time");
/* Scheduler nice setting; default GNILND_SCHED_NICE, which per the
 * description is 0 on compute nodes and -20 on service nodes. 0444. */
169 static int sched_nice = GNILND_SCHED_NICE;
170 module_param(sched_nice, int, 0444);
171 MODULE_PARM_DESC(sched_nice, "scheduler's nice setting, default compute 0 service -20");
/* RDMA direction selector: 0 normal, 1 reverse GET, 2 reverse PUT,
 * 3 reverse both. Default GNILND_REVERSE_RDMA, 0644. */
173 static int reverse_rdma = GNILND_REVERSE_RDMA;
174 module_param(reverse_rdma, int, 0644);
175 MODULE_PARM_DESC(reverse_rdma, "Normal 0: Reverse GET: 1 Reverse Put: 2 Reverse Both: 3");
/* Datagram thread aliveness limit in seconds; default
 * GNILND_DGRAM_TIMEOUT, 0644. */
177 static int dgram_timeout = GNILND_DGRAM_TIMEOUT;
178 module_param(dgram_timeout, int, 0644);
179 MODULE_PARM_DESC(dgram_timeout, "dgram thread aliveness seconds max time");
/* Whether a compute node should LBUG on receiving an EFAULT in a message;
 * 0 off (default), 1 on. 0644. */
181 static int efault_lbug = 0;
182 module_param(efault_lbug, int, 0644);
183 MODULE_PARM_DESC(efault_lbug, "If a compute receives an EFAULT in a message should it LBUG. 0 off 1 on");
/* Fast reconnect on connection timeout; default GNILND_FAST_RECONNECT,
 * 0644. */
185 static int fast_reconn = GNILND_FAST_RECONNECT;
186 module_param(fast_reconn, int, 0644);
187 MODULE_PARM_DESC(fast_reconn, "fast reconnect on connection timeout");
/* Maximum number of connections per peer kept in purgatory; default
 * GNILND_PURGATORY_MAX, 0644. */
189 static int max_conn_purg = GNILND_PURGATORY_MAX;
190 module_param(max_conn_purg, int, 0644);
191 MODULE_PARM_DESC(max_conn_purg, "Max number of connections per peer in purgatory");
/* Scheduler thread affinity; default 0 (disabled), 0444. */
193 static int thread_affinity = 0;
194 module_param(thread_affinity, int, 0444);
195 MODULE_PARM_DESC(thread_affinity, "scheduler thread affinity default 0 (disabled)");
/* Use the kgni thread-safe API if available; default GNILND_TS_ENABLE,
 * 0444. */
197 static int thread_safe = GNILND_TS_ENABLE;
198 module_param(thread_safe, int, 0444);
199 MODULE_PARM_DESC(thread_safe, "Use kgni thread safe API if available");
/* fmablk registration timeout that leads to an LBUG; default
 * GNILND_REGFAILTO_DISABLE, 0644. */
201 static int reg_fail_timeout = GNILND_REGFAILTO_DISABLE;
202 module_param(reg_fail_timeout, int, 0644);
203 MODULE_PARM_DESC(reg_fail_timeout, "fmablk registration timeout LBUG");
/* Table of pointers to the module parameters declared above: every field
 * is initialised with the address of one static int, and
 * kgnilnd_tunables_init() reads and adjusts the values through these
 * pointers.
 * NOTE(review): no entry for ptag, pkey, nice or loops is visible here, and
 * the closing of the initializer is not visible in this view; confirm both
 * against the original file. */
205 kgn_tunables_t kgnilnd_tunables = {
206 .kgn_min_reconnect_interval = &min_reconnect_interval,
207 .kgn_max_reconnect_interval = &max_reconnect_interval,
208 .kgn_credits = &credits,
209 .kgn_peer_credits = &peer_credits,
210 .kgn_concurrent_sends = &concurrent_sends,
211 .kgn_fma_cq_size = &fma_cq_size,
212 .kgn_timeout = &timeout,
213 .kgn_max_immediate = &max_immediate,
214 .kgn_checksum = &checksum,
215 .kgn_checksum_dump = &checksum_dump,
216 .kgn_bte_dlvr_mode = &bte_dlvr_mode,
217 .kgn_bte_relaxed_ordering = &bte_relaxed_ordering,
220 .kgn_max_retransmits = &max_retransmits,
221 .kgn_nwildcard = &nwildcard,
223 .kgn_rdmaq_intervals = &rdmaq_intervals,
225 .kgn_peer_hash_size = &hash_size,
226 .kgn_peer_health = &peer_health,
227 .kgn_peer_timeout = &peer_timeout,
228 .kgn_vmap_cksum = &vmap_cksum,
229 .kgn_mbox_per_block = &mbox_per_block,
230 .kgn_nphys_mbox = &nphys_mbox,
231 .kgn_mbox_credits = &mbox_credits,
232 .kgn_sched_threads = &sched_threads,
233 .kgn_net_hash_size = &net_hash_size,
234 .kgn_hardware_timeout = &hardware_timeout,
235 .kgn_mdd_timeout = &mdd_timeout,
236 .kgn_sched_timeout = &sched_timeout,
237 .kgn_sched_nice = &sched_nice,
238 .kgn_reverse_rdma = &reverse_rdma,
239 .kgn_dgram_timeout = &dgram_timeout,
240 .kgn_eager_credits = &eager_credits,
241 .kgn_fast_reconn = &fast_reconn,
242 .kgn_efault_lbug = &efault_lbug,
243 .kgn_thread_affinity = &thread_affinity,
244 .kgn_thread_safe = &thread_safe,
245 .kgn_reg_fail_timeout = &reg_fail_timeout,
246 .kgn_max_purgatory = &max_conn_purg
/* sysctl view of the tunables, built only when CONFIG_SYSCTL is set and
 * CFS_SYSFS_MODULE_PARM is not. Every visible entry exposes one int
 * (maxlen sizeof(int)) through proc_dointvec.
 * NOTE(review): several entries show no .procname, .data or .mode line and
 * the braces separating entries are not visible in this view; those parts
 * are left exactly as found and must be checked against the original file. */
249 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
250 static struct ctl_table kgnilnd_ctl_table[] = {
253 .procname = "min_reconnect_interval",
254 .data = &min_reconnect_interval,
255 .maxlen = sizeof(int),
257 .proc_handler = &proc_dointvec
261 .procname = "max_reconnect_interval",
262 .data = &max_reconnect_interval,
263 .maxlen = sizeof(int),
265 .proc_handler = &proc_dointvec
269 .procname = "credits",
271 .maxlen = sizeof(int),
273 .proc_handler = &proc_dointvec
277 .procname = "peer_credits",
278 .data = &peer_credits,
279 .maxlen = sizeof(int),
281 .proc_handler = &proc_dointvec
285 .procname = "fma_cq_size",
286 .data = &fma_cq_size,
287 .maxlen = sizeof(int),
289 .proc_handler = &proc_dointvec
293 .procname = "timeout",
295 .maxlen = sizeof(int),
297 .proc_handler = &proc_dointvec
301 .procname = "max_immediate",
302 .data = &max_immediate,
303 .maxlen = sizeof(int),
305 .proc_handler = &proc_dointvec
309 .procname = "checksum",
311 .maxlen = sizeof(int),
313 .proc_handler = &proc_dointvec
317 .procname = "bte_dlvr_mode",
318 .data = &bte_dlvr_mode,
319 .maxlen = sizeof(int),
321 .proc_handler = &proc_dointvec
327 .maxlen = sizeof(int),
329 .proc_handler = &proc_dointvec
335 .maxlen = sizeof(int),
337 .proc_handler = &proc_dointvec
341 .procname = "nwildcard",
343 .maxlen = sizeof(int),
345 .proc_handler = &proc_dointvec
349 .procname = "bte_relaxed_ordering",
350 .data = &bte_relaxed_ordering,
351 .maxlen = sizeof(int),
353 .proc_handler = &proc_dointvec
357 .procname = "checksum_dump",
358 .data = &checksum_dump,
359 .maxlen = sizeof(int),
361 .proc_handler = &proc_dointvec
367 .maxlen = sizeof(int),
369 .proc_handler = &proc_dointvec
373 .procname = "rdmaq_intervals",
374 .data = &rdmaq_intervals,
375 .maxlen = sizeof(int),
377 .proc_handler = &proc_dointvec
383 .maxlen = sizeof(int),
385 .proc_handler = &proc_dointvec
389 .procname = "hash_size",
391 .maxlen = sizeof(int),
393 .proc_handler = &proc_dointvec
397 .procname = "peer_health",
398 .data = &peer_health,
399 .maxlen = sizeof(int),
401 .proc_handler = &proc_dointvec
405 .procname = "vmap_cksum",
407 .maxlen = sizeof(int),
409 .proc_handler = &proc_dointvec
413 .procname = "mbox_per_block",
414 .data = &mbox_per_block,
415 .maxlen = sizeof(int),
417 .proc_handler = &proc_dointvec
421 .procname = "mbox_credits",
422 .data = &mbox_credits,
423 .maxlen = sizeof(int),
425 .proc_handler = &proc_dointvec
429 .procname = "sched_threads",
430 .data = &sched_threads,
431 .maxlen = sizeof(int),
433 .proc_handler = &proc_dointvec
437 .procname = "net_hash_size",
438 .data = &net_hash_size,
439 .maxlen = sizeof(int),
441 .proc_handler = &proc_dointvec
445 .procname = "hardware_timeout",
446 .data = &hardware_timeout,
447 .maxlen = sizeof(int),
449 .proc_handler = &proc_dointvec
453 .procname = "mdd_timeout",
454 .data = &mdd_timeout,
455 .maxlen = sizeof(int),
457 .proc_handler = &proc_dointvec
461 .procname = "max_retransmits",
462 .data = &max_retransmits,
463 .maxlen = sizeof(int),
465 .proc_handler = &proc_dointvec
469 .procname = "concurrent_sends",
470 .data = &concurrent_sends,
471 .maxlen = sizeof(int),
473 .proc_handler = &proc_dointvec
477 .procname = "nphys_mbox",
479 .maxlen = sizeof(int),
481 .proc_handler = &proc_dointvec
485 .procname = "sched_timeout",
486 .data = &sched_timeout,
487 .maxlen = sizeof(int),
489 .proc_handler = &proc_dointvec
493 .procname = "sched_nice",
495 .maxlen = sizeof(int),
497 .proc_handler = &proc_dointvec
501 .procname = "reverse_rdma",
502 .data = &reverse_rdma,
503 .maxlen = sizeof(int),
505 .proc_handler = &proc_dointvec
508 .procname = "dgram_timeout",
509 .data = &dgram_timeout,
510 .maxlen = sizeof(int),
512 .proc_handler = &proc_dointvec
516 .procname = "peer_timeout",
517 .data = &peer_timeout,
518 .maxlen = sizeof(int),
520 .proc_handler = &proc_dointvec
524 .procname = "eager_credits",
525 .data = &eager_credits,
526 .maxlen = sizeof(int),
528 .proc_handler = &proc_dointvec
532 .procname = "efault_lbug",
533 .data = &efault_lbug,
534 .maxlen = sizeof(int),
536 .proc_handler = &proc_dointvec
540 .procname = "thread_affinity",
541 .data = &thread_affinity,
542 .maxlen = sizeof(int),
544 .proc_handler = &proc_dointvec
548 .procname = "thread_safe",
549 .data = &thread_safe,
550 .maxlen = sizeof(int),
552 .proc_handler = &proc_dointvec
556 .procname = "reg_fail_timeout",
557 .data = &reg_fail_timeout,
558 .maxlen = sizeof(int),
560 .proc_handler = &proc_dointvec
564 .procname = "max_conn_purg",
565 .data = &max_conn_purg,
566 .maxlen = sizeof(int),
568 .proc_handler = &proc_dointvec
/* Parent sysctl table: a "gnilnd" entry whose child is kgnilnd_ctl_table.
 * This is the table that kgnilnd_tunables_init() passes to
 * register_sysctl_table().
 * NOTE(review): the entry braces and the end of the array are not visible
 * in this view. */
573 static struct ctl_table kgnilnd_top_ctl_table[] = {
576 .procname = "gnilnd",
580 .child = kgnilnd_ctl_table
/*
 * kgnilnd_tunables_init - register the sysctl table (when configured) and
 * sanity-check the module parameters through kgnilnd_tunables.
 *
 * NOTE(review): the return type, braces, case terminators, return
 * statements and the end of the preprocessor conditional are not visible
 * in this view, so what is returned on each path cannot be stated here.
 */
587 kgnilnd_tunables_init()
/* sysctl registration is compiled in only for CONFIG_SYSCTL without
 * CFS_SYSFS_MODULE_PARM; a NULL result is reported with CWARN. */
591 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
592 kgnilnd_tunables.kgn_sysctl =
593 register_sysctl_table(kgnilnd_top_ctl_table, 0);
595 if (kgnilnd_tunables.kgn_sysctl == NULL)
596 CWARN("Can't setup /proc tunables\n");
/* Check the checksum mode and announce it on the console.
 * NOTE(review): the CERROR below presumably belongs to a default: arm
 * whose label is not visible here. */
598 switch (*kgnilnd_tunables.kgn_checksum) {
600 CERROR("Invalid checksum module parameter: %d\n",
601 *kgnilnd_tunables.kgn_checksum);
604 case GNILND_CHECKSUM_OFF:
605 /* no checksumming */
607 case GNILND_CHECKSUM_SMSG_HEADER:
608 LCONSOLE_INFO("SMSG header only checksumming enabled\n");
610 case GNILND_CHECKSUM_SMSG:
611 LCONSOLE_INFO("SMSG checksumming enabled\n");
613 case GNILND_CHECKSUM_SMSG_BTE:
614 LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
/* max_immediate above GNILND_MAX_IMMEDIATE is reported as an error. */
618 if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
619 LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
620 *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
/* Raise mbox_per_block to 1 if it was set lower. */
625 if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
626 *kgnilnd_tunables.kgn_mbox_per_block = 1;
/* concurrent_sends == 0 means "use peer_credits"; a value larger than
 * peer_credits is reported as an error. */
629 if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
630 *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
631 } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
632 LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
633 *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
/*
 * kgnilnd_tunables_fini - undo the sysctl registration made by
 * kgnilnd_tunables_init(). Under the same CONFIG_SYSCTL &&
 * !CFS_SYSFS_MODULE_PARM condition, the table is unregistered only when
 * kgn_sysctl is non-NULL.
 *
 * NOTE(review): the return type, braces and the end of the preprocessor
 * conditional are not visible in this view.
 */
641 kgnilnd_tunables_fini()
643 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
644 if (kgnilnd_tunables.kgn_sysctl != NULL)
645 unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);