/*
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *
 * Copyright (C) 2009-2012 Cray, Inc.
 *
 *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
 *   Author: Nic Henke <nic@cray.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
28 static int credits = 256;
29 CFS_MODULE_PARM(credits, "i", int, 0444,
30 "# concurrent sends");
32 static int eager_credits = 256 * 1024;
33 CFS_MODULE_PARM(eager_credits, "i", int, 0444,
36 static int peer_credits = 16;
37 CFS_MODULE_PARM(peer_credits, "i", int, 0444,
38 "# LNet peer credits");
40 /* NB - we'll not actually limit sends to this, we just size the mailbox buffer
41 * such that at most we'll have concurrent_sends * max_immediate messages
43 static int concurrent_sends = 0;
44 CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
45 "# concurrent HW sends to 1 peer");
47 /* default for 2k nodes @ 16 peer credits */
48 static int fma_cq_size = 32768;
49 CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
50 "size of the completion queue");
52 static int timeout = GNILND_BASE_TIMEOUT;
53 /* can't change @ runtime because LNet gets NI data at startup from
55 CFS_MODULE_PARM(timeout, "i", int, 0444,
56 "communications timeout (seconds)");
58 /* time to wait between datagram timeout and sending of next dgram */
59 static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
60 CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
61 "minimum connection retry interval (seconds)");
63 /* if this goes longer than timeout, we'll timeout the TX before
65 static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
66 CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
67 "maximum connection retry interval (seconds)");
69 static int max_immediate = 8192;
70 CFS_MODULE_PARM(max_immediate, "i", int, 0644,
71 "immediate/RDMA breakpoint");
73 static int checksum = GNILND_CHECKSUM_DEFAULT;
74 CFS_MODULE_PARM(checksum, "i", int, 0644,
75 "0: None, 1: headers, 2: short msg, 3: all traffic");
77 static int checksum_dump = 0;
78 CFS_MODULE_PARM(checksum_dump, "i", int, 0644,
79 "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
81 static int bte_dlvr_mode = GNILND_RDMA_DLVR_OPTION;
82 CFS_MODULE_PARM(bte_dlvr_mode, "i", int, 0644,
83 "enable hashing for BTE (RDMA) transfers");
85 static int bte_relaxed_ordering = 1;
86 CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644,
87 "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
90 static int ptag = GNI_PTAG_LND_KNC;
92 static int ptag = GNI_PTAG_LND;
94 CFS_MODULE_PARM(ptag, "i", int, 0444,
95 "ptag for Gemini CDM");
97 static int pkey = GNI_JOB_CREATE_COOKIE(GNI_PKEY_LND, 0);
98 CFS_MODULE_PARM(pkey, "i", int, 0444, "pkey for CDM");
100 static int max_retransmits = 1024;
101 CFS_MODULE_PARM(max_retransmits, "i", int, 0444,
102 "max retransmits for FMA");
104 static int nwildcard = 4;
105 CFS_MODULE_PARM(nwildcard, "i", int, 0444,
106 "# wildcard datagrams to post per net (interface)");
108 static int nice = -20;
109 CFS_MODULE_PARM(nice, "i", int, 0444,
110 "nice value for kgnilnd threads, default -20");
112 static int rdmaq_intervals = 4;
113 CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644,
114 "# intervals per second for rdmaq throttling, default 4, 0 to disable");
116 static int loops = 100;
117 CFS_MODULE_PARM(loops, "i", int, 0644,
118 "# of loops before scheduler is friendly, default 100");
120 static int hash_size = 503;
121 CFS_MODULE_PARM(hash_size, "i", int, 0444,
122 "prime number for peer/conn hash sizing, default 503");
124 static int peer_health = 0;
125 CFS_MODULE_PARM(peer_health, "i", int, 0444,
126 "Disable peer timeout for LNet peer health, default off, > 0 to enable");
128 static int peer_timeout = -1;
129 CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
130 "Peer timeout used for peer_health, default based on gnilnd timeout, > -1 to manually set");
132 static int vmap_cksum = 0;
133 CFS_MODULE_PARM(vmap_cksum, "i", int, 0644,
134 "use vmap for all kiov checksumming, default off");
136 static int mbox_per_block = GNILND_FMABLK;
137 CFS_MODULE_PARM(mbox_per_block, "i", int, 0644,
138 "mailboxes per block");
140 static int nphys_mbox = 0;
141 CFS_MODULE_PARM(nphys_mbox, "i", int, 0444,
142 "# mbox to preallocate from physical memory, default 0");
144 static int mbox_credits = GNILND_MBOX_CREDITS;
145 CFS_MODULE_PARM(mbox_credits, "i", int, 0644,
146 "number of credits per mailbox");
148 static int sched_threads = GNILND_SCHED_THREADS;
149 CFS_MODULE_PARM(sched_threads, "i", int, 0444,
150 "number of threads for moving data");
152 static int net_hash_size = 11;
153 CFS_MODULE_PARM(net_hash_size, "i", int, 0444,
154 "prime number for net hash sizing, default 11");
156 static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
157 CFS_MODULE_PARM(hardware_timeout, "i", int, 0444,
158 "maximum time for traffic to get from one node to another");
160 static int mdd_timeout = GNILND_MDD_TIMEOUT;
161 CFS_MODULE_PARM(mdd_timeout, "i", int, 0644,
162 "maximum time (in minutes) for mdd to be held");
164 static int sched_timeout = GNILND_SCHED_TIMEOUT;
165 CFS_MODULE_PARM(sched_timeout, "i", int, 0644,
166 "scheduler aliveness in seconds max time");
168 static int sched_nice = GNILND_SCHED_NICE;
169 CFS_MODULE_PARM(sched_nice, "i", int, 0444,
170 "scheduler's nice setting, default compute 0 service -20");
172 static int reverse_rdma = GNILND_REVERSE_RDMA;
173 CFS_MODULE_PARM(reverse_rdma, "i", int, 0644,
174 "Normal 0: Reverse GET: 1 Reverse Put: 2 Reverse Both: 3");
176 static int dgram_timeout = GNILND_DGRAM_TIMEOUT;
177 CFS_MODULE_PARM(dgram_timeout, "i", int, 0644,
178 "dgram thread aliveness seconds max time");
180 static int efault_lbug = 0;
181 CFS_MODULE_PARM(efault_lbug, "i", int, 0644,
182 "If a compute receives an EFAULT in"
183 " a message should it LBUG. 0 off 1 on");
185 static int fast_reconn = GNILND_FAST_RECONNECT;
186 CFS_MODULE_PARM(fast_reconn, "i", int, 0644,
187 "fast reconnect on connection timeout");
189 static int max_conn_purg = GNILND_PURGATORY_MAX;
190 CFS_MODULE_PARM(max_conn_purg, "i", int, 0644,
191 "Max number of connections per peer in purgatory");
193 static int thread_affinity = 0;
194 CFS_MODULE_PARM(thread_affinity, "i", int, 0444,
195 "scheduler thread affinity default 0 (disabled)");
197 static int thread_safe = GNILND_TS_ENABLE;
198 CFS_MODULE_PARM(thread_safe, "i", int, 0444,
199 "Use kgni thread safe API if available");
201 kgn_tunables_t kgnilnd_tunables = {
202 .kgn_min_reconnect_interval = &min_reconnect_interval,
203 .kgn_max_reconnect_interval = &max_reconnect_interval,
204 .kgn_credits = &credits,
205 .kgn_peer_credits = &peer_credits,
206 .kgn_concurrent_sends = &concurrent_sends,
207 .kgn_fma_cq_size = &fma_cq_size,
208 .kgn_timeout = &timeout,
209 .kgn_max_immediate = &max_immediate,
210 .kgn_checksum = &checksum,
211 .kgn_checksum_dump = &checksum_dump,
212 .kgn_bte_dlvr_mode = &bte_dlvr_mode,
213 .kgn_bte_relaxed_ordering = &bte_relaxed_ordering,
216 .kgn_max_retransmits = &max_retransmits,
217 .kgn_nwildcard = &nwildcard,
219 .kgn_rdmaq_intervals = &rdmaq_intervals,
221 .kgn_peer_hash_size = &hash_size,
222 .kgn_peer_health = &peer_health,
223 .kgn_peer_timeout = &peer_timeout,
224 .kgn_vmap_cksum = &vmap_cksum,
225 .kgn_mbox_per_block = &mbox_per_block,
226 .kgn_nphys_mbox = &nphys_mbox,
227 .kgn_mbox_credits = &mbox_credits,
228 .kgn_sched_threads = &sched_threads,
229 .kgn_net_hash_size = &net_hash_size,
230 .kgn_hardware_timeout = &hardware_timeout,
231 .kgn_mdd_timeout = &mdd_timeout,
232 .kgn_sched_timeout = &sched_timeout,
233 .kgn_sched_nice = &sched_nice,
234 .kgn_reverse_rdma = &reverse_rdma,
235 .kgn_dgram_timeout = &dgram_timeout,
236 .kgn_eager_credits = &eager_credits,
237 .kgn_fast_reconn = &fast_reconn,
238 .kgn_efault_lbug = &efault_lbug,
239 .kgn_thread_affinity = &thread_affinity,
240 .kgn_thread_safe = &thread_safe,
241 .kgn_max_purgatory = &max_conn_purg
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
/*
 * Build one ctl_table entry for an int tunable handled by proc_dointvec.
 * Generating every entry from one template keeps the field list uniform;
 * several hand-written entries in SOURCE were missing the comma after
 * .procname and could not compile.
 *
 *   name_str - file name shown under the "gnilnd" sysctl directory
 *   var      - the static int backing the tunable
 *   perms    - file mode; mirrors the permission passed to CFS_MODULE_PARM
 *              for the same variable
 *
 * NOTE(review): the per-entry .mode values were not visible in SOURCE; they
 * are taken from the matching CFS_MODULE_PARM permission. If the original
 * entries carried an extra compatibility initializer ahead of .procname,
 * add it to this macro.
 */
#define KGN_CTL_INT(name_str, var, perms)		\
	{						\
		.procname     = name_str,		\
		.data         = &(var),			\
		.maxlen       = sizeof(int),		\
		.mode         = (perms),		\
		.proc_handler = &proc_dointvec		\
	}

/* Leaf entries: one file per tunable, in the original table order. */
static struct ctl_table kgnilnd_ctl_table[] = {
	KGN_CTL_INT("min_reconnect_interval", min_reconnect_interval, 0644),
	KGN_CTL_INT("max_reconnect_interval", max_reconnect_interval, 0644),
	KGN_CTL_INT("credits",                credits,                0444),
	KGN_CTL_INT("peer_credits",           peer_credits,           0444),
	KGN_CTL_INT("fma_cq_size",            fma_cq_size,            0444),
	KGN_CTL_INT("timeout",                timeout,                0444),
	KGN_CTL_INT("max_immediate",          max_immediate,          0644),
	KGN_CTL_INT("checksum",               checksum,               0644),
	KGN_CTL_INT("bte_dlvr_mode",          bte_dlvr_mode,          0644),
	/* NOTE(review): the ptag, pkey, nice and loops entries had lost their
	 * .procname/.data lines in SOURCE; they are identified by their
	 * position relative to the kgnilnd_tunables initializer - confirm. */
	KGN_CTL_INT("ptag",                   ptag,                   0444),
	KGN_CTL_INT("pkey",                   pkey,                   0444),
	KGN_CTL_INT("nwildcard",              nwildcard,              0444),
	KGN_CTL_INT("bte_relaxed_ordering",   bte_relaxed_ordering,   0644),
	KGN_CTL_INT("checksum_dump",          checksum_dump,          0644),
	KGN_CTL_INT("nice",                   nice,                   0444),
	KGN_CTL_INT("rdmaq_intervals",        rdmaq_intervals,        0644),
	KGN_CTL_INT("loops",                  loops,                  0644),
	KGN_CTL_INT("hash_size",              hash_size,              0444),
	KGN_CTL_INT("peer_health",            peer_health,            0444),
	KGN_CTL_INT("vmap_cksum",             vmap_cksum,             0644),
	KGN_CTL_INT("mbox_per_block",         mbox_per_block,         0644),
	KGN_CTL_INT("mbox_credits",           mbox_credits,           0644),
	KGN_CTL_INT("sched_threads",          sched_threads,          0444),
	KGN_CTL_INT("net_hash_size",          net_hash_size,          0444),
	KGN_CTL_INT("hardware_timeout",       hardware_timeout,       0444),
	KGN_CTL_INT("mdd_timeout",            mdd_timeout,            0644),
	KGN_CTL_INT("max_retransmits",        max_retransmits,        0444),
	KGN_CTL_INT("concurrent_sends",       concurrent_sends,       0444),
	KGN_CTL_INT("nphys_mbox",             nphys_mbox,             0444),
	KGN_CTL_INT("sched_timeout",          sched_timeout,          0644),
	KGN_CTL_INT("sched_nice",             sched_nice,             0444),
	KGN_CTL_INT("reverse_rdma",           reverse_rdma,           0644),
	KGN_CTL_INT("dgram_timeout",          dgram_timeout,          0644),
	KGN_CTL_INT("peer_timeout",           peer_timeout,           0444),
	KGN_CTL_INT("eager_credits",          eager_credits,          0444),
	KGN_CTL_INT("efault_lbug",            efault_lbug,            0644),
	KGN_CTL_INT("thread_affinity",        thread_affinity,        0444),
	KGN_CTL_INT("thread_safe",            thread_safe,            0444),
	KGN_CTL_INT("max_conn_purg",          max_conn_purg,          0644),
	{ 0 }	/* terminator */
};

#undef KGN_CTL_INT

/*
 * Parent entry: the "gnilnd" directory whose children are the leaf entries
 * above. This is the table handed to cfs_register_sysctl_table().
 *
 * NOTE(review): the directory's .mode was not visible in SOURCE; 0555
 * (read + search for everyone) is assumed.
 */
static struct ctl_table kgnilnd_top_ctl_table[] = {
	{
		.procname = "gnilnd",
		.data     = NULL,
		.maxlen   = 0,
		.mode     = 0555,
		.child    = kgnilnd_ctl_table
	},
	{ 0 }	/* terminator */
};
#endif
574 kgnilnd_tunables_init()
578 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
579 kgnilnd_tunables.kgn_sysctl =
580 cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0);
582 if (kgnilnd_tunables.kgn_sysctl == NULL)
583 CWARN("Can't setup /proc tunables\n");
585 switch (*kgnilnd_tunables.kgn_checksum) {
587 CERROR("Invalid checksum module parameter: %d\n",
588 *kgnilnd_tunables.kgn_checksum);
591 case GNILND_CHECKSUM_OFF:
592 /* no checksumming */
594 case GNILND_CHECKSUM_SMSG_HEADER:
595 LCONSOLE_INFO("SMSG header only checksumming enabled\n");
597 case GNILND_CHECKSUM_SMSG:
598 LCONSOLE_INFO("SMSG checksumming enabled\n");
600 case GNILND_CHECKSUM_SMSG_BTE:
601 LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
605 if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
606 LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
607 *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
612 if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
613 *kgnilnd_tunables.kgn_mbox_per_block = 1;
616 if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
617 *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
618 } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
619 LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
620 *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
/*
 * Undo kgnilnd_tunables_init(): unregister the sysctl tree if one was
 * registered. Does nothing when the sysctl interface is compiled out or
 * registration had failed (handle is NULL).
 *
 * NOTE(review): the return type was not visible in SOURCE; void is assumed.
 */
void
kgnilnd_tunables_fini(void)
{
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
	if (kgnilnd_tunables.kgn_sysctl != NULL) {
		cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);
		/* clear the handle so a repeated call is a no-op */
		kgnilnd_tunables.kgn_sysctl = NULL;
	}
#endif
}