2 * Copyright (C) 2004 Cluster File Systems, Inc.
4 * Copyright (C) 2009-2012 Cray, Inc.
6 * Derived from work by: Eric Barton <eric@bartonsoftware.com>
7 * Author: Nic Henke <nic@cray.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 static int credits = 256;
29 CFS_MODULE_PARM(credits, "i", int, 0444,
30 "# concurrent sends");
32 static int peer_credits = 16;
33 CFS_MODULE_PARM(peer_credits, "i", int, 0444,
34 "# LNet peer credits");
36 /* NB - we'll not actually limit sends to this, we just size the mailbox buffer
37 * such that at most we'll have concurrent_sends * max_immediate messages
39 static int concurrent_sends = 0;
40 CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
41 "# concurrent HW sends to 1 peer");
43 /* default for 2k nodes @ 16 peer credits */
44 static int fma_cq_size = 32768;
45 CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
46 "size of the completion queue");
48 static int timeout = GNILND_BASE_TIMEOUT;
49 /* can't change @ runtime because LNet gets NI data at startup from
51 CFS_MODULE_PARM(timeout, "i", int, 0444,
52 "communications timeout (seconds)");
54 /* time to wait between datagram timeout and sending of next dgram */
55 static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
56 CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
57 "minimum connection retry interval (seconds)");
59 /* if this goes longer than timeout, we'll timeout the TX before
61 static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
62 CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
63 "maximum connection retry interval (seconds)");
65 static int max_immediate = (2<<10);
66 CFS_MODULE_PARM(max_immediate, "i", int, 0644,
67 "immediate/RDMA breakpoint");
69 #ifdef CONFIG_CRAY_GEMINI
70 static int checksum = GNILND_CHECKSUM_SMSG_BTE;
72 static int checksum = 0;
74 CFS_MODULE_PARM(checksum, "i", int, 0644,
75 "0: None, 1: headers, 2: short msg, 3: all traffic");
77 static int checksum_dump = 0;
78 CFS_MODULE_PARM(checksum_dump, "i", int, 0644,
79 "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
81 static int bte_hash = 1;
82 CFS_MODULE_PARM(bte_hash, "i", int, 0644,
83 "enable hashing for BTE (RDMA) transfers");
85 static int bte_adapt = 1;
86 CFS_MODULE_PARM(bte_adapt, "i", int, 0644,
87 "enable adaptive request and response for BTE (RDMA) transfers");
89 static int bte_relaxed_ordering = 1;
90 CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644,
91 "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
93 static int ptag = GNI_PTAG_LND;
94 CFS_MODULE_PARM(ptag, "i", int, 0444,
95 "ptag for Gemini CDM");
97 static int max_retransmits = 1024;
98 CFS_MODULE_PARM(max_retransmits, "i", int, 0644,
99 "max retransmits for FMA");
101 static int nwildcard = 4;
102 CFS_MODULE_PARM(nwildcard, "i", int, 0444,
103 "# wildcard datagrams to post per net (interface)");
105 static int nice = -20;
106 CFS_MODULE_PARM(nice, "i", int, 0444,
107 "nice value for kgnilnd threads, default -20");
109 static int rdmaq_intervals = 4;
110 CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644,
111 "# intervals per second for rdmaq throttling, default 4, 0 to disable");
113 static int loops = 100;
114 CFS_MODULE_PARM(loops, "i", int, 0644,
115 "# of loops before scheduler is friendly, default 100");
117 static int hash_size = 503;
118 CFS_MODULE_PARM(hash_size, "i", int, 0444,
119 "prime number for peer/conn hash sizing, default 503");
121 static int peer_health = 0;
122 CFS_MODULE_PARM(peer_health, "i", int, 0444,
123 "Disable peer timeout for LNet peer health, default off, > 0 to enable");
125 static int vmap_cksum = 0;
126 CFS_MODULE_PARM(vmap_cksum, "i", int, 0644,
127 "use vmap for all kiov checksumming, default off");
129 static int mbox_per_block = GNILND_FMABLK;
130 CFS_MODULE_PARM(mbox_per_block, "i", int, 0644,
131 "mailboxes per block");
133 static int nphys_mbox = 0;
134 CFS_MODULE_PARM(nphys_mbox, "i", int, 0444,
135 "# mbox to preallocate from physical memory, default 0");
137 static int mbox_credits = GNILND_MBOX_CREDITS;
138 CFS_MODULE_PARM(mbox_credits, "i", int, 0644,
139 "number of credits per mailbox");
141 static int sched_threads = GNILND_SCHED_THREADS;
142 CFS_MODULE_PARM(sched_threads, "i", int, 0444,
143 "number of threads for moving data");
145 static int net_hash_size = 11;
146 CFS_MODULE_PARM(net_hash_size, "i", int, 0444,
147 "prime number for net hash sizing, default 11");
149 static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
150 CFS_MODULE_PARM(hardware_timeout, "i", int, 0444,
151 "maximum time for traffic to get from one node to another");
153 static int mdd_timeout = GNILND_MDD_TIMEOUT;
154 CFS_MODULE_PARM(mdd_timeout, "i", int, 0644,
155 "maximum time (in minutes) for mdd to be held");
157 kgn_tunables_t kgnilnd_tunables = {
158 .kgn_min_reconnect_interval = &min_reconnect_interval,
159 .kgn_max_reconnect_interval = &max_reconnect_interval,
160 .kgn_credits = &credits,
161 .kgn_peer_credits = &peer_credits,
162 .kgn_concurrent_sends = &concurrent_sends,
163 .kgn_fma_cq_size = &fma_cq_size,
164 .kgn_timeout = &timeout,
165 .kgn_max_immediate = &max_immediate,
166 .kgn_checksum = &checksum,
167 .kgn_checksum_dump = &checksum_dump,
168 .kgn_bte_hash = &bte_hash,
169 .kgn_bte_adapt = &bte_adapt,
170 .kgn_bte_relaxed_ordering = &bte_relaxed_ordering,
172 .kgn_max_retransmits = &max_retransmits,
173 .kgn_nwildcard = &nwildcard,
175 .kgn_rdmaq_intervals = &rdmaq_intervals,
177 .kgn_peer_hash_size = &hash_size,
178 .kgn_peer_health = &peer_health,
179 .kgn_vmap_cksum = &vmap_cksum,
180 .kgn_mbox_per_block = &mbox_per_block,
181 .kgn_nphys_mbox = &nphys_mbox,
182 .kgn_mbox_credits = &mbox_credits,
183 .kgn_sched_threads = &sched_threads,
184 .kgn_net_hash_size = &net_hash_size,
185 .kgn_hardware_timeout = &hardware_timeout,
186 .kgn_mdd_timeout = &mdd_timeout
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
/*
 * One integer sysctl entry.  The entry is named after the variable it
 * exposes, which holds for every name/data pair visible in the original
 * table.  Generating the entry from a macro also removes the hand-written
 * commas: three entries ("mbox_credits", "sched_threads",
 * "max_retransmits") lacked the comma after .procname and could not compile.
 *
 * NOTE(review): the original entries have one line ahead of .procname and a
 * .mode line that are not visible in this view.  The modes used below are
 * the permission bits of the matching CFS_MODULE_PARM declaration; if the
 * hidden leading line carries a per-entry initializer, add it to this macro.
 */
#define KGNILND_INT_CTL(var, perm)			\
	{						\
		.procname     = #var,			\
		.data         = &var,			\
		.maxlen       = sizeof(int),		\
		.mode         = (perm),			\
		.proc_handler = &proc_dointvec		\
	}

/*
 * Leaf entries of the "gnilnd" sysctl directory, in the original order.
 *
 * NOTE(review): the ptag, nice and loops entries had neither .procname nor
 * .data visible; they are identified as the only three parameters without
 * an entry, placed in the three slots that lacked both lines.
 */
static cfs_sysctl_table_t kgnilnd_ctl_table[] = {
	KGNILND_INT_CTL(min_reconnect_interval, 0644),
	KGNILND_INT_CTL(max_reconnect_interval, 0644),
	KGNILND_INT_CTL(credits, 0444),
	KGNILND_INT_CTL(peer_credits, 0444),
	KGNILND_INT_CTL(fma_cq_size, 0444),
	KGNILND_INT_CTL(timeout, 0444),
	KGNILND_INT_CTL(max_immediate, 0644),
	KGNILND_INT_CTL(checksum, 0644),
	KGNILND_INT_CTL(bte_hash, 0644),
	KGNILND_INT_CTL(bte_adapt, 0644),
	KGNILND_INT_CTL(ptag, 0444),
	KGNILND_INT_CTL(nwildcard, 0444),
	KGNILND_INT_CTL(bte_relaxed_ordering, 0644),
	KGNILND_INT_CTL(checksum_dump, 0644),
	KGNILND_INT_CTL(nice, 0444),
	KGNILND_INT_CTL(rdmaq_intervals, 0644),
	KGNILND_INT_CTL(loops, 0644),
	KGNILND_INT_CTL(hash_size, 0444),
	KGNILND_INT_CTL(peer_health, 0444),
	KGNILND_INT_CTL(vmap_cksum, 0644),
	KGNILND_INT_CTL(mbox_per_block, 0644),
	KGNILND_INT_CTL(mbox_credits, 0644),
	KGNILND_INT_CTL(sched_threads, 0444),
	KGNILND_INT_CTL(net_hash_size, 0444),
	KGNILND_INT_CTL(hardware_timeout, 0444),
	KGNILND_INT_CTL(mdd_timeout, 0644),
	KGNILND_INT_CTL(max_retransmits, 0644),
	KGNILND_INT_CTL(concurrent_sends, 0444),
	KGNILND_INT_CTL(nphys_mbox, 0444),
	{ 0 }	/* all-zero entry terminates the table */
};

#undef KGNILND_INT_CTL

/*
 * Top-level table: a single "gnilnd" directory whose children are the
 * entries above.  This is the table handed to cfs_register_sysctl_table().
 */
static cfs_sysctl_table_t kgnilnd_top_ctl_table[] = {
	{
		.procname = "gnilnd",
		/* NOTE(review): mode line not visible in this view; 0555 is
		 * the usual directory mode - confirm against upstream */
		.mode     = 0555,
		.child    = kgnilnd_ctl_table
	},
	{ 0 }	/* all-zero entry terminates the table */
};
#endif
/*
 * kgnilnd_tunables_init - register the sysctl tree (when built with it) and
 * sanity-check the module parameters.
 *
 * Steps visible here:
 *  - with CONFIG_SYSCTL and without CFS_SYSFS_MODULE_PARM, register
 *    kgnilnd_top_ctl_table and keep the handle in kgnilnd_tunables.kgn_sysctl;
 *    a NULL handle only produces a warning;
 *  - report the checksum level selected by the 'checksum' parameter, or log
 *    an error when it is not one of the GNILND_CHECKSUM_* values listed;
 *  - log an error when max_immediate exceeds GNILND_MAX_IMMEDIATE;
 *  - raise mbox_per_block to 1 when it is below 1;
 *  - replace a concurrent_sends of 0 with peer_credits, and log an error
 *    when concurrent_sends exceeds peer_credits.
 *
 * NOTE(review): this view of the function is incomplete. The return type,
 * the braces, the #endif closing the sysctl section, the switch's first
 * label, the statements that follow each error message, and any break
 * statements are not visible. What the function returns on each path cannot
 * be stated from here; confirm against the complete file.
 */
440 kgnilnd_tunables_init()
444 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
445 kgnilnd_tunables.kgn_sysctl =
446 cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0);
/* registration failure is not fatal here: it is only warned about */
448 if (kgnilnd_tunables.kgn_sysctl == NULL)
449 CWARN("Can't setup /proc tunables\n");
/* kgn_checksum points at the 'checksum' module parameter */
451 switch (*kgnilnd_tunables.kgn_checksum) {
/* NOTE(review): the label guarding this error message is not visible;
 * presumably it is the default case - confirm */
453 CERROR("Invalid checksum module parameter: %d\n",
454 *kgnilnd_tunables.kgn_checksum);
457 case GNILND_CHECKSUM_OFF:
458 /* no checksumming */
460 case GNILND_CHECKSUM_SMSG_HEADER:
461 LCONSOLE_INFO("SMSG header only checksumming enabled\n");
463 case GNILND_CHECKSUM_SMSG:
464 LCONSOLE_INFO("SMSG checksumming enabled\n");
466 case GNILND_CHECKSUM_SMSG_BTE:
467 LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
/* max_immediate is bounded above by GNILND_MAX_IMMEDIATE */
471 if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
472 LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
473 *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
/* values below 1 are silently raised to 1 rather than rejected */
478 if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
479 *kgnilnd_tunables.kgn_mbox_per_block = 1;
/* 0 means "not set": fall back to the peer_credits value */
482 if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
483 *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
484 } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
485 LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
486 *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
/*
 * kgnilnd_tunables_fini - undo the sysctl registration made at init time.
 *
 * With CONFIG_SYSCTL and without CFS_SYSFS_MODULE_PARM, the handle stored in
 * kgnilnd_tunables.kgn_sysctl is unregistered if it is non-NULL.
 *
 * NOTE(review): the return type, the braces and the #endif closing the
 * conditional section are not visible in this view; confirm against the
 * complete file.
 */
494 kgnilnd_tunables_fini()
496 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
/* a NULL handle means registration failed or never happened; nothing to undo */
497 if (kgnilnd_tunables.kgn_sysctl != NULL)
498 cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);