Whamcloud - gitweb
d89dc3e73eebc03232331ac44a69939d8a4c7cfa
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_modparams.c
1 /*
2  * Copyright (C) 2004 Cluster File Systems, Inc.
3  *
4  * Copyright (C) 2009-2012 Cray, Inc.
5  *
6  *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
7  *   Author: Nic Henke <nic@cray.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  */
25
26 #include "gnilnd.h"
27
28 static int credits = 256;
29 CFS_MODULE_PARM(credits, "i", int, 0444,
30                 "# concurrent sends");
31
32 static int eager_credits = 256 * 1024;
33 CFS_MODULE_PARM(eager_credits, "i", int, 0444,
34                 "# eager buffers");
35
36 static int peer_credits = 16;
37 CFS_MODULE_PARM(peer_credits, "i", int, 0444,
38                 "# LNet peer credits");
39
40 /* NB - we'll not actually limit sends to this, we just size the mailbox buffer
41  * such that at most we'll have concurrent_sends * max_immediate messages
42  * in the mailbox */
43 static int concurrent_sends = 0;
44 CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
45                 "# concurrent HW sends to 1 peer");
46
47 /* default for 2k nodes @ 16 peer credits */
48 static int fma_cq_size = 32768;
49 CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
50                 "size of the completion queue");
51
52 static int timeout = GNILND_BASE_TIMEOUT;
53 /* can't change @ runtime because LNet gets NI data at startup from
54  * this value */
55 CFS_MODULE_PARM(timeout, "i", int, 0444,
56                 "communications timeout (seconds)");
57
58 /* time to wait between datagram timeout and sending of next dgram */
59 static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
60 CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
61                 "minimum connection retry interval (seconds)");
62
63 /* if this goes longer than timeout, we'll timeout the TX before
64  * the dgram */
65 static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
66 CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
67                 "maximum connection retry interval (seconds)");
68
69 static int max_immediate = 8192;
70 CFS_MODULE_PARM(max_immediate, "i", int, 0644,
71                 "immediate/RDMA breakpoint");
72
73 static int checksum = GNILND_CHECKSUM_DEFAULT;
74 CFS_MODULE_PARM(checksum, "i", int, 0644,
75                 "0: None, 1: headers, 2: short msg, 3: all traffic");
76
77 static int checksum_dump = 0;
78 CFS_MODULE_PARM(checksum_dump, "i", int, 0644,
79                 "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
80
81 static int bte_dlvr_mode = GNILND_RDMA_DLVR_OPTION;
82 CFS_MODULE_PARM(bte_dlvr_mode, "i", int, 0644,
83                 "enable hashing for BTE (RDMA) transfers");
84
85 static int bte_relaxed_ordering = 1;
86 CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644,
87                 "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
88
89 #ifdef CONFIG_MK1OM
90 static int ptag = GNI_PTAG_LND_KNC;
91 #else
92 static int ptag = GNI_PTAG_LND;
93 #endif
94 CFS_MODULE_PARM(ptag, "i", int, 0444,
95                 "ptag for Gemini CDM");
96
97 static int pkey = GNI_JOB_CREATE_COOKIE(GNI_PKEY_LND, 0);
98 CFS_MODULE_PARM(pkey, "i", int, 0444, "pkey for CDM");
99
100 static int max_retransmits = 1024;
101 CFS_MODULE_PARM(max_retransmits, "i", int, 0444,
102                 "max retransmits for FMA");
103
104 static int nwildcard = 4;
105 CFS_MODULE_PARM(nwildcard, "i", int, 0444,
106                 "# wildcard datagrams to post per net (interface)");
107
108 static int nice = -20;
109 CFS_MODULE_PARM(nice, "i", int, 0444,
110                 "nice value for kgnilnd threads, default -20");
111
112 static int rdmaq_intervals = 4;
113 CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644,
114                 "# intervals per second for rdmaq throttling, default 4, 0 to disable");
115
116 static int loops = 100;
117 CFS_MODULE_PARM(loops, "i", int, 0644,
118                 "# of loops before scheduler is friendly, default 100");
119
120 static int hash_size = 503;
121 CFS_MODULE_PARM(hash_size, "i", int, 0444,
122                 "prime number for peer/conn hash sizing, default 503");
123
124 static int peer_health = 0;
125 CFS_MODULE_PARM(peer_health, "i", int, 0444,
126                 "Disable peer timeout for LNet peer health, default off, > 0 to enable");
127
128 static int peer_timeout = -1;
129 CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
130                 "Peer timeout used for peer_health, default based on gnilnd timeout, > -1 to manually set");
131
132 static int vmap_cksum = 0;
133 CFS_MODULE_PARM(vmap_cksum, "i", int, 0644,
134                 "use vmap for all kiov checksumming, default off");
135
136 static int mbox_per_block = GNILND_FMABLK;
137 CFS_MODULE_PARM(mbox_per_block, "i", int, 0644,
138                 "mailboxes per block");
139
140 static int nphys_mbox = 0;
141 CFS_MODULE_PARM(nphys_mbox, "i", int, 0444,
142                 "# mbox to preallocate from physical memory, default 0");
143
144 static int mbox_credits = GNILND_MBOX_CREDITS;
145 CFS_MODULE_PARM(mbox_credits, "i", int, 0644,
146                 "number of credits per mailbox");
147
148 static int sched_threads = GNILND_SCHED_THREADS;
149 CFS_MODULE_PARM(sched_threads, "i", int, 0444,
150                 "number of threads for moving data");
151
152 static int net_hash_size = 11;
153 CFS_MODULE_PARM(net_hash_size, "i", int, 0444,
154                 "prime number for net hash sizing, default 11");
155
156 static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
157 CFS_MODULE_PARM(hardware_timeout, "i", int, 0444,
158                 "maximum time for traffic to get from one node to another");
159
160 static int mdd_timeout = GNILND_MDD_TIMEOUT;
161 CFS_MODULE_PARM(mdd_timeout, "i", int, 0644,
162                 "maximum time (in minutes) for mdd to be held");
163
164 static int sched_timeout = GNILND_SCHED_TIMEOUT;
165 CFS_MODULE_PARM(sched_timeout, "i", int, 0644,
166                 "scheduler aliveness in seconds max time");
167
168 static int sched_nice = GNILND_SCHED_NICE;
169 CFS_MODULE_PARM(sched_nice, "i", int, 0444,
170                 "scheduler's nice setting, default compute 0 service -20");
171
172 static int reverse_rdma = GNILND_REVERSE_RDMA;
173 CFS_MODULE_PARM(reverse_rdma, "i", int, 0644,
174                 "Normal 0: Reverse GET: 1 Reverse Put: 2 Reverse Both: 3");
175
176 static int dgram_timeout = GNILND_DGRAM_TIMEOUT;
177 CFS_MODULE_PARM(dgram_timeout, "i", int, 0644,
178                 "dgram thread aliveness seconds max time");
179
180 static int efault_lbug = 0;
181 CFS_MODULE_PARM(efault_lbug, "i", int, 0644,
182                 "If a compute receives an EFAULT in"
183                 " a message should it LBUG. 0 off 1 on");
184
185 static int fast_reconn = GNILND_FAST_RECONNECT;
186 CFS_MODULE_PARM(fast_reconn, "i", int, 0644,
187                 "fast reconnect on connection timeout");
188
189 static int max_conn_purg = GNILND_PURGATORY_MAX;
190 CFS_MODULE_PARM(max_conn_purg, "i", int, 0644,
191                 "Max number of connections per peer in purgatory");
192
193 static int thread_affinity = 0;
194 CFS_MODULE_PARM(thread_affinity, "i", int, 0444,
195                 "scheduler thread affinity default 0 (diabled)");
196
197 kgn_tunables_t kgnilnd_tunables = {
198         .kgn_min_reconnect_interval = &min_reconnect_interval,
199         .kgn_max_reconnect_interval = &max_reconnect_interval,
200         .kgn_credits                = &credits,
201         .kgn_peer_credits           = &peer_credits,
202         .kgn_concurrent_sends       = &concurrent_sends,
203         .kgn_fma_cq_size            = &fma_cq_size,
204         .kgn_timeout                = &timeout,
205         .kgn_max_immediate          = &max_immediate,
206         .kgn_checksum               = &checksum,
207         .kgn_checksum_dump          = &checksum_dump,
208         .kgn_bte_dlvr_mode          = &bte_dlvr_mode,
209         .kgn_bte_relaxed_ordering   = &bte_relaxed_ordering,
210         .kgn_ptag                   = &ptag,
211         .kgn_pkey                   = &pkey,
212         .kgn_max_retransmits        = &max_retransmits,
213         .kgn_nwildcard              = &nwildcard,
214         .kgn_nice                   = &nice,
215         .kgn_rdmaq_intervals        = &rdmaq_intervals,
216         .kgn_loops                  = &loops,
217         .kgn_peer_hash_size         = &hash_size,
218         .kgn_peer_health            = &peer_health,
219         .kgn_peer_timeout           = &peer_timeout,
220         .kgn_vmap_cksum             = &vmap_cksum,
221         .kgn_mbox_per_block         = &mbox_per_block,
222         .kgn_nphys_mbox             = &nphys_mbox,
223         .kgn_mbox_credits           = &mbox_credits,
224         .kgn_sched_threads          = &sched_threads,
225         .kgn_net_hash_size          = &net_hash_size,
226         .kgn_hardware_timeout       = &hardware_timeout,
227         .kgn_mdd_timeout            = &mdd_timeout,
228         .kgn_sched_timeout          = &sched_timeout,
229         .kgn_sched_nice             = &sched_nice,
230         .kgn_reverse_rdma           = &reverse_rdma,
231         .kgn_dgram_timeout          = &dgram_timeout,
232         .kgn_eager_credits          = &eager_credits,
233         .kgn_fast_reconn            = &fast_reconn,
234         .kgn_efault_lbug            = &efault_lbug,
235         .kgn_thread_affinity        = &thread_affinity,
236         .kgn_max_purgatory          = &max_conn_purg
237 };
238
239 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
240 static struct ctl_table kgnilnd_ctl_table[] = {
241         {
242                 INIT_CTL_NAME
243                 .procname = "min_reconnect_interval",
244                 .data     = &min_reconnect_interval,
245                 .maxlen   = sizeof(int),
246                 .mode     = 0644,
247                 .proc_handler = &proc_dointvec
248         },
249         {
250                 INIT_CTL_NAME
251                 .procname = "max_reconnect_interval",
252                 .data     = &max_reconnect_interval,
253                 .maxlen   = sizeof(int),
254                 .mode     = 0644,
255                 .proc_handler = &proc_dointvec
256         },
257         {
258                 INIT_CTL_NAME
259                 .procname = "credits",
260                 .data     = &credits,
261                 .maxlen   = sizeof(int),
262                 .mode     = 0444,
263                 .proc_handler = &proc_dointvec
264         },
265         {
266                 INIT_CTL_NAME
267                 .procname = "peer_credits",
268                 .data     = &peer_credits,
269                 .maxlen   = sizeof(int),
270                 .mode     = 0444,
271                 .proc_handler = &proc_dointvec
272         },
273         {
274                 INIT_CTL_NAME
275                 .procname = "fma_cq_size",
276                 .data     = &fma_cq_size,
277                 .maxlen   = sizeof(int),
278                 .mode     = 0444,
279                 .proc_handler = &proc_dointvec
280         },
281         {
282                 INIT_CTL_NAME
283                 .procname = "timeout",
284                 .data     = &timeout,
285                 .maxlen   = sizeof(int),
286                 .mode     = 0444,
287                 .proc_handler = &proc_dointvec
288         },
289         {
290                 INIT_CTL_NAME
291                 .procname = "max_immediate",
292                 .data     = &max_immediate,
293                 .maxlen   = sizeof(int),
294                 .mode     = 0444,
295                 .proc_handler = &proc_dointvec
296         },
297         {
298                 INIT_CTL_NAME
299                 .procname = "checksum",
300                 .data     = &checksum,
301                 .maxlen   = sizeof(int),
302                 .mode     = 0644,
303                 .proc_handler = &proc_dointvec
304         },
305         {
306                 INIT_CTL_NAME
307                 .procname = "bte_dlvr_mode",
308                 .data     = &bte_dlvr_mode,
309                 .maxlen   = sizeof(int),
310                 .mode     = 0644,
311                 .proc_handler = &proc_dointvec
312         },
313         {
314                 INIT_CTL_NAME
315                 .procname = "ptag",
316                 .data     = &ptag,
317                 .maxlen   = sizeof(int),
318                 .mode     = 0444,
319                 .proc_handler = &proc_dointvec
320         },
321         {
322                 INIT_CTL_NAME
323                 .procname = "pkey",
324                 .data     = &pkey,
325                 .maxlen   = sizeof(int),
326                 .mode     = 0444,
327                 .proc_handler = &proc_dointvec
328         },
329         {
330                 INIT_CTL_NAME
331                 .procname = "nwildcard",
332                 .data     = &nwildcard,
333                 .maxlen   = sizeof(int),
334                 .mode     = 0444,
335                 .proc_handler = &proc_dointvec
336         },
337         {
338                 INIT_CTL_NAME
339                 .procname = "bte_relaxed_ordering",
340                 .data     = &bte_relaxed_ordering,
341                 .maxlen   = sizeof(int),
342                 .mode     = 0644,
343                 .proc_handler = &proc_dointvec
344         },
345         {
346                 INIT_CTL_NAME
347                 .procname = "checksum_dump",
348                 .data     = &checksum_dump,
349                 .maxlen   = sizeof(int),
350                 .mode     = 0644,
351                 .proc_handler = &proc_dointvec
352         },
353         {
354                 INIT_CTL_NAME
355                 .procname = "nice",
356                 .data     = &nice,
357                 .maxlen   = sizeof(int),
358                 .mode     = 0444,
359                 .proc_handler = &proc_dointvec
360         },
361         {
362                 INIT_CTL_NAME
363                 .procname = "rdmaq_intervals",
364                 .data     = &rdmaq_intervals,
365                 .maxlen   = sizeof(int),
366                 .mode     = 0644,
367                 .proc_handler = &proc_dointvec
368         },
369         {
370                 INIT_CTL_NAME
371                 .procname = "loops",
372                 .data     = &loops,
373                 .maxlen   = sizeof(int),
374                 .mode     = 0644,
375                 .proc_handler = &proc_dointvec
376         },
377         {
378                 INIT_CTL_NAME
379                 .procname = "hash_size",
380                 .data     = &hash_size,
381                 .maxlen   = sizeof(int),
382                 .mode     = 0444,
383                 .proc_handler = &proc_dointvec
384         },
385         {
386                 INIT_CTL_NAME
387                 .procname = "peer_health",
388                 .data     = &peer_health,
389                 .maxlen   = sizeof(int),
390                 .mode     = 0444,
391                 .proc_handler = &proc_dointvec
392         },
393         {
394                 INIT_CTL_NAME
395                 .procname = "vmap_cksum",
396                 .data     = &vmap_cksum,
397                 .maxlen   = sizeof(int),
398                 .mode     = 0644,
399                 .proc_handler = &proc_dointvec
400         },
401         {
402                 INIT_CTL_NAME
403                 .procname = "mbox_per_block",
404                 .data     = &mbox_per_block,
405                 .maxlen   = sizeof(int),
406                 .mode     = 0644,
407                 .proc_handler = &proc_dointvec
408         },
409         {
410                 INIT_CTL_NAME
411                 .procname = "mbox_credits"
412                 .data     = &mbox_credits,
413                 .maxlen   = sizeof(int),
414                 .mode     = 0644,
415                 .proc_handler = &proc_dointvec
416         },
417         {
418                 INIT_CTL_NAME
419                 .procname = "sched_threads"
420                 .data     = &sched_threads,
421                 .maxlen   = sizeof(int),
422                 .mode     = 0444,
423                 .proc_handler = &proc_dointvec
424         },
425         {
426                 INIT_CTL_NAME
427                 .procname = "net_hash_size",
428                 .data     = &net_hash_size,
429                 .maxlen   = sizeof(int),
430                 .mode     = 0444,
431                 .proc_handler = &proc_dointvec
432         },
433         {
434                 INIT_CTL_NAME
435                 .procname = "hardware_timeout",
436                 .data     = &hardware_timeout,
437                 .maxlen   = sizeof(int),
438                 .mode     = 0444,
439                 .proc_handler = &proc_dointvec
440         },
441         {
442                 INIT_CTL_NAME
443                 .procname = "mdd_timeout",
444                 .data     = &mdd_timeout,
445                 .maxlen   = sizeof(int),
446                 .mode     = 0644,
447                 .proc_handler = &proc_dointvec
448         },
449         {
450                 INIT_CTL_NAME
451                 .procname = "max_retransmits"
452                 .data     = &max_retransmits,
453                 .maxlen   = sizeof(int),
454                 .mode     = 0444,
455                 .proc_handler = &proc_dointvec
456         },
457         {
458                 INIT_CTL_NAME
459                 .procname = "concurrent_sends",
460                 .data     = &concurrent_sends,
461                 .maxlen   = sizeof(int),
462                 .mode     = 0444,
463                 .proc_handler = &proc_dointvec
464         },
465         {
466                 INIT_CTL_NAME
467                 .procname = "nphys_mbox",
468                 .data     = &nphys_mbox,
469                 .maxlen   = sizeof(int),
470                 .mode     = 0444,
471                 .proc_handler = &proc_dointvec
472         },
473         {
474                 INIT_CTL_NAME
475                 .procname = "sched_timeout",
476                 .data     = &sched_timeout,
477                 .maxlen   = sizeof(int),
478                 .mode     = 0644,
479                 .proc_handler = &proc_dointvec
480         },
481         {
482                 INIT_CTL_NAME
483                 .procname = "sched_nice",
484                 .data     = &sched_nice,
485                 .maxlen   = sizeof(int),
486                 .mode     = 0444,
487                 .proc_handler = &proc_dointvec
488         },
489         {
490                 INIT_CTL_NAME
491                 .procname = "reverse_rdma",
492                 .data     = &reverse_rdma,
493                 .maxlen   = sizeof(int),
494                 .mode     = 0644,
495                 .proc_handler = &proc_dointvec
496         },
497                 INIT_CTL_NAME
498                 .procname = "dgram_timeout"
499                 .data     = &dgram_timeout,
500                 .maxlen   = sizeof(int),
501                 .mode     = 0644,
502                 .proc_handler = &proc_dointvec
503         },
504         {
505                 INIT_CTL_NAME
506                 .procname = "peer_timeout"
507                 .data     = &peer_timeout,
508                 .maxlen   = sizeof(int),
509                 .mode     = 0444,
510                 .proc_handler = &proc_dointvec
511         },
512         {
513                 INIT_CTL_NAME
514                 .procname = "eager_credits",
515                 .data     = &eager_credits,
516                 .maxlen   = sizeof(int),
517                 .mode     = 0644,
518                 .proc_handler = &proc_dointvec
519         },
520         {
521                 INIT_CTL_NAME
522                 .procname = "efault_lbug"
523                 .data     = &efault_lbug,
524                 .maxlen   = sizeof(int),
525                 .mode     = 0644,
526                 .proc_handler = &proc_dointvec
527         },
528         {
529                 INIT_CTL_NAME
530                 .procname = "thread_affinity"
531                 .data     = &thread_affinity,
532                 .maxlen   = sizeof(int),
533                 .mode     = 0444,
534                 .proc_handler = &proc_dointvec
535         },
536         {
537                 INIT_CTL_NAME
538                 .procname = "max_conn_purg"
539                 .data     = &max_conn_purg,
540                 .maxlen   = sizeof(int),
541                 .mode     = 0644,
542                 .proc_handler = &proc_dointvec
543         },
544         { 0 }
545 };
546
547 static struct ctl_table kgnilnd_top_ctl_table[] = {
548         {
549                 INIT_CTL_NAME
550                 .procname = "gnilnd",
551                 .data     = NULL,
552                 .maxlen   = 0,
553                 .mode     = 0555,
554                 .child    = kgnilnd_ctl_table
555         },
556         { 0 }
557 };
558 #endif
559
560 int
561 kgnilnd_tunables_init()
562 {
563         int rc = 0;
564
565 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
566         kgnilnd_tunables.kgn_sysctl =
567                 cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0);
568
569         if (kgnilnd_tunables.kgn_sysctl == NULL)
570                 CWARN("Can't setup /proc tunables\n");
571 #endif
572         switch (*kgnilnd_tunables.kgn_checksum) {
573         default:
574                 CERROR("Invalid checksum module parameter: %d\n",
575                        *kgnilnd_tunables.kgn_checksum);
576                 rc = -EINVAL;
577                 GOTO(out, rc);
578         case GNILND_CHECKSUM_OFF:
579                 /* no checksumming */
580                 break;
581         case GNILND_CHECKSUM_SMSG_HEADER:
582                 LCONSOLE_INFO("SMSG header only checksumming enabled\n");
583                 break;
584         case GNILND_CHECKSUM_SMSG:
585                 LCONSOLE_INFO("SMSG checksumming enabled\n");
586                 break;
587         case GNILND_CHECKSUM_SMSG_BTE:
588                 LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
589                 break;
590         }
591
592         if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
593                 LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
594                 *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
595                 rc = -EINVAL;
596                 GOTO(out, rc);
597         }
598
599         if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
600                 *kgnilnd_tunables.kgn_mbox_per_block = 1;
601         }
602
603         if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
604                 *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
605         } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
606                 LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
607                                *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
608                 rc = -EINVAL;
609         }
610 out:
611         return rc;
612 }
613
/**
 * Undo kgnilnd_tunables_init(): unregister the sysctl table if one was
 * registered.  The handle is cleared afterwards so that calling this
 * function again is harmless.  Does nothing when the sysctl interface is
 * not compiled in.
 */
void
kgnilnd_tunables_fini(void)
{
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
        if (kgnilnd_tunables.kgn_sysctl != NULL) {
                cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);
                /* do not keep a stale handle around */
                kgnilnd_tunables.kgn_sysctl = NULL;
        }
#endif
}