Whamcloud - gitweb
LU-6261 gnilnd: Add ability to bind scheduler threads to cpus.
[fs/lustre-release.git] / lnet / klnds / gnilnd / gnilnd_modparams.c
1 /*
2  * Copyright (C) 2004 Cluster File Systems, Inc.
3  *
4  * Copyright (C) 2009-2012 Cray, Inc.
5  *
6  *   Derived from work by: Eric Barton <eric@bartonsoftware.com>
7  *   Author: Nic Henke <nic@cray.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  *
24  */
25
26 #include "gnilnd.h"
27
28 static int credits = 256;
29 CFS_MODULE_PARM(credits, "i", int, 0444,
30                 "# concurrent sends");
31
32 static int eager_credits = 256 * 1024;
33 CFS_MODULE_PARM(eager_credits, "i", int, 0444,
34                 "# eager buffers");
35
36 static int peer_credits = 16;
37 CFS_MODULE_PARM(peer_credits, "i", int, 0444,
38                 "# LNet peer credits");
39
40 /* NB - we'll not actually limit sends to this, we just size the mailbox buffer
41  * such that at most we'll have concurrent_sends * max_immediate messages
42  * in the mailbox */
43 static int concurrent_sends = 0;
44 CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
45                 "# concurrent HW sends to 1 peer");
46
47 /* default for 2k nodes @ 16 peer credits */
48 static int fma_cq_size = 32768;
49 CFS_MODULE_PARM(fma_cq_size, "i", int, 0444,
50                 "size of the completion queue");
51
52 static int timeout = GNILND_BASE_TIMEOUT;
53 /* can't change @ runtime because LNet gets NI data at startup from
54  * this value */
55 CFS_MODULE_PARM(timeout, "i", int, 0444,
56                 "communications timeout (seconds)");
57
58 /* time to wait between datagram timeout and sending of next dgram */
59 static int min_reconnect_interval = GNILND_MIN_RECONNECT_TO;
60 CFS_MODULE_PARM(min_reconnect_interval, "i", int, 0644,
61                 "minimum connection retry interval (seconds)");
62
63 /* if this goes longer than timeout, we'll timeout the TX before
64  * the dgram */
65 static int max_reconnect_interval = GNILND_MAX_RECONNECT_TO;
66 CFS_MODULE_PARM(max_reconnect_interval, "i", int, 0644,
67                 "maximum connection retry interval (seconds)");
68
69 static int max_immediate = 8192;
70 CFS_MODULE_PARM(max_immediate, "i", int, 0644,
71                 "immediate/RDMA breakpoint");
72
73 static int checksum = GNILND_CHECKSUM_DEFAULT;
74 CFS_MODULE_PARM(checksum, "i", int, 0644,
75                 "0: None, 1: headers, 2: short msg, 3: all traffic");
76
77 static int checksum_dump = 0;
78 CFS_MODULE_PARM(checksum_dump, "i", int, 0644,
79                 "0: None, 1: dump log on failure, 2: payload data to D_INFO log");
80
81 static int bte_dlvr_mode = GNILND_RDMA_DLVR_OPTION;
82 CFS_MODULE_PARM(bte_dlvr_mode, "i", int, 0644,
83                 "enable hashing for BTE (RDMA) transfers");
84
85 static int bte_relaxed_ordering = 1;
86 CFS_MODULE_PARM(bte_relaxed_ordering, "i", int, 0644,
87                 "enable relaxed ordering (PASSPW) for BTE (RDMA) transfers");
88
89 #ifdef CONFIG_MK1OM
90 static int ptag = GNI_PTAG_LND_KNC;
91 #else
92 static int ptag = GNI_PTAG_LND;
93 #endif
94 CFS_MODULE_PARM(ptag, "i", int, 0444,
95                 "ptag for Gemini CDM");
96
97 static int max_retransmits = 1024;
98 CFS_MODULE_PARM(max_retransmits, "i", int, 0444,
99                 "max retransmits for FMA");
100
101 static int nwildcard = 4;
102 CFS_MODULE_PARM(nwildcard, "i", int, 0444,
103                 "# wildcard datagrams to post per net (interface)");
104
105 static int nice = -20;
106 CFS_MODULE_PARM(nice, "i", int, 0444,
107                 "nice value for kgnilnd threads, default -20");
108
109 static int rdmaq_intervals = 4;
110 CFS_MODULE_PARM(rdmaq_intervals, "i", int, 0644,
111                 "# intervals per second for rdmaq throttling, default 4, 0 to disable");
112
113 static int loops = 100;
114 CFS_MODULE_PARM(loops, "i", int, 0644,
115                 "# of loops before scheduler is friendly, default 100");
116
117 static int hash_size = 503;
118 CFS_MODULE_PARM(hash_size, "i", int, 0444,
119                 "prime number for peer/conn hash sizing, default 503");
120
121 static int peer_health = 0;
122 CFS_MODULE_PARM(peer_health, "i", int, 0444,
123                 "Disable peer timeout for LNet peer health, default off, > 0 to enable");
124
125 static int peer_timeout = -1;
126 CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
127                 "Peer timeout used for peer_health, default based on gnilnd timeout, > -1 to manually set");
128
129 static int vmap_cksum = 0;
130 CFS_MODULE_PARM(vmap_cksum, "i", int, 0644,
131                 "use vmap for all kiov checksumming, default off");
132
133 static int mbox_per_block = GNILND_FMABLK;
134 CFS_MODULE_PARM(mbox_per_block, "i", int, 0644,
135                 "mailboxes per block");
136
137 static int nphys_mbox = 0;
138 CFS_MODULE_PARM(nphys_mbox, "i", int, 0444,
139                 "# mbox to preallocate from physical memory, default 0");
140
141 static int mbox_credits = GNILND_MBOX_CREDITS;
142 CFS_MODULE_PARM(mbox_credits, "i", int, 0644,
143                 "number of credits per mailbox");
144
145 static int sched_threads = GNILND_SCHED_THREADS;
146 CFS_MODULE_PARM(sched_threads, "i", int, 0444,
147                 "number of threads for moving data");
148
149 static int net_hash_size = 11;
150 CFS_MODULE_PARM(net_hash_size, "i", int, 0444,
151                 "prime number for net hash sizing, default 11");
152
153 static int hardware_timeout = GNILND_HARDWARE_TIMEOUT;
154 CFS_MODULE_PARM(hardware_timeout, "i", int, 0444,
155                 "maximum time for traffic to get from one node to another");
156
157 static int mdd_timeout = GNILND_MDD_TIMEOUT;
158 CFS_MODULE_PARM(mdd_timeout, "i", int, 0644,
159                 "maximum time (in minutes) for mdd to be held");
160
161 static int sched_timeout = GNILND_SCHED_TIMEOUT;
162 CFS_MODULE_PARM(sched_timeout, "i", int, 0644,
163                 "scheduler aliveness in seconds max time");
164
165 static int sched_nice = GNILND_SCHED_NICE;
166 CFS_MODULE_PARM(sched_nice, "i", int, 0444,
167                 "scheduler's nice setting, default compute 0 service -20");
168
169 static int reverse_rdma = GNILND_REVERSE_RDMA;
170 CFS_MODULE_PARM(reverse_rdma, "i", int, 0644,
171                 "Normal 0: Reverse GET: 1 Reverse Put: 2 Reverse Both: 3");
172
173 static int dgram_timeout = GNILND_DGRAM_TIMEOUT;
174 CFS_MODULE_PARM(dgram_timeout, "i", int, 0644,
175                 "dgram thread aliveness seconds max time");
176
177 static int efault_lbug = 0;
178 CFS_MODULE_PARM(efault_lbug, "i", int, 0644,
179                 "If a compute receives an EFAULT in"
180                 " a message should it LBUG. 0 off 1 on");
181
182 static int fast_reconn = GNILND_FAST_RECONNECT;
183 CFS_MODULE_PARM(fast_reconn, "i", int, 0644,
184                 "fast reconnect on connection timeout");
185
186 static int max_conn_purg = GNILND_PURGATORY_MAX;
187 CFS_MODULE_PARM(max_conn_purg, "i", int, 0644,
188                 "Max number of connections per peer in purgatory");
189
190 static int thread_affinity = 0;
191 CFS_MODULE_PARM(thread_affinity, "i", int, 0444,
192                 "scheduler thread affinity default 0 (diabled)");
193
194 kgn_tunables_t kgnilnd_tunables = {
195         .kgn_min_reconnect_interval = &min_reconnect_interval,
196         .kgn_max_reconnect_interval = &max_reconnect_interval,
197         .kgn_credits                = &credits,
198         .kgn_peer_credits           = &peer_credits,
199         .kgn_concurrent_sends       = &concurrent_sends,
200         .kgn_fma_cq_size            = &fma_cq_size,
201         .kgn_timeout                = &timeout,
202         .kgn_max_immediate          = &max_immediate,
203         .kgn_checksum               = &checksum,
204         .kgn_checksum_dump          = &checksum_dump,
205         .kgn_bte_dlvr_mode          = &bte_dlvr_mode,
206         .kgn_bte_relaxed_ordering   = &bte_relaxed_ordering,
207         .kgn_ptag                   = &ptag,
208         .kgn_max_retransmits        = &max_retransmits,
209         .kgn_nwildcard              = &nwildcard,
210         .kgn_nice                   = &nice,
211         .kgn_rdmaq_intervals        = &rdmaq_intervals,
212         .kgn_loops                  = &loops,
213         .kgn_peer_hash_size         = &hash_size,
214         .kgn_peer_health            = &peer_health,
215         .kgn_peer_timeout           = &peer_timeout,
216         .kgn_vmap_cksum             = &vmap_cksum,
217         .kgn_mbox_per_block         = &mbox_per_block,
218         .kgn_nphys_mbox             = &nphys_mbox,
219         .kgn_mbox_credits           = &mbox_credits,
220         .kgn_sched_threads          = &sched_threads,
221         .kgn_net_hash_size          = &net_hash_size,
222         .kgn_hardware_timeout       = &hardware_timeout,
223         .kgn_mdd_timeout            = &mdd_timeout,
224         .kgn_sched_timeout          = &sched_timeout,
225         .kgn_sched_nice             = &sched_nice,
226         .kgn_reverse_rdma           = &reverse_rdma,
227         .kgn_dgram_timeout          = &dgram_timeout,
228         .kgn_eager_credits          = &eager_credits,
229         .kgn_fast_reconn            = &fast_reconn,
230         .kgn_efault_lbug            = &efault_lbug,
231         .kgn_thread_affinity        = &thread_affinity,
232         .kgn_max_purgatory          = &max_conn_purg
233 };
234
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
/*
 * Template for one integer sysctl entry: the proc file is named after the
 * variable, is backed by that variable, and is read/written through
 * proc_dointvec.  Generating every entry from this one template keeps the
 * initializers well-formed; the hand-written list had entries with a missing
 * comma after .procname and one entry with no opening brace.
 */
#define KGNILND_CTL_INT(var, perm)                              \
        {                                                       \
                INIT_CTL_NAME                                   \
                .procname     = #var,                           \
                .data         = &var,                           \
                .maxlen       = sizeof(int),                    \
                .mode         = (perm),                         \
                .proc_handler = &proc_dointvec                  \
        }

/*
 * Per-tunable sysctl entries, in the same order as before.
 *
 * NOTE(review): max_immediate is 0444 here but 0644 as a module parameter,
 * and eager_credits is 0644 here but 0444 as a module parameter.  The modes
 * are kept exactly as they were; confirm which side is intended.
 * NOTE(review): fast_reconn has a module parameter but no entry here.
 */
static struct ctl_table kgnilnd_ctl_table[] = {
        KGNILND_CTL_INT(min_reconnect_interval, 0644),
        KGNILND_CTL_INT(max_reconnect_interval, 0644),
        KGNILND_CTL_INT(credits,                0444),
        KGNILND_CTL_INT(peer_credits,           0444),
        KGNILND_CTL_INT(fma_cq_size,            0444),
        KGNILND_CTL_INT(timeout,                0444),
        KGNILND_CTL_INT(max_immediate,          0444),
        KGNILND_CTL_INT(checksum,               0644),
        KGNILND_CTL_INT(bte_dlvr_mode,          0644),
        KGNILND_CTL_INT(ptag,                   0444),
        KGNILND_CTL_INT(nwildcard,              0444),
        KGNILND_CTL_INT(bte_relaxed_ordering,   0644),
        KGNILND_CTL_INT(checksum_dump,          0644),
        KGNILND_CTL_INT(nice,                   0444),
        KGNILND_CTL_INT(rdmaq_intervals,        0644),
        KGNILND_CTL_INT(loops,                  0644),
        KGNILND_CTL_INT(hash_size,              0444),
        KGNILND_CTL_INT(peer_health,            0444),
        KGNILND_CTL_INT(vmap_cksum,             0644),
        KGNILND_CTL_INT(mbox_per_block,         0644),
        KGNILND_CTL_INT(mbox_credits,           0644),
        KGNILND_CTL_INT(sched_threads,          0444),
        KGNILND_CTL_INT(net_hash_size,          0444),
        KGNILND_CTL_INT(hardware_timeout,       0444),
        KGNILND_CTL_INT(mdd_timeout,            0644),
        KGNILND_CTL_INT(max_retransmits,        0444),
        KGNILND_CTL_INT(concurrent_sends,       0444),
        KGNILND_CTL_INT(nphys_mbox,             0444),
        KGNILND_CTL_INT(sched_timeout,          0644),
        KGNILND_CTL_INT(sched_nice,             0444),
        KGNILND_CTL_INT(reverse_rdma,           0644),
        KGNILND_CTL_INT(dgram_timeout,          0644),
        KGNILND_CTL_INT(peer_timeout,           0444),
        KGNILND_CTL_INT(eager_credits,          0644),
        KGNILND_CTL_INT(efault_lbug,            0644),
        KGNILND_CTL_INT(thread_affinity,        0444),
        KGNILND_CTL_INT(max_conn_purg,          0644),
        { 0 }   /* terminator */
};

#undef KGNILND_CTL_INT

/* Top-level "gnilnd" directory entry whose child is the table above. */
static struct ctl_table kgnilnd_top_ctl_table[] = {
        {
                INIT_CTL_NAME
                .procname = "gnilnd",
                .data     = NULL,
                .maxlen   = 0,
                .mode     = 0555,
                .child    = kgnilnd_ctl_table
        },
        { 0 }   /* terminator */
};
#endif
547
548 int
549 kgnilnd_tunables_init()
550 {
551         int rc = 0;
552
553 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
554         kgnilnd_tunables.kgn_sysctl =
555                 cfs_register_sysctl_table(kgnilnd_top_ctl_table, 0);
556
557         if (kgnilnd_tunables.kgn_sysctl == NULL)
558                 CWARN("Can't setup /proc tunables\n");
559 #endif
560         switch (*kgnilnd_tunables.kgn_checksum) {
561         default:
562                 CERROR("Invalid checksum module parameter: %d\n",
563                        *kgnilnd_tunables.kgn_checksum);
564                 rc = -EINVAL;
565                 GOTO(out, rc);
566         case GNILND_CHECKSUM_OFF:
567                 /* no checksumming */
568                 break;
569         case GNILND_CHECKSUM_SMSG_HEADER:
570                 LCONSOLE_INFO("SMSG header only checksumming enabled\n");
571                 break;
572         case GNILND_CHECKSUM_SMSG:
573                 LCONSOLE_INFO("SMSG checksumming enabled\n");
574                 break;
575         case GNILND_CHECKSUM_SMSG_BTE:
576                 LCONSOLE_INFO("SMSG + BTE checksumming enabled\n");
577                 break;
578         }
579
580         if (*kgnilnd_tunables.kgn_max_immediate > GNILND_MAX_IMMEDIATE) {
581                 LCONSOLE_ERROR("kgnilnd module parameter 'max_immediate' too large %d > %d\n",
582                 *kgnilnd_tunables.kgn_max_immediate, GNILND_MAX_IMMEDIATE);
583                 rc = -EINVAL;
584                 GOTO(out, rc);
585         }
586
587         if (*kgnilnd_tunables.kgn_mbox_per_block < 1) {
588                 *kgnilnd_tunables.kgn_mbox_per_block = 1;
589         }
590
591         if (*kgnilnd_tunables.kgn_concurrent_sends == 0) {
592                 *kgnilnd_tunables.kgn_concurrent_sends = *kgnilnd_tunables.kgn_peer_credits;
593         } else if (*kgnilnd_tunables.kgn_concurrent_sends > *kgnilnd_tunables.kgn_peer_credits) {
594                 LCONSOLE_ERROR("kgnilnd parameter 'concurrent_sends' too large: %d > %d (peer_credits)\n",
595                                *kgnilnd_tunables.kgn_concurrent_sends, *kgnilnd_tunables.kgn_peer_credits);
596                 rc = -EINVAL;
597         }
598 out:
599         return rc;
600 }
601
/*
 * Undo kgnilnd_tunables_init(): unregister the sysctl table if one was
 * registered.  The handle is cleared afterwards so that calling this
 * function again does not unregister the table a second time.  Nothing is
 * done when the sysctl interface is not compiled in.
 */
void
kgnilnd_tunables_fini(void)
{
#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
        if (kgnilnd_tunables.kgn_sysctl != NULL) {
                cfs_unregister_sysctl_table(kgnilnd_tunables.kgn_sysctl);
                kgnilnd_tunables.kgn_sysctl = NULL;
        }
#endif
}