Whamcloud - gitweb
LU-9679 modules: Use LIST_HEAD for declaring list_heads
[fs/lustre-release.git] / lnet / klnds / o2iblnd / o2iblnd_modparams.c
index d09da47..2a329e5 100644 (file)
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/*
+ * GPL HEADER START
  *
- * Copyright (C) 2006 Cluster File Systems, Inc.
- *   Author: Eric Barton <eric@bartonsoftware.com>
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   This file is part of Lustre, http://www.lustre.org.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.gnu.org/licenses/gpl-2.0.html
  *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, 2017, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lnet/klnds/o2iblnd/o2iblnd_modparams.c
  *
+ * Author: Eric Barton <eric@bartonsoftware.com>
  */
 
 #include "o2iblnd.h"
 
+#define CURRENT_LND_VERSION 1
+
 static int service = 987;
-CFS_MODULE_PARM(service, "i", int, 0444,
-                "service number (within RDMA_PS_TCP)");
+module_param(service, int, 0444);
+MODULE_PARM_DESC(service, "service number (within RDMA_PS_TCP)");
 
 static int cksum = 0;
-CFS_MODULE_PARM(cksum, "i", int, 0644,
-               "set non-zero to enable message (not RDMA) checksums");
+module_param(cksum, int, 0644);
+MODULE_PARM_DESC(cksum, "set non-zero to enable message (not RDMA) checksums");
 
 static int timeout = 50;
-CFS_MODULE_PARM(timeout, "i", int, 0644,
-               "timeout (seconds)");
+module_param(timeout, int, 0644);
+MODULE_PARM_DESC(timeout, "timeout (seconds)");
+
+/* Number of threads in each scheduler pool (pools are per-CPT).
+ * If set to zero, a reasonable value is estimated based on the CPUs. */
+static int nscheds;
+module_param(nscheds, int, 0444);
+MODULE_PARM_DESC(nscheds, "number of threads in each scheduler pool");
+
+static unsigned int conns_per_peer = 1;
+module_param(conns_per_peer, uint, 0444);
+MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
 
-static int ntx = 256;
-CFS_MODULE_PARM(ntx, "i", int, 0444,
-               "# of message descriptors");
+/* NB: this value is shared by all CPTs, it can grow at runtime */
+static int ntx = 512;
+module_param(ntx, int, 0444);
+MODULE_PARM_DESC(ntx, "# of message descriptors allocated for each pool");
 
-static int credits = 64;
-CFS_MODULE_PARM(credits, "i", int, 0444,
-               "# concurrent sends");
+/* NB: this value is shared by all CPTs */
+static int credits = 256;
+module_param(credits, int, 0444);
+MODULE_PARM_DESC(credits, "# concurrent sends");
 
 static int peer_credits = 8;
-CFS_MODULE_PARM(peer_credits, "i", int, 0444,
-               "# concurrent sends to 1 peer");
+module_param(peer_credits, int, 0444);
+MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
+
+static int peer_credits_hiw = 0;
+module_param(peer_credits_hiw, int, 0444);
+MODULE_PARM_DESC(peer_credits_hiw, "when eagerly to return credits");
+
+static int peer_buffer_credits = 0;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
+
+static int peer_timeout = DEFAULT_PEER_TIMEOUT;
+module_param(peer_timeout, int, 0444);
+MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
 
 static char *ipif_name = "ib0";
-CFS_MODULE_PARM(ipif_name, "s", charp, 0444,
-                "IPoIB interface name");
+module_param(ipif_name, charp, 0444);
+MODULE_PARM_DESC(ipif_name, "IPoIB interface name");
 
 static int retry_count = 5;
-CFS_MODULE_PARM(retry_count, "i", int, 0644,
-                "Retransmissions when no ACK received");
+module_param(retry_count, int, 0644);
+MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
 
 static int rnr_retry_count = 6;
-CFS_MODULE_PARM(rnr_retry_count, "i", int, 0644,
-                "RNR retransmissions");
+module_param(rnr_retry_count, int, 0644);
+MODULE_PARM_DESC(rnr_retry_count, "RNR retransmissions");
 
 static int keepalive = 100;
-CFS_MODULE_PARM(keepalive, "i", int, 0644,
-                "Idle time in seconds before sending a keepalive");
-
-static int ib_mtu = 0;
-CFS_MODULE_PARM(ib_mtu, "i", int, 0444,
-                "IB MTU 256/512/1024/2048/4096");
-
-#if IBLND_MAP_ON_DEMAND
-static int concurrent_sends = IBLND_RX_MSGS;
+module_param(keepalive, int, 0644);
+MODULE_PARM_DESC(keepalive, "Idle time in seconds before sending a keepalive");
+
+static int ib_mtu;
+module_param(ib_mtu, int, 0444);
+MODULE_PARM_DESC(ib_mtu, "IB MTU 256/512/1024/2048/4096");
+
+static int concurrent_sends;
+module_param(concurrent_sends, int, 0444);
+MODULE_PARM_DESC(concurrent_sends, "send work-queue sizing");
+
+static int use_fastreg_gaps;
+module_param(use_fastreg_gaps, int, 0444);
+MODULE_PARM_DESC(use_fastreg_gaps, "Enable discontiguous fastreg fragment support. Expect performance drop");
+
+/*
+ * map_on_demand is a flag used to determine if we can use FMR or FastReg.
+ * This is applicable for kernels which support global memory regions. For
+ * later kernels this flag is always enabled, since we will always either
+ * use FMR or FastReg.
+ * For kernels which support global memory regions map_on_demand defaults
+ * to 0 which means we will be using global memory regions exclusively.
+ * If it is set to a value other than 0, then we will behave as follows:
+ *  1. Always default the number of fragments to IBLND_MAX_RDMA_FRAGS
+ *  2. Create FMR/FastReg pools
+ *  3. Negotiate the supported number of fragments per connection
+ *  4. Attempt to transmit using global memory regions only if
+ *     map-on-demand is not turned on, otherwise use FMR or FastReg
+ *  5. In case of transmitting tx with GAPS over FMR we will need to
+ *     transmit it with multiple fragments. Look at the comments in
+ *     kiblnd_fmr_map_tx() for an explanation of the behavior.
+ *
+ * For later kernels we default map_on_demand to 1 and do not allow
+ * it to be set to 0, since there is no longer support for global memory
+ * regions. Behavior:
+ *  1. Default the number of fragments to IBLND_MAX_RDMA_FRAGS
+ *  2. Create FMR/FastReg pools
+ *  3. Negotiate the supported number of fragments per connection
+ *  4. Look at the comments in kiblnd_fmr_map_tx() for an explanation of
+ *     the behavior when transmitting with GAPS versus contiguous.
+ */
+#ifdef HAVE_IB_GET_DMA_MR
+#define IBLND_DEFAULT_MAP_ON_DEMAND 0
+#define MOD_STR "map on demand"
 #else
-static int concurrent_sends = IBLND_MSG_QUEUE_SIZE;
+#define IBLND_DEFAULT_MAP_ON_DEMAND 1
+#define MOD_STR "map on demand (obsolete)"
 #endif
-CFS_MODULE_PARM(concurrent_sends, "i", int, 0444,
-                "send work-queue sizing");
+static int map_on_demand = IBLND_DEFAULT_MAP_ON_DEMAND;
+module_param(map_on_demand, int, 0444);
+MODULE_PARM_DESC(map_on_demand, MOD_STR);
 
-#if IBLND_MAP_ON_DEMAND
+/* NB: this value is shared by all CPTs, it can grow at runtime */
 static int fmr_pool_size = 512;
-CFS_MODULE_PARM(fmr_pool_size, "i", int, 0444,
-                "size of the fmr pool (>= ntx)");
+module_param(fmr_pool_size, int, 0444);
+MODULE_PARM_DESC(fmr_pool_size, "size of fmr pool on each CPT (>= ntx / 4)");
 
+/* NB: this value is shared by all CPTs, it can grow at runtime */
 static int fmr_flush_trigger = 384;
-CFS_MODULE_PARM(fmr_flush_trigger, "i", int, 0444,
-                "# dirty FMRs that triggers pool flush");
+module_param(fmr_flush_trigger, int, 0444);
+MODULE_PARM_DESC(fmr_flush_trigger, "# dirty FMRs that triggers pool flush");
 
 static int fmr_cache = 1;
-CFS_MODULE_PARM(fmr_cache, "i", int, 0444,
-                "non-zero to enable FMR caching");
-#endif
+module_param(fmr_cache, int, 0444);
+MODULE_PARM_DESC(fmr_cache, "non-zero to enable FMR caching");
+
+/*
+ * 0: disable failover
+ * 1: enable failover if necessary
+ * 2: force to failover (for debug)
+ */
+static int dev_failover = 0;
+module_param(dev_failover, int, 0444);
+MODULE_PARM_DESC(dev_failover, "HCA failover for bonding (0 off, 1 on, other values reserved)");
+
+static int require_privileged_port;
+module_param(require_privileged_port, int, 0644);
+MODULE_PARM_DESC(require_privileged_port, "require privileged port when accepting connection");
 
-kib_tunables_t kiblnd_tunables = {
+static int use_privileged_port = 1;
+module_param(use_privileged_port, int, 0644);
+MODULE_PARM_DESC(use_privileged_port, "use privileged port when initiating connection");
+
+static unsigned int wrq_sge = 2;
+module_param(wrq_sge, uint, 0444);
+MODULE_PARM_DESC(wrq_sge, "# scatter/gather element per work request");
+
+struct kib_tunables kiblnd_tunables = {
+        .kib_dev_failover           = &dev_failover,
         .kib_service                = &service,
         .kib_cksum                  = &cksum,
         .kib_timeout                = &timeout,
         .kib_keepalive              = &keepalive,
-        .kib_ntx                    = &ntx,
-        .kib_credits                = &credits,
-        .kib_peercredits            = &peer_credits,
         .kib_default_ipif           = &ipif_name,
         .kib_retry_count            = &retry_count,
         .kib_rnr_retry_count        = &rnr_retry_count,
-        .kib_concurrent_sends       = &concurrent_sends,
         .kib_ib_mtu                 = &ib_mtu,
-#if IBLND_MAP_ON_DEMAND
-        .kib_fmr_pool_size          = &fmr_pool_size,
-        .kib_fmr_flush_trigger      = &fmr_flush_trigger,
-        .kib_fmr_cache              = &fmr_cache,
-#endif
+        .kib_require_priv_port      = &require_privileged_port,
+       .kib_use_priv_port          = &use_privileged_port,
+       .kib_nscheds                = &nscheds,
+       .kib_wrq_sge                = &wrq_sge,
+       .kib_use_fastreg_gaps       = &use_fastreg_gaps,
 };
 
-#if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
-
-static char ipif_basename_space[32];
-
-static ctl_table kiblnd_ctl_table[] = {
-       {1, "service", &service, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-       {2, "cksum", &cksum, 
-        sizeof(int), 0644, NULL, &proc_dointvec},
-       {3, "timeout", &timeout, 
-        sizeof(int), 0644, NULL, &proc_dointvec},
-       {4, "ntx", &ntx, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-       {5, "credits", &credits, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-       {6, "peer_credits", &peer_credits, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-       {7, "ipif_name", ipif_basename_space, 
-        sizeof(ipif_basename_space), 0444, NULL, &proc_dostring},
-       {8, "retry_count", &retry_count, 
-        sizeof(int), 0644, NULL, &proc_dointvec},
-       {9, "rnr_retry_count", &rnr_retry_count, 
-        sizeof(int), 0644, NULL, &proc_dointvec},
-       {10, "keepalive", &keepalive, 
-        sizeof(int), 0644, NULL, &proc_dointvec},
-       {11, "concurrent_sends", &concurrent_sends, 
-        sizeof(int), 0644, NULL, &proc_dointvec},
-       {12, "ib_mtu", &ib_mtu, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-#if IBLND_MAP_ON_DEMAND
-       {12, "fmr_pool_size", &fmr_pool_size, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-       {13, "fmr_flush_trigger", &fmr_flush_trigger, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-       {14, "fmr_cache", &fmr_cache, 
-        sizeof(int), 0444, NULL, &proc_dointvec},
-#endif
-       {0}
-};
-
-static ctl_table kiblnd_top_ctl_table[] = {
-       {203, "o2iblnd", NULL, 0, 0555, kiblnd_ctl_table},
-       {0}
-};
+static struct lnet_ioctl_config_o2iblnd_tunables default_tunables;
 
-void
-kiblnd_initstrtunable(char *space, char *str, int size)
+/* # messages/RDMAs in-flight */
+int
+kiblnd_msg_queue_size(int version, struct lnet_ni *ni)
 {
-        strncpy(space, str, size);
-        space[size-1] = 0;
+       if (version == IBLND_MSG_VERSION_1)
+               return IBLND_MSG_QUEUE_SIZE_V1;
+       else if (ni)
+               return ni->ni_net->net_tunables.lct_peer_tx_credits;
+       else
+               return peer_credits;
 }
 
-void
-kiblnd_sysctl_init (void)
+int
+kiblnd_tunables_setup(struct lnet_ni *ni)
 {
-        kiblnd_initstrtunable(ipif_basename_space, ipif_name,
-                              sizeof(ipif_basename_space));
+       struct lnet_ioctl_config_o2iblnd_tunables *tunables;
+       struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
 
-       kiblnd_tunables.kib_sysctl =
-               cfs_register_sysctl_table(kiblnd_top_ctl_table, 0);
+       /*
+        * if no tunables were specified, set the tunables up with their
+        * default values
+        */
+       if (!ni->ni_lnd_tunables_set)
+               memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib,
+                      &default_tunables, sizeof(*tunables));
 
-       if (kiblnd_tunables.kib_sysctl == NULL)
-               CWARN("Can't setup /proc tunables\n");
-}
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_o2ib;
 
-void
-kiblnd_sysctl_fini (void)
-{
-       if (kiblnd_tunables.kib_sysctl != NULL)
-               cfs_unregister_sysctl_table(kiblnd_tunables.kib_sysctl);
-}
+       /* Current API version */
+       tunables->lnd_version = CURRENT_LND_VERSION;
 
-#else
+       if (kiblnd_translate_mtu(*kiblnd_tunables.kib_ib_mtu) < 0) {
+               CERROR("Invalid ib_mtu %d, expected 256/512/1024/2048/4096\n",
+                      *kiblnd_tunables.kib_ib_mtu);
+               return -EINVAL;
+       }
 
-void
-kiblnd_sysctl_init (void)
-{
-}
+       net_tunables = &ni->ni_net->net_tunables;
 
-void
-kiblnd_sysctl_fini (void)
-{
-}
+       if (net_tunables->lct_peer_timeout == -1)
+               net_tunables->lct_peer_timeout = peer_timeout;
+
+       if (net_tunables->lct_max_tx_credits == -1)
+               net_tunables->lct_max_tx_credits = credits;
+
+       if (net_tunables->lct_peer_tx_credits == -1)
+               net_tunables->lct_peer_tx_credits = peer_credits;
+
+       if (net_tunables->lct_peer_rtr_credits == -1)
+               net_tunables->lct_peer_rtr_credits = peer_buffer_credits;
+
+       if (net_tunables->lct_peer_tx_credits < IBLND_CREDITS_DEFAULT)
+               net_tunables->lct_peer_tx_credits = IBLND_CREDITS_DEFAULT;
+
+       if (net_tunables->lct_peer_tx_credits > IBLND_CREDITS_MAX)
+               net_tunables->lct_peer_tx_credits = IBLND_CREDITS_MAX;
+
+       if (net_tunables->lct_peer_tx_credits >
+           net_tunables->lct_max_tx_credits)
+               net_tunables->lct_peer_tx_credits =
+                       net_tunables->lct_max_tx_credits;
 
+#ifndef HAVE_IB_GET_DMA_MR
+       /*
+        * For kernels which do not support global memory regions, always
+        * enable map_on_demand
+        */
+       if (tunables->lnd_map_on_demand == 0)
+               tunables->lnd_map_on_demand = 1;
 #endif
 
-int
-kiblnd_tunables_init (void)
-{
-        kiblnd_sysctl_init();
+       if (!tunables->lnd_peercredits_hiw)
+               tunables->lnd_peercredits_hiw = peer_credits_hiw;
+
+       if (tunables->lnd_peercredits_hiw < net_tunables->lct_peer_tx_credits / 2)
+               tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits / 2;
+
+       if (tunables->lnd_peercredits_hiw >= net_tunables->lct_peer_tx_credits)
+               tunables->lnd_peercredits_hiw = net_tunables->lct_peer_tx_credits - 1;
+
+       if (tunables->lnd_concurrent_sends == 0)
+                       tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits;
+
+       if (tunables->lnd_concurrent_sends > net_tunables->lct_peer_tx_credits * 2)
+               tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits * 2;
+
+       if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits / 2)
+               tunables->lnd_concurrent_sends = net_tunables->lct_peer_tx_credits / 2;
 
-        if (*kiblnd_tunables.kib_concurrent_sends > IBLND_RX_MSGS)
-                *kiblnd_tunables.kib_concurrent_sends = IBLND_RX_MSGS;
-        if (*kiblnd_tunables.kib_concurrent_sends < IBLND_MSG_QUEUE_SIZE)
-                *kiblnd_tunables.kib_concurrent_sends = IBLND_MSG_QUEUE_SIZE;
+       if (tunables->lnd_concurrent_sends < net_tunables->lct_peer_tx_credits) {
+               CWARN("Concurrent sends %d is lower than message "
+                     "queue size: %d, performance may drop slightly.\n",
+                     tunables->lnd_concurrent_sends,
+                     net_tunables->lct_peer_tx_credits);
+       }
+
+       if (!tunables->lnd_fmr_pool_size)
+               tunables->lnd_fmr_pool_size = fmr_pool_size;
+       if (!tunables->lnd_fmr_flush_trigger)
+               tunables->lnd_fmr_flush_trigger = fmr_flush_trigger;
+       if (!tunables->lnd_fmr_cache)
+               tunables->lnd_fmr_cache = fmr_cache;
+       if (!tunables->lnd_ntx)
+               tunables->lnd_ntx = ntx;
+       if (!tunables->lnd_conns_per_peer) {
+               tunables->lnd_conns_per_peer = (conns_per_peer) ?
+                       conns_per_peer : 1;
+       }
 
        return 0;
 }
 
-void
-kiblnd_tunables_fini (void)
+int
+kiblnd_tunables_init(void)
 {
-        kiblnd_sysctl_fini();
+       default_tunables.lnd_version = CURRENT_LND_VERSION;
+       default_tunables.lnd_peercredits_hiw = peer_credits_hiw;
+       default_tunables.lnd_map_on_demand = map_on_demand;
+       default_tunables.lnd_concurrent_sends = concurrent_sends;
+       default_tunables.lnd_fmr_pool_size = fmr_pool_size;
+       default_tunables.lnd_fmr_flush_trigger = fmr_flush_trigger;
+       default_tunables.lnd_fmr_cache = fmr_cache;
+       default_tunables.lnd_ntx = ntx;
+       default_tunables.lnd_conns_per_peer = conns_per_peer;
+       return 0;
 }
-
-
-