Whamcloud - gitweb
LU-17705 ptlrpc: replace synchronize_rcu() with rcu_barrier()
[fs/lustre-release.git] / lnet / klnds / socklnd / socklnd_modparams.c
index 6fa44f3..6cb4eab 100644 (file)
@@ -1,7 +1,7 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  *
- * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2011, 2012, Intel Corporation.
  *
  *   Author: Eric Barton <eric@bartonsoftware.com>
  *
 
 #include "socklnd.h"
 
-static int sock_timeout = 50;
-CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
-                "dead socket timeout (seconds)");
+#include <linux/kvm_host.h>
+#if defined(__x86_64__) || defined(__i386__)
+#include <asm/hypervisor.h>
+#endif
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+#include <linux/inetdevice.h>
+#include <linux/ethtool.h>
+#endif
+
+#define CURRENT_LND_VERSION 1
+
+/* No initializer, so sock_timeout starts at 0.
+ * NOTE(review): presumably 0 lets ksocknal_timeout() pick the effective
+ * timeout - confirm against its definition.
+ */
+static int sock_timeout;
+module_param(sock_timeout, int, 0644);
+MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
 
-static int credits = 256;
-CFS_MODULE_PARM(credits, "i", int, 0444,
-                "# concurrent sends");
+/* credits, peer_credits, peer_buffer_credits and peer_timeout are fallbacks:
+ * ksocknal_tunables_setup() copies each one into the network's common
+ * tunables only when the corresponding configured value is -1.
+ */
+static int credits = DEFAULT_CREDITS;
+module_param(credits, int, 0444);
+MODULE_PARM_DESC(credits, "# concurrent sends");
 
-static int peer_credits = 8;
-CFS_MODULE_PARM(peer_credits, "i", int, 0444,
-                "# concurrent sends to 1 peer");
+static int peer_credits = DEFAULT_PEER_CREDITS;
+module_param(peer_credits, int, 0444);
+MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
 
-static int peer_buffer_credits = 0;
-CFS_MODULE_PARM(peer_buffer_credits, "i", int, 0444,
-                "# per-peer router buffer credits");
+static int peer_buffer_credits;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
 
-static int peer_timeout = 0;
-CFS_MODULE_PARM(peer_timeout, "i", int, 0444,
-                "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+static int peer_timeout = DEFAULT_PEER_TIMEOUT;
+module_param(peer_timeout, int, 0444);
+MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of scheduler daemons in each thread pool; there is one pool per
+ * CPU partition (CPT). If it is not set, a reasonable value is estimated
+ * from the number of CPUs.
+ *
+ * The variable is unsigned, so it is registered as "uint": negative input
+ * is rejected instead of wrapping to a huge daemon count.
+ */
+static unsigned int nscheds;
+module_param(nscheds, uint, 0444);
+MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
 
 static int nconnds = 4;
-CFS_MODULE_PARM(nconnds, "i", int, 0444,
-                "# connection daemons");
+module_param(nconnds, int, 0444);
+MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
+
+/* Upper bound for the connection daemons; nconnds above is only the count
+ * used while starting (see the two descriptions).
+ */
+static int nconnds_max = 64;
+module_param(nconnds_max, int, 0444);
+MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
 
 static int min_reconnectms = 1000;
-CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
-                "min connection retry interval (mS)");
+module_param(min_reconnectms, int, 0644);
+MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
 
 static int max_reconnectms = 60000;
-CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
-                "max connection retry interval (mS)");
+module_param(max_reconnectms, int, 0644);
+MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
 
-#if defined(__APPLE__) && !defined(__DARWIN8__)
-# define DEFAULT_EAGER_ACK 1
-#else
-# define DEFAULT_EAGER_ACK 0
-#endif
-static int eager_ack = DEFAULT_EAGER_ACK;
-CFS_MODULE_PARM(eager_ack, "i", int, 0644,
-                "send tcp ack packets eagerly");
+/* The platform-specific default is gone: without an initializer eager_ack
+ * starts at 0.
+ */
+static int eager_ack;
+module_param(eager_ack, int, 0644);
+MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
 
 static int typed_conns = 1;
-CFS_MODULE_PARM(typed_conns, "i", int, 0444,
-                "use different sockets for bulk");
+module_param(typed_conns, int, 0444);
+MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
 
 static int min_bulk = (1<<10);
-CFS_MODULE_PARM(min_bulk, "i", int, 0644,
-                "smallest 'large' message");
-
-#ifdef __APPLE__
-# ifdef __DARWIN8__
-#  define DEFAULT_BUFFER_SIZE (224*1024)
-# else
-#  define DEFAULT_BUFFER_SIZE (1152 * 1024)
-# endif
-#else
+module_param(min_bulk, int, 0644);
+MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
+
+/* Shared default for tx_buffer_size and rx_buffer_size; per their
+ * descriptions 0 means "use the system default".
+ */
 # define DEFAULT_BUFFER_SIZE 0
-#endif
 static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
-CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
-                "socket tx buffer size (0 for system default)");
+module_param(tx_buffer_size, int, 0644);
+MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
 
 static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
-CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
-                "socket rx buffer size (0 for system default)");
+module_param(rx_buffer_size, int, 0644);
+MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
 
 static int nagle = 0;
-CFS_MODULE_PARM(nagle, "i", int, 0644,
-                "enable NAGLE?");
+module_param(nagle, int, 0644);
+MODULE_PARM_DESC(nagle, "enable NAGLE?");
 
 static int round_robin = 1;
-CFS_MODULE_PARM(round_robin, "i", int, 0644,
-                "Round robin for multiple interfaces");
+module_param(round_robin, int, 0644);
+MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
 
 static int keepalive = 30;
-CFS_MODULE_PARM(keepalive, "i", int, 0644,
-                "# seconds before send keepalive");
+module_param(keepalive, int, 0644);
+MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
 
+/* Per the descriptions, the three keepalive_* values below are: idle
+ * seconds before the first probe, missed probes before the peer counts as
+ * dead, and seconds between probes.
+ */
 static int keepalive_idle = 30;
-CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
-                "# idle seconds before probe");
+module_param(keepalive_idle, int, 0644);
+MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
 
-#ifdef HAVE_BGL_SUPPORT
-#define DEFAULT_KEEPALIVE_COUNT  100
-#else
 #define DEFAULT_KEEPALIVE_COUNT  5
-#endif
 static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
-CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
-                "# missed probes == dead");
+module_param(keepalive_count, int, 0644);
+MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
 
 static int keepalive_intvl = 5;
-CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
-                "seconds between probes");
+module_param(keepalive_intvl, int, 0644);
+MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
 
 static int enable_csum = 0;
-CFS_MODULE_PARM(enable_csum, "i", int, 0644,
-                "enable check sum");
+module_param(enable_csum, int, 0644);
+MODULE_PARM_DESC(enable_csum, "enable check sum");
 
 static int inject_csum_error = 0;
-CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
-                "set non-zero to inject a checksum error");
-#ifdef CPU_AFFINITY
+module_param(inject_csum_error, int, 0644);
+MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
+
+/* Now registered unconditionally (the CPU_AFFINITY guard is removed), but
+ * ksocknal_tunables_init() only logs a warning when it is non-zero, saying
+ * that irq_affinity was removed from socklnd.
+ */
 static int enable_irq_affinity = 0;
-CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644,
-                "enable IRQ affinity");
-#endif
+module_param(enable_irq_affinity, int, 0644);
+MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");
+
+static int nonblk_zcack = 1;
+module_param(nonblk_zcack, int, 0644);
+MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
 
+/* Zero-copy tunables. All three variables are unsigned, so they are
+ * registered as "uint" (like conns_per_peer): negative input is rejected
+ * instead of wrapping to a huge value.
+ * ksocknal_tunables_init() raises zc_min_payload to at least 2KiB, and to
+ * just over 16MiB when not running on a native host.
+ */
 static unsigned int zc_min_payload = (16 << 10);
-CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
-                "minimum payload size to zero copy");
+module_param(zc_min_payload, uint, 0644);
+MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
 
 static unsigned int zc_recv = 0;
-CFS_MODULE_PARM(zc_recv, "i", int, 0644,
-                "enable ZC recv for Chelsio driver");
+module_param(zc_recv, uint, 0644);
+MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
 
 static unsigned int zc_recv_min_nfrags = 16;
-CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
-                "minimum # of fragments to enable ZC recv");
+module_param(zc_recv_min_nfrags, uint, 0644);
+MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
+
+/* Module-wide default for the per-NI conns_per_peer tunable.
+ * ksocknal_tunables_init() copies it into ksock_default_tunables and warns
+ * when it exceeds (1 << SOCKNAL_CONN_COUNT_MAX_BITS) - 1. When an NI ends up
+ * with 0, ksocknal_tunables_setup() derives the value via
+ * ksocklnd_lookup_conns_per_peer().
+ */
+static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
+module_param(conns_per_peer, uint, 0644);
+MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
+
+/* By default skip_mr_route_setup is 0 (do not skip) */
+static unsigned int skip_mr_route_setup;
+module_param(skip_mr_route_setup, uint, 0444);
+MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");
 
+/* Optional tunables: the two groups below are compiled in, and published by
+ * ksocknal_tunables_init(), only under their preprocessor guards.
+ */
 #ifdef SOCKNAL_BACKOFF
 static int backoff_init = 3;
-CFS_MODULE_PARM(backoff_init, "i", int, 0644,
-                "seconds for initial tcp backoff");
+module_param(backoff_init, int, 0644);
+MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff");
 
 static int backoff_max = 3;
-CFS_MODULE_PARM(backoff_max, "i", int, 0644,
-                "seconds for maximum tcp backoff");
+module_param(backoff_max, int, 0644);
+MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff");
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
 static int protocol = 3;
-CFS_MODULE_PARM(protocol, "i", int, 0644,
-                "protocol version");
+module_param(protocol, int, 0644);
+MODULE_PARM_DESC(protocol, "protocol version");
 #endif
 
-ksock_tunables_t ksocknal_tunables;
+/* Type-of-service module parameter; -1 disables it (see the description).
+ *
+ * It uses a custom setter, param_set_tos(), which only accepts -1..0xff.
+ * With HAVE_KERNEL_PARAM_OPS the parameter is registered under the custom
+ * type name "tos", which is why param_ops_tos and param_check_tos are
+ * defined here; otherwise module_param_call() wires the setter directly.
+ */
+static int tos = -1;
+static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static const struct kernel_param_ops param_ops_tos = {
+       .set = param_set_tos,
+       .get = param_get_int,
+};
 
-int ksocknal_tunables_init(void)
+#define param_check_tos(name, p) \
+       __param_check(name, p, int)
+module_param(tos, tos, 0444);
+#else
+module_param_call(tos, param_set_tos, param_get_int, &tos, 0444);
+#endif
+MODULE_PARM_DESC(tos, "Set the type of service (=-1 to disable)");
+
+/* Report whether the kernel runs natively rather than under a hypervisor.
+ *
+ * Uses hypervisor_is_type(X86_HYPER_NATIVE) when available, falls back to
+ * checking x86_hyper on x86 builds, and reports true on every other
+ * architecture. ksocknal_tunables_init() uses the result to push
+ * zc_min_payload above the maximum payload size on non-native hosts.
+ */
+static inline bool is_native_host(void)
+{
+#ifdef HAVE_HYPERVISOR_IS_TYPE
+       return hypervisor_is_type(X86_HYPER_NATIVE);
+#elif defined(__x86_64__) || defined(__i386__)
+       return x86_hyper == NULL;
+#else
+       return true;
+#endif
+}
+
+struct ksock_tunables ksocknal_tunables;
+struct lnet_ioctl_config_socklnd_tunables ksock_default_tunables;
+
+/* Setter for the "tos" module parameter.
+ *
+ * Parses @val with kstrtoint() (base auto-detected) and stores it in the
+ * int that @kp->arg points to.
+ *
+ * Returns 0 on success, -EINVAL when @val is NULL, the kstrtoint() error
+ * when parsing fails, or -ERANGE when the value is outside -1..0xff.
+ */
+static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp)
+{
+       int parsed;
+       int err;
+
+       if (val == NULL)
+               return -EINVAL;
+
+       err = kstrtoint(val, 0, &parsed);
+       if (err != 0)
+               return err;
+
+       /* Accept only -1 (disabled) up to the largest 8-bit value */
+       if (parsed >= -1 && parsed <= 0xff) {
+               int *slot = (int *)kp->arg;
+
+               *slot = parsed;
+               return 0;
+       }
+
+       return -ERANGE;
+}
+
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+/* Look up the link speed of the network device behind ni->ni_interface.
+ *
+ * Under rtnl_lock(), walks the devices of ni->ni_net_ns that are up and not
+ * loopback, compares ni->ni_interface with the label of each address on the
+ * device, and queries the ethtool link settings of the matching device.
+ *
+ * Returns 0 when the NI has no namespace or interface name, -1 when no
+ * device matched, the error from __ethtool_get_link_ksettings() when the
+ * query fails, and cmd.base.speed (converted to int) otherwise.
+ */
+static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni)
+{
+       struct net_device *dev;
+       int intf_idx = -1;
+       int ret = -1;
+
+       DECLARE_CONST_IN_IFADDR(ifa);
+
+       /* check if ni has interface assigned */
+       if (!ni->ni_net_ns || !ni->ni_interface)
+               return 0;
+
+       rtnl_lock();
+       for_each_netdev(ni->ni_net_ns, dev) {
+               int flags = dev_get_flags(dev);
+               struct in_device *in_dev;
+
+               if (flags & IFF_LOOPBACK) /* skip the loopback IF */
+                       continue;
+
+               if (!(flags & IFF_UP))
+                       continue;
+
+               in_dev = __in_dev_get_rtnl(dev);
+               if (!in_dev)
+                       continue;
+
+               /* No early exit: every address label on the device is compared */
+               in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+                       if (strcmp(ifa->ifa_label, ni->ni_interface) == 0)
+                               intf_idx = dev->ifindex;
+               }
+               endfor_ifa(in_dev);
+
+               if (intf_idx >= 0)
+                       break;
+       }
+       /* A match left the loop via break, so dev is still the matching device */
+       if (intf_idx >= 0) {
+               struct ethtool_link_ksettings cmd;
+               int ethtool_ret;
+
+               /* Some devices may not be providing link settings */
+               ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd);
+               if (!ethtool_ret)
+                       ret = cmd.base.speed;
+               else
+                       ret = ethtool_ret;
+       }
+       rtnl_unlock();
+
+       return ret;
+}
+
+/* Map an ethernet interface speed (Mbps) to a heuristically optimal
+ * conns_per_peer value: ilog2(speed in Gbps) / 2 + 1.
+ */
+static int ksocklnd_speed2cpp(int speed)
 {
+       int gbps;
+
+       /* Anything below 1Gbps counts as 1Gbps, so ilog2() never sees 0 */
+       gbps = (speed < 1000 ? 1000 : speed) / 1000;
+
+       return 1 + ilog2(gbps) / 2;
+}
+#endif
+
+/* Choose a conns_per_peer value for @ni; ksocknal_tunables_setup() calls
+ * this when the NI's own value is 0.
+ *
+ * Returns 1 by default. When built with HAVE_ETHTOOL_LINK_SETTINGS and the
+ * interface reports a positive speed, returns ksocklnd_speed2cpp() of that
+ * speed instead.
+ */
+static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni)
+{
+       int cpp = 1;
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+       int speed = ksocklnd_ni_get_eth_intf_speed(ni);
 
-        /* initialize ksocknal_tunables structure */
-        ksocknal_tunables.ksnd_timeout            = &sock_timeout;
-        ksocknal_tunables.ksnd_nconnds            = &nconnds;
-        ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
-        ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
-        ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
-        ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
-        ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
-        ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
-        ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
-        ksocknal_tunables.ksnd_nagle              = &nagle;
-        ksocknal_tunables.ksnd_round_robin        = &round_robin;
-        ksocknal_tunables.ksnd_keepalive          = &keepalive;
-        ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
-        ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
-        ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
-        ksocknal_tunables.ksnd_credits            = &credits;
-        ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
-        ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
-        ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
-        ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
-        ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
-        ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
-        ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
-        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
-
-#ifdef CPU_AFFINITY
-        ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
+       if (ni->ni_interface)
+               CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface, speed);
+
+       /* Zero (no interface) and negative (lookup failed) keep the default */
+       if (speed > 0)
+               cpp = ksocklnd_speed2cpp(speed);
 #endif
+       return cpp;
+}
+
+/* Publish the module parameters through ksocknal_tunables and seed
+ * ksock_default_tunables.
+ *
+ * Two values are sanitised on the way: conns_per_peer is clamped to the
+ * largest value SOCKNAL_CONN_COUNT_MAX_BITS bits can hold, and
+ * zc_min_payload is raised to at least 2KiB (or to just over 16MiB when not
+ * running on a native host).
+ *
+ * Returns 0; there is no failure path.
+ */
+int ksocknal_tunables_init(void)
+{
+       ksock_default_tunables.lnd_version = CURRENT_LND_VERSION;
+       /* NOTE(review): this copy is taken before conns_per_peer is clamped
+        * below, so the default tunables keep the unclamped value - confirm
+        * whether that is intended.
+        */
+       ksock_default_tunables.lnd_conns_per_peer = conns_per_peer;
+       ksock_default_tunables.lnd_tos = tos;
+
+       /* initialize ksocknal_tunables structure */
+       ksocknal_tunables.ksnd_timeout            = &sock_timeout;
+       ksocknal_tunables.ksnd_nscheds            = &nscheds;
+       ksocknal_tunables.ksnd_nconnds            = &nconnds;
+       ksocknal_tunables.ksnd_nconnds_max        = &nconnds_max;
+       ksocknal_tunables.ksnd_min_reconnectms    = &min_reconnectms;
+       ksocknal_tunables.ksnd_max_reconnectms    = &max_reconnectms;
+       ksocknal_tunables.ksnd_eager_ack          = &eager_ack;
+       ksocknal_tunables.ksnd_typed_conns        = &typed_conns;
+       ksocknal_tunables.ksnd_min_bulk           = &min_bulk;
+       ksocknal_tunables.ksnd_tx_buffer_size     = &tx_buffer_size;
+       ksocknal_tunables.ksnd_rx_buffer_size     = &rx_buffer_size;
+       ksocknal_tunables.ksnd_nagle              = &nagle;
+       ksocknal_tunables.ksnd_round_robin        = &round_robin;
+       ksocknal_tunables.ksnd_keepalive          = &keepalive;
+       ksocknal_tunables.ksnd_keepalive_idle     = &keepalive_idle;
+       ksocknal_tunables.ksnd_keepalive_count    = &keepalive_count;
+       ksocknal_tunables.ksnd_keepalive_intvl    = &keepalive_intvl;
+       ksocknal_tunables.ksnd_credits            = &credits;
+       ksocknal_tunables.ksnd_peertxcredits      = &peer_credits;
+       ksocknal_tunables.ksnd_peerrtrcredits     = &peer_buffer_credits;
+       ksocknal_tunables.ksnd_peertimeout        = &peer_timeout;
+       ksocknal_tunables.ksnd_enable_csum        = &enable_csum;
+       ksocknal_tunables.ksnd_inject_csum_error  = &inject_csum_error;
+       ksocknal_tunables.ksnd_nonblk_zcack       = &nonblk_zcack;
+       ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
+       ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
+       ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+       if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
+               CWARN("socklnd conns_per_peer is capped at %u.\n",
+                     (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
+               /* Actually apply the cap the warning announces */
+               conns_per_peer = (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1;
+       }
+       ksocknal_tunables.ksnd_conns_per_peer     = &conns_per_peer;
+
+       /* The parameter is still published, but setting it only warns */
+       if (enable_irq_affinity) {
+               CWARN("irq_affinity is removed from socklnd because modern "
+                     "computer always has fast CPUs and more cores than "
+                     "# NICs, although you still can set irq_affinity by "
+                     "another way, please check manual for details.\n");
+       }
+       ksocknal_tunables.ksnd_irq_affinity       = &enable_irq_affinity;
 
 #ifdef SOCKNAL_BACKOFF
-        ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
-        ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
+       ksocknal_tunables.ksnd_backoff_init       = &backoff_init;
+       ksocknal_tunables.ksnd_backoff_max        = &backoff_max;
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
-        ksocknal_tunables.ksnd_protocol           = &protocol;
+       ksocknal_tunables.ksnd_protocol           = &protocol;
 #endif
 
-#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-        ksocknal_tunables.ksnd_sysctl             =  NULL;
-#endif
+       /* Enforce a 2KiB floor on the zero-copy threshold */
+       if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+               *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
 
-        if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
-                *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+       /* When on a hypervisor set the minimum zero copy size
+        * above the maximum payload size
+        */
+       if (!is_native_host())
+               *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
 
-        /* initialize platform-sepcific tunables */
-        return ksocknal_lib_tunables_init();
-};
+       return 0;
+}
 
-void ksocknal_tunables_fini(void)
+/* Fill in the socklnd-specific and common network tunables of @ni.
+ *
+ * If the NI carries no LND tunables (ni_lnd_tunables_set is clear), the
+ * module-wide defaults are copied in first. Common tunables still at -1
+ * then fall back to the matching module parameter, peer tx credits are
+ * limited to the max tx credits, a zero conns_per_peer is derived via
+ * ksocklnd_lookup_conns_per_peer(), and a negative tos takes the module
+ * parameter value.
+ */
+void ksocknal_tunables_setup(struct lnet_ni *ni)
 {
-        ksocknal_lib_tunables_fini();
+       struct lnet_ioctl_config_socklnd_tunables *tunables;
+       struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
+
+       /* If no tunables specified, setup default tunables */
+       if (!ni->ni_lnd_tunables_set)
+               memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock,
+                      &ksock_default_tunables, sizeof(*tunables));
+
+       tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock;
+
+       /* Current API version */
+       tunables->lnd_version = CURRENT_LND_VERSION;
+
+       net_tunables = &ni->ni_net->net_tunables;
+
+       /* In the common tunables -1 means "not configured" */
+       if (net_tunables->lct_peer_timeout == -1)
+               net_tunables->lct_peer_timeout =
+                       *ksocknal_tunables.ksnd_peertimeout;
+
+       if (net_tunables->lct_max_tx_credits == -1)
+               net_tunables->lct_max_tx_credits =
+                       *ksocknal_tunables.ksnd_credits;
+
+       if (net_tunables->lct_peer_tx_credits == -1)
+               net_tunables->lct_peer_tx_credits =
+                       *ksocknal_tunables.ksnd_peertxcredits;
+
+       /* A single peer may not hold more tx credits than the NI has */
+       if (net_tunables->lct_peer_tx_credits >
+           net_tunables->lct_max_tx_credits)
+               net_tunables->lct_peer_tx_credits =
+                       net_tunables->lct_max_tx_credits;
+
+       if (net_tunables->lct_peer_rtr_credits == -1)
+               net_tunables->lct_peer_rtr_credits =
+                       *ksocknal_tunables.ksnd_peerrtrcredits;
+
+       if (!tunables->lnd_conns_per_peer)
+               tunables->lnd_conns_per_peer =
+                       ksocklnd_lookup_conns_per_peer(ni);
+
+       if (tunables->lnd_tos < 0)
+               tunables->lnd_tos = tos;
+
+       tunables->lnd_timeout = ksocknal_timeout();
 }