#include "socklnd.h"
-static int sock_timeout = 50;
+#include <linux/kvm_host.h>
+#if defined(__x86_64__) || defined(__i386__)
+#include <asm/hypervisor.h>
+#endif
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+#include <linux/inetdevice.h>
+#include <linux/ethtool.h>
+#endif
+
+#define CURRENT_LND_VERSION 1
+
+static int sock_timeout;
module_param(sock_timeout, int, 0644);
MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
-static int credits = 256;
+static int credits = DEFAULT_CREDITS;
module_param(credits, int, 0444);
MODULE_PARM_DESC(credits, "# concurrent sends");
-static int peer_credits = 8;
+static int peer_credits = DEFAULT_PEER_CREDITS;
module_param(peer_credits, int, 0444);
MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
module_param(peer_buffer_credits, int, 0444);
MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
-static int peer_timeout = 180;
+static int peer_timeout = DEFAULT_PEER_TIMEOUT;
module_param(peer_timeout, int, 0444);
MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
module_param(inject_csum_error, int, 0644);
MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
-#ifdef CPU_AFFINITY
static int enable_irq_affinity = 0;
module_param(enable_irq_affinity, int, 0644);
MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");
-#endif
static int nonblk_zcack = 1;
module_param(nonblk_zcack, int, 0644);
module_param(zc_recv_min_nfrags, int, 0644);
MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
+static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER; /* copied into ksock_default_tunables by ksocknal_tunables_init(); a per-NI value of 0 makes ksocknal_tunables_setup() derive one from the link speed */
+module_param(conns_per_peer, uint, 0644);
+MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
+
+/* By default skip_mr_route_setup is 0 (do not skip): the variable has
+ * static storage and no initialiser, so it starts out as zero.
+ */
+static unsigned int skip_mr_route_setup;
+module_param(skip_mr_route_setup, uint, 0444);
+MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");
+
#ifdef SOCKNAL_BACKOFF
static int backoff_init = 3;
module_param(backoff_init, int, 0644);
MODULE_PARM_DESC(protocol, "protocol version");
#endif
-ksock_tunables_t ksocknal_tunables;
+static int tos = -1; /* -1 disables; param_set_tos() accepts only -1..0xff */
+static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static const struct kernel_param_ops param_ops_tos = {
+ .set = param_set_tos, /* custom setter: range-checks before storing */
+ .get = param_get_int, /* stock integer getter */
+};
+
+#define param_check_tos(name, p) \
+ __param_check(name, p, int)
+module_param(tos, tos, 0444); /* the second "tos" is the parameter type: it selects param_ops_tos and param_check_tos defined just above */
+#else
+module_param_call(tos, param_set_tos, param_get_int, &tos, 0444); /* without HAVE_KERNEL_PARAM_OPS the setter/getter pair is registered directly */
+#endif
+MODULE_PARM_DESC(tos, "Set the type of service (=-1 to disable)");
+
+/*
+ * Report whether the kernel considers itself to be running natively
+ * rather than under a hypervisor.
+ *
+ * Uses hypervisor_is_type() when the build provides it, falls back to
+ * testing x86_hyper on x86 targets, and otherwise assumes a native host.
+ */
+static inline bool is_native_host(void)
+{
+	bool native = true;
+
+#if defined(HAVE_HYPERVISOR_IS_TYPE)
+	native = hypervisor_is_type(X86_HYPER_NATIVE);
+#elif defined(__x86_64__) || defined(__i386__)
+	native = !x86_hyper;
+#endif
+	return native;
+}
+
+struct ksock_tunables ksocknal_tunables; /* pointers to the module-parameter variables; filled in by ksocknal_tunables_init() */
+struct lnet_ioctl_config_socklnd_tunables ksock_default_tunables; /* defaults that ksocknal_tunables_setup() copies into an NI with no tunables set */
+
+/*
+ * Setter for the "tos" module parameter.
+ *
+ * Parses val with kstrtoint() (base 0) and stores the result through
+ * kp->arg only if it lies in the range -1..0xff.
+ *
+ * Returns 0 on success, -EINVAL when val is NULL, the kstrtoint() error
+ * when parsing fails, and -ERANGE when the value is out of range.
+ */
+static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	int parsed;
+	int err;
+
+	if (val == NULL)
+		return -EINVAL;
+
+	err = kstrtoint(val, 0, &parsed);
+	if (err != 0)
+		return err;
+
+	/* -1 is the "disabled" marker, 0..0xff are the usable values */
+	if (!(parsed >= -1 && parsed <= 0xff))
+		return -ERANGE;
+
+	*(int *)kp->arg = parsed;
+	return 0;
+}
+
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+/*
+ * Find the network device in ni's namespace that is up, is not the
+ * loopback, and carries an address labelled ni->ni_interface, then ask
+ * ethtool for its link settings.
+ *
+ * Returns 0 when ni has no namespace or no interface assigned, -1 when
+ * no device matches, the error from __ethtool_get_link_ksettings() when
+ * that query fails, and otherwise the speed it reported.
+ */
+static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni)
+{
+	struct net_device *dev;
+	int ifindex = -1;
+	int speed = -1;
+
+	DECLARE_CONST_IN_IFADDR(ifa);
+
+	/* nothing to look up unless ni has both a namespace and an interface */
+	if (!(ni->ni_net_ns && ni->ni_interface))
+		return 0;
+
+	/* RTNL is held across the device walk and the ethtool query */
+	rtnl_lock();
+	for_each_netdev(ni->ni_net_ns, dev) {
+		struct in_device *in_dev;
+		int flags = dev_get_flags(dev);
+
+		/* skip the loopback IF and anything that is not up */
+		if ((flags & IFF_LOOPBACK) || !(flags & IFF_UP))
+			continue;
+
+		in_dev = __in_dev_get_rtnl(dev);
+		if (in_dev == NULL)
+			continue;
+
+		in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+			if (!strcmp(ifa->ifa_label, ni->ni_interface))
+				ifindex = dev->ifindex;
+		}
+		endfor_ifa(in_dev);
+
+		if (ifindex >= 0)
+			break;
+	}
+
+	/* the loop was left via break, so dev is still the matching device */
+	if (ifindex >= 0) {
+		struct ethtool_link_ksettings settings;
+		int rc = __ethtool_get_link_ksettings(dev, &settings);
+
+		/* Some devices may not be providing link settings */
+		if (rc)
+			speed = rc;
+		else
+			speed = settings.base.speed;
+	}
+	rtnl_unlock();
+
+	return speed;
+}
+
+/*
+ * Pick a heuristic conns_per_peer value for an ethernet interface speed
+ * given in Mbps.  Anything below 1Gbps is treated as 1Gbps so that
+ * ilog2() is never called with 0.
+ *
+ * For example 1Gbps yields 1, 10Gbps yields 2 and 100Gbps yields 4.
+ */
+static int ksocklnd_speed2cpp(int speed)
+{
+	int gbps = (speed < 1000 ? 1000 : speed) / 1000;
+
+	return 1 + ilog2(gbps) / 2;
+}
+#endif
+
+/*
+ * Choose a conns_per_peer value for ni.
+ *
+ * With HAVE_ETHTOOL_LINK_SETTINGS the interface speed is looked up,
+ * logged when ni has an interface name, and converted with
+ * ksocklnd_speed2cpp() if it is positive.  In every other case the
+ * result is 1.
+ */
+static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni)
+{
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+	int speed = ksocklnd_ni_get_eth_intf_speed(ni);
+
+	if (ni->ni_interface)
+		CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface, speed);
+
+	/* zero and negative results carry no usable speed */
+	if (speed > 0)
+		return ksocklnd_speed2cpp(speed);
+#endif
+	return 1;
+}
int ksocknal_tunables_init(void)
{
+ ksock_default_tunables.lnd_version = CURRENT_LND_VERSION;
+ ksock_default_tunables.lnd_conns_per_peer = conns_per_peer;
+ ksock_default_tunables.lnd_tos = tos;
- /* initialize ksocknal_tunables structure */
- ksocknal_tunables.ksnd_timeout = &sock_timeout;
+ /* initialize ksocknal_tunables structure
+  *
+  * The three assignments above copy the current module-parameter
+  * values into ksock_default_tunables.  From here on, ksocknal_tunables
+  * is filled with pointers to the module-parameter variables instead
+  * of copies.  This function has no failure path and always returns 0.
+  */
+ ksocknal_tunables.ksnd_timeout = &sock_timeout;
ksocknal_tunables.ksnd_nscheds = &nscheds;
- ksocknal_tunables.ksnd_nconnds = &nconnds;
- ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
- ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
- ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
- ksocknal_tunables.ksnd_eager_ack = &eager_ack;
- ksocknal_tunables.ksnd_typed_conns = &typed_conns;
- ksocknal_tunables.ksnd_min_bulk = &min_bulk;
- ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
- ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
- ksocknal_tunables.ksnd_nagle = &nagle;
- ksocknal_tunables.ksnd_round_robin = &round_robin;
- ksocknal_tunables.ksnd_keepalive = &keepalive;
- ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
- ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
- ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
- ksocknal_tunables.ksnd_credits = &credits;
- ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
- ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
- ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
- ksocknal_tunables.ksnd_enable_csum = &enable_csum;
- ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
- ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
- ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
- ksocknal_tunables.ksnd_zc_recv = &zc_recv;
- ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
-
-#ifdef CPU_AFFINITY
+ ksocknal_tunables.ksnd_nconnds = &nconnds;
+ ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
+ ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
+ ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
+ ksocknal_tunables.ksnd_eager_ack = &eager_ack;
+ ksocknal_tunables.ksnd_typed_conns = &typed_conns;
+ ksocknal_tunables.ksnd_min_bulk = &min_bulk;
+ ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
+ ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
+ ksocknal_tunables.ksnd_nagle = &nagle;
+ ksocknal_tunables.ksnd_round_robin = &round_robin;
+ ksocknal_tunables.ksnd_keepalive = &keepalive;
+ ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
+ ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
+ ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
+ ksocknal_tunables.ksnd_credits = &credits;
+ ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
+ ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
+ ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
+ ksocknal_tunables.ksnd_enable_csum = &enable_csum;
+ ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
+ ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
+ ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
+ ksocknal_tunables.ksnd_zc_recv = &zc_recv;
+ ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+ if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
+ CWARN("socklnd conns_per_peer is capped at %u.\n",
+ (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
+ } /* NOTE(review): this branch only warns.  conns_per_peer is not
+    * clamped anywhere in this function, and ksock_default_tunables
+    * already holds the unclamped value copied at the top -- confirm
+    * that the cap is enforced where the value is consumed.
+    */
+ ksocknal_tunables.ksnd_conns_per_peer = &conns_per_peer;
+
if (enable_irq_affinity) {
CWARN("irq_affinity is removed from socklnd because modern "
"computer always has fast CPUs and more cores than "
"# NICs, although you still can set irq_affinity by "
"another way, please check manual for details.\n");
}
- ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
-#endif
+ ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
#ifdef SOCKNAL_BACKOFF
- ksocknal_tunables.ksnd_backoff_init = &backoff_init;
- ksocknal_tunables.ksnd_backoff_max = &backoff_max;
+ ksocknal_tunables.ksnd_backoff_init = &backoff_init;
+ ksocknal_tunables.ksnd_backoff_max = &backoff_max;
#endif
#if SOCKNAL_VERSION_DEBUG
- ksocknal_tunables.ksnd_protocol = &protocol;
+ ksocknal_tunables.ksnd_protocol = &protocol;
#endif
- if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
- *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+ if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+ *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10); /* enforce a floor of 2 KiB */
+
+ /* When on a hypervisor set the minimum zero copy size
+  * above the maximum payload size: (16 << 20) + 1 bytes, i.e. one
+  * byte over 16 MiB.  This replaces whatever the 2 KiB floor just
+  * above left in place.
+  * NOTE(review): presumably this keeps zero-copy from ever being
+  * selected on guests -- confirm against the users of zc_min_payload.
+  */
+ if (!is_native_host())
+ *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
return 0;
-};
+}
+
+/*
+ * Complete the socklnd-specific and common LND tunables of ni.
+ *
+ * If ni has no LND tunables set, the socklnd tunables are first copied
+ * from ksock_default_tunables.  Common tunables that are still -1 are
+ * then taken from the module parameters, peer tx credits are limited to
+ * the max tx credits, a zero conns_per_peer is replaced by a looked-up
+ * value and a negative tos by the module parameter.  lnd_version and
+ * lnd_timeout are always overwritten.
+ */
+void ksocknal_tunables_setup(struct lnet_ni *ni)
+{
+	struct lnet_ioctl_config_socklnd_tunables *sock_tun =
+		&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock;
+	struct lnet_ioctl_config_lnd_cmn_tunables *cmn_tun =
+		&ni->ni_net->net_tunables;
+
+	/* If no tunables specified, setup default tunables */
+	if (!ni->ni_lnd_tunables_set)
+		memcpy(sock_tun, &ksock_default_tunables, sizeof(*sock_tun));
+
+	/* Current API version */
+	sock_tun->lnd_version = CURRENT_LND_VERSION;
+
+	/* a common tunable left at -1 falls back to the module parameter */
+	if (cmn_tun->lct_peer_timeout == -1)
+		cmn_tun->lct_peer_timeout = *ksocknal_tunables.ksnd_peertimeout;
+
+	if (cmn_tun->lct_max_tx_credits == -1)
+		cmn_tun->lct_max_tx_credits = *ksocknal_tunables.ksnd_credits;
+
+	if (cmn_tun->lct_peer_tx_credits == -1)
+		cmn_tun->lct_peer_tx_credits =
+			*ksocknal_tunables.ksnd_peertxcredits;
+
+	/* a single peer may not be given more than the total tx credits */
+	if (cmn_tun->lct_peer_tx_credits > cmn_tun->lct_max_tx_credits)
+		cmn_tun->lct_peer_tx_credits = cmn_tun->lct_max_tx_credits;
+
+	if (cmn_tun->lct_peer_rtr_credits == -1)
+		cmn_tun->lct_peer_rtr_credits =
+			*ksocknal_tunables.ksnd_peerrtrcredits;
+
+	/* zero conns_per_peer: look a value up for this NI */
+	if (sock_tun->lnd_conns_per_peer == 0)
+		sock_tun->lnd_conns_per_peer =
+			ksocklnd_lookup_conns_per_peer(ni);
+
+	if (sock_tun->lnd_tos < 0)
+		sock_tun->lnd_tos = tos;
+
+	sock_tun->lnd_timeout = ksocknal_timeout();
+}