X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fsocklnd%2Fsocklnd_modparams.c;h=967575acd90c51e70d466bbe9d70e1c800e36ab2;hp=d508509e5e5316f70205d78859e8b7ad99fa0398;hb=HEAD;hpb=0a9c9e444635dcf35a74bfb2f46efb3040ca17a0

diff --git a/lnet/klnds/socklnd/socklnd_modparams.c b/lnet/klnds/socklnd/socklnd_modparams.c
index d508509..6cb4eab 100644
--- a/lnet/klnds/socklnd/socklnd_modparams.c
+++ b/lnet/klnds/socklnd/socklnd_modparams.c
@@ -1,7 +1,7 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
  *
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Copyright (c) 2011, 2012, Intel Corporation.
  *
  * Author: Eric Barton
  *
@@ -21,188 +21,401 @@
 #include "socklnd.h"
 
-static int sock_timeout = 50;
-CFS_MODULE_PARM(sock_timeout, "i", int, 0644,
-                "dead socket timeout (seconds)");
+#include
+#if defined(__x86_64__) || defined(__i386__)
+#include
+#endif
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+#include
+#include
+#endif
+
+#define CURRENT_LND_VERSION 1
+
+static int sock_timeout;
+module_param(sock_timeout, int, 0644);
+MODULE_PARM_DESC(sock_timeout, "dead socket timeout (seconds)");
+
+static int credits = DEFAULT_CREDITS;
+module_param(credits, int, 0444);
+MODULE_PARM_DESC(credits, "# concurrent sends");
 
-static int credits = 256;
-CFS_MODULE_PARM(credits, "i", int, 0444,
-                "# concurrent sends");
+static int peer_credits = DEFAULT_PEER_CREDITS;
+module_param(peer_credits, int, 0444);
+MODULE_PARM_DESC(peer_credits, "# concurrent sends to 1 peer");
 
-static int peer_credits = 8;
-CFS_MODULE_PARM(peer_credits, "i", int, 0444,
-                "# concurrent sends to 1 peer");
+static int peer_buffer_credits;
+module_param(peer_buffer_credits, int, 0444);
+MODULE_PARM_DESC(peer_buffer_credits, "# per-peer router buffer credits");
+
+static int peer_timeout = DEFAULT_PEER_TIMEOUT;
+module_param(peer_timeout, int, 0444);
+MODULE_PARM_DESC(peer_timeout, "Seconds without aliveness news to declare peer dead (<=0 to disable)");
+
+/* Number of daemons in each thread pool which is percpt,
+ * we will estimate reasonable value based on CPUs if it's not set.
+ */
+static unsigned int nscheds;
+module_param(nscheds, int, 0444);
+MODULE_PARM_DESC(nscheds, "# scheduler daemons in each pool while starting");
 
 static int nconnds = 4;
-CFS_MODULE_PARM(nconnds, "i", int, 0444,
-                "# connection daemons");
+module_param(nconnds, int, 0444);
+MODULE_PARM_DESC(nconnds, "# connection daemons while starting");
+
+static int nconnds_max = 64;
+module_param(nconnds_max, int, 0444);
+MODULE_PARM_DESC(nconnds_max, "max # connection daemons");
 
 static int min_reconnectms = 1000;
-CFS_MODULE_PARM(min_reconnectms, "i", int, 0644,
-                "min connection retry interval (mS)");
+module_param(min_reconnectms, int, 0644);
+MODULE_PARM_DESC(min_reconnectms, "min connection retry interval (mS)");
 
 static int max_reconnectms = 60000;
-CFS_MODULE_PARM(max_reconnectms, "i", int, 0644,
-                "max connection retry interval (mS)");
+module_param(max_reconnectms, int, 0644);
+MODULE_PARM_DESC(max_reconnectms, "max connection retry interval (mS)");
 
-#if defined(__APPLE__) && !defined(__DARWIN8__)
-# define DEFAULT_EAGER_ACK 1
-#else
-# define DEFAULT_EAGER_ACK 0
-#endif
-static int eager_ack = DEFAULT_EAGER_ACK;
-CFS_MODULE_PARM(eager_ack, "i", int, 0644,
-                "send tcp ack packets eagerly");
+static int eager_ack;
+module_param(eager_ack, int, 0644);
+MODULE_PARM_DESC(eager_ack, "send tcp ack packets eagerly");
 
 static int typed_conns = 1;
-CFS_MODULE_PARM(typed_conns, "i", int, 0444,
-                "use different sockets for bulk");
+module_param(typed_conns, int, 0444);
+MODULE_PARM_DESC(typed_conns, "use different sockets for bulk");
 
 static int min_bulk = (1<<10);
-CFS_MODULE_PARM(min_bulk, "i", int, 0644,
-                "smallest 'large' message");
-
-#ifdef __APPLE__
-# ifdef __DARWIN8__
-#  define DEFAULT_BUFFER_SIZE (224*1024)
-# else
-#  define DEFAULT_BUFFER_SIZE (1152 * 1024)
-# endif
-#else
+module_param(min_bulk, int, 0644);
+MODULE_PARM_DESC(min_bulk, "smallest 'large' message");
+
 # define DEFAULT_BUFFER_SIZE 0
-#endif
 
 static int tx_buffer_size = DEFAULT_BUFFER_SIZE;
-CFS_MODULE_PARM(tx_buffer_size, "i", int, 0644,
-                "socket tx buffer size (0 for system default)");
+module_param(tx_buffer_size, int, 0644);
+MODULE_PARM_DESC(tx_buffer_size, "socket tx buffer size (0 for system default)");
 
 static int rx_buffer_size = DEFAULT_BUFFER_SIZE;
-CFS_MODULE_PARM(rx_buffer_size, "i", int, 0644,
-                "socket rx buffer size (0 for system default)");
+module_param(rx_buffer_size, int, 0644);
+MODULE_PARM_DESC(rx_buffer_size, "socket rx buffer size (0 for system default)");
 
 static int nagle = 0;
-CFS_MODULE_PARM(nagle, "i", int, 0644,
-                "enable NAGLE?");
+module_param(nagle, int, 0644);
+MODULE_PARM_DESC(nagle, "enable NAGLE?");
 
 static int round_robin = 1;
-CFS_MODULE_PARM(round_robin, "i", int, 0644,
-                "Round robin for multiple interfaces");
+module_param(round_robin, int, 0644);
+MODULE_PARM_DESC(round_robin, "Round robin for multiple interfaces");
 
 static int keepalive = 30;
-CFS_MODULE_PARM(keepalive, "i", int, 0644,
-                "# seconds before send keepalive");
+module_param(keepalive, int, 0644);
+MODULE_PARM_DESC(keepalive, "# seconds before send keepalive");
 
 static int keepalive_idle = 30;
-CFS_MODULE_PARM(keepalive_idle, "i", int, 0644,
-                "# idle seconds before probe");
+module_param(keepalive_idle, int, 0644);
+MODULE_PARM_DESC(keepalive_idle, "# idle seconds before probe");
 
-#ifdef HAVE_BGL_SUPPORT
-#define DEFAULT_KEEPALIVE_COUNT 100
-#else
 #define DEFAULT_KEEPALIVE_COUNT 5
-#endif
 
 static int keepalive_count = DEFAULT_KEEPALIVE_COUNT;
-CFS_MODULE_PARM(keepalive_count, "i", int, 0644,
-                "# missed probes == dead");
+module_param(keepalive_count, int, 0644);
+MODULE_PARM_DESC(keepalive_count, "# missed probes == dead");
 
 static int keepalive_intvl = 5;
-CFS_MODULE_PARM(keepalive_intvl, "i", int, 0644,
-                "seconds between probes");
+module_param(keepalive_intvl, int, 0644);
+MODULE_PARM_DESC(keepalive_intvl, "seconds between probes");
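
The keepalive_idle, keepalive_count and keepalive_intvl values above correspond to the kernel's standard TCP keepalive knobs that socklnd configures on its sockets (the separate keepalive parameter controls when socklnd itself sends a keepalive). As a hedged illustration only, here is a minimal userspace sketch of the equivalent setsockopt() sequence; it is not socklnd's in-kernel code:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

/* Mirror keepalive_idle/keepalive_count/keepalive_intvl on a TCP socket. */
static int set_tcp_keepalive(int fd, int idle, int count, int intvl)
{
        int on = 1;

        if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) < 0)
                return -1;
        /* idle seconds before the first probe (keepalive_idle) */
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0)
                return -1;
        /* unanswered probes before the peer is declared dead (keepalive_count) */
        if (setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &count, sizeof(count)) < 0)
                return -1;
        /* seconds between successive probes (keepalive_intvl) */
        return setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
}
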
 static int enable_csum = 0;
-CFS_MODULE_PARM(enable_csum, "i", int, 0644,
-                "enable check sum");
+module_param(enable_csum, int, 0644);
+MODULE_PARM_DESC(enable_csum, "enable check sum");
 
 static int inject_csum_error = 0;
-CFS_MODULE_PARM(inject_csum_error, "i", int, 0644,
-                "set non-zero to inject a checksum error");
-#ifdef CPU_AFFINITY
+module_param(inject_csum_error, int, 0644);
+MODULE_PARM_DESC(inject_csum_error, "set non-zero to inject a checksum error");
+
 static int enable_irq_affinity = 0;
-CFS_MODULE_PARM(enable_irq_affinity, "i", int, 0644,
-                "enable IRQ affinity");
-#endif
+module_param(enable_irq_affinity, int, 0644);
+MODULE_PARM_DESC(enable_irq_affinity, "enable IRQ affinity");
+
+static int nonblk_zcack = 1;
+module_param(nonblk_zcack, int, 0644);
+MODULE_PARM_DESC(nonblk_zcack, "always send ZC-ACK on non-blocking connection");
 
 static unsigned int zc_min_payload = (16 << 10);
-CFS_MODULE_PARM(zc_min_payload, "i", int, 0644,
-                "minimum payload size to zero copy");
+module_param(zc_min_payload, int, 0644);
+MODULE_PARM_DESC(zc_min_payload, "minimum payload size to zero copy");
 
 static unsigned int zc_recv = 0;
-CFS_MODULE_PARM(zc_recv, "i", int, 0644,
-                "enable ZC recv for Chelsio driver");
+module_param(zc_recv, int, 0644);
+MODULE_PARM_DESC(zc_recv, "enable ZC recv for Chelsio driver");
 
 static unsigned int zc_recv_min_nfrags = 16;
-CFS_MODULE_PARM(zc_recv_min_nfrags, "i", int, 0644,
-                "minimum # of fragments to enable ZC recv");
+module_param(zc_recv_min_nfrags, int, 0644);
+MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
+
+static unsigned int conns_per_peer = DEFAULT_CONNS_PER_PEER;
+module_param(conns_per_peer, uint, 0644);
+MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
+
+/* By default skip_mr_route_setup is 0 (do not skip) */
+static unsigned int skip_mr_route_setup;
+module_param(skip_mr_route_setup, uint, 0444);
+MODULE_PARM_DESC(skip_mr_route_setup, "skip automatic setup of linux routes for MR");
 
 #ifdef SOCKNAL_BACKOFF
 static int backoff_init = 3;
-CFS_MODULE_PARM(backoff_init, "i", int, 0644,
-                "seconds for initial tcp backoff");
+module_param(backoff_init, int, 0644);
+MODULE_PARM_DESC(backoff_init, "seconds for initial tcp backoff");
 
 static int backoff_max = 3;
-CFS_MODULE_PARM(backoff_max, "i", int, 0644,
-                "seconds for maximum tcp backoff");
+module_param(backoff_max, int, 0644);
+MODULE_PARM_DESC(backoff_max, "seconds for maximum tcp backoff");
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
 static int protocol = 3;
-CFS_MODULE_PARM(protocol, "i", int, 0644,
-                "protocol version");
+module_param(protocol, int, 0644);
+MODULE_PARM_DESC(protocol, "protocol version");
 #endif
 
-ksock_tunables_t ksocknal_tunables;
+static int tos = -1;
+static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static const struct kernel_param_ops param_ops_tos = {
+        .set = param_set_tos,
+        .get = param_get_int,
+};
+
+#define param_check_tos(name, p) \
+        __param_check(name, p, int)
+module_param(tos, tos, 0444);
+#else
+module_param_call(tos, param_set_tos, param_get_int, &tos, 0444);
+#endif
+MODULE_PARM_DESC(tos, "Set the type of service (=-1 to disable)");
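
All of the parameters registered above can be supplied at module load time; tos additionally goes through the custom param_set_tos() setter (defined below) so that values outside -1..0xff are rejected. A hedged configuration example — the module is assumed to load as ksocklnd and the values are purely illustrative:

    options ksocklnd sock_timeout=100 conns_per_peer=2 tos=104

Parameters declared with mode 0644 can also be changed after loading through /sys/module/ksocklnd/parameters/, while the 0444 ones are read-only once the module is loaded.
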
-int
-ksocknal_tunables_init(void)
+static inline bool is_native_host(void)
+{
+#ifdef HAVE_HYPERVISOR_IS_TYPE
+        return hypervisor_is_type(X86_HYPER_NATIVE);
+#elif defined(__x86_64__) || defined(__i386__)
+        return x86_hyper == NULL;
+#else
+        return true;
+#endif
+}
+
+struct ksock_tunables ksocknal_tunables;
+struct lnet_ioctl_config_socklnd_tunables ksock_default_tunables;
+
+static int param_set_tos(const char *val, cfs_kernel_param_arg_t *kp)
+{
+        int rc, t;
+
+        if (!val)
+                return -EINVAL;
+
+        rc = kstrtoint(val, 0, &t);
+        if (rc)
+                return rc;
+
+        if (t < -1 || t > 0xff)
+                return -ERANGE;
+
+        *((int *)kp->arg) = t;
+
+        return 0;
+}
+
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+static int ksocklnd_ni_get_eth_intf_speed(struct lnet_ni *ni) {
+        struct net_device *dev;
+        int intf_idx = -1;
+        int ret = -1;
+
+        DECLARE_CONST_IN_IFADDR(ifa);
+
+        /* check if ni has interface assigned */
+        if (!ni->ni_net_ns || !ni->ni_interface)
+                return 0;
+
+        rtnl_lock();
+        for_each_netdev(ni->ni_net_ns, dev) {
+                int flags = dev_get_flags(dev);
+                struct in_device *in_dev;
+
+                if (flags & IFF_LOOPBACK) /* skip the loopback IF */
+                        continue;
+
+                if (!(flags & IFF_UP))
+                        continue;
+
+                in_dev = __in_dev_get_rtnl(dev);
+                if (!in_dev)
+                        continue;
+
+                in_dev_for_each_ifa_rtnl(ifa, in_dev) {
+                        if (strcmp(ifa->ifa_label, ni->ni_interface) == 0)
+                                intf_idx = dev->ifindex;
+                }
+                endfor_ifa(in_dev);
+
+                if (intf_idx >= 0)
+                        break;
+        }
+        if (intf_idx >= 0) {
+                struct ethtool_link_ksettings cmd;
+                int ethtool_ret;
+
+                /* Some devices may not be providing link settings */
+                ethtool_ret = __ethtool_get_link_ksettings(dev, &cmd);
+                if (!ethtool_ret)
+                        ret = cmd.base.speed;
+                else
+                        ret = ethtool_ret;
+        }
+        rtnl_unlock();
+
+        return ret;
+}
 
-
-        /* initialize ksocknal_tunables structure */
-        ksocknal_tunables.ksnd_timeout = &sock_timeout;
-        ksocknal_tunables.ksnd_nconnds = &nconnds;
-        ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
-        ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
-        ksocknal_tunables.ksnd_eager_ack = &eager_ack;
-        ksocknal_tunables.ksnd_typed_conns = &typed_conns;
-        ksocknal_tunables.ksnd_min_bulk = &min_bulk;
-        ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
-        ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
-        ksocknal_tunables.ksnd_nagle = &nagle;
-        ksocknal_tunables.ksnd_round_robin = &round_robin;
-        ksocknal_tunables.ksnd_keepalive = &keepalive;
-        ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
-        ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
-        ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
-        ksocknal_tunables.ksnd_credits = &credits;
-        ksocknal_tunables.ksnd_peercredits = &peer_credits;
-        ksocknal_tunables.ksnd_enable_csum = &enable_csum;
-        ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
-        ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
-        ksocknal_tunables.ksnd_zc_recv = &zc_recv;
-        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
-
-#ifdef CPU_AFFINITY
-        ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
+static int ksocklnd_speed2cpp(int speed)
+{
+        /* Use the minimum of 1Gbps to avoid calling ilog2 with 0 */
+        if (speed < 1000)
+                speed = 1000;
+
+        /* Pick heuristically optimal conns_per_peer value
+         * for the specified ethernet interface speed (Mbps)
+         */
+        return ilog2(speed/1000) / 2 + 1;
+}
 #endif
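
To make the heuristic concrete: ilog2(speed/1000) / 2 + 1 gives 1 connection per peer at 1 Gbps, 2 at 10 Gbps, 3 at 25-40 Gbps and 4 at 100 Gbps. The following self-contained userspace re-derivation is an illustration only, not socklnd code; ilog2_u() is a local stand-in for the kernel's ilog2():

#include <stdio.h>

static int ilog2_u(unsigned int v)      /* floor(log2(v)), v >= 1 */
{
        int r = 0;

        while (v >>= 1)
                r++;
        return r;
}

static int speed2cpp(int speed_mbps)
{
        if (speed_mbps < 1000)          /* avoid ilog2(0) */
                speed_mbps = 1000;
        return ilog2_u(speed_mbps / 1000) / 2 + 1;
}

int main(void)
{
        const int speeds[] = { 1000, 10000, 25000, 40000, 100000 };
        unsigned int i;

        for (i = 0; i < sizeof(speeds) / sizeof(speeds[0]); i++)
                printf("%6d Mbps -> conns_per_peer %d\n",
                       speeds[i], speed2cpp(speeds[i]));
        return 0;       /* prints 1, 2, 3, 3, 4 */
}
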
+static int ksocklnd_lookup_conns_per_peer(struct lnet_ni *ni)
+{
+        int cpp = 1;
+#ifdef HAVE_ETHTOOL_LINK_SETTINGS
+        int speed = ksocklnd_ni_get_eth_intf_speed(ni);
+
+        if (ni->ni_interface)
+                CDEBUG(D_NET, "intf %s speed %d\n", ni->ni_interface, speed);
+
+        if (speed > 0)
+                cpp = ksocklnd_speed2cpp(speed);
+#endif
+        return cpp;
+}
+
+int ksocknal_tunables_init(void)
+{
+        ksock_default_tunables.lnd_version = CURRENT_LND_VERSION;
+        ksock_default_tunables.lnd_conns_per_peer = conns_per_peer;
+        ksock_default_tunables.lnd_tos = tos;
+
+        /* initialize ksocknal_tunables structure */
+        ksocknal_tunables.ksnd_timeout = &sock_timeout;
+        ksocknal_tunables.ksnd_nscheds = &nscheds;
+        ksocknal_tunables.ksnd_nconnds = &nconnds;
+        ksocknal_tunables.ksnd_nconnds_max = &nconnds_max;
+        ksocknal_tunables.ksnd_min_reconnectms = &min_reconnectms;
+        ksocknal_tunables.ksnd_max_reconnectms = &max_reconnectms;
+        ksocknal_tunables.ksnd_eager_ack = &eager_ack;
+        ksocknal_tunables.ksnd_typed_conns = &typed_conns;
+        ksocknal_tunables.ksnd_min_bulk = &min_bulk;
+        ksocknal_tunables.ksnd_tx_buffer_size = &tx_buffer_size;
+        ksocknal_tunables.ksnd_rx_buffer_size = &rx_buffer_size;
+        ksocknal_tunables.ksnd_nagle = &nagle;
+        ksocknal_tunables.ksnd_round_robin = &round_robin;
+        ksocknal_tunables.ksnd_keepalive = &keepalive;
+        ksocknal_tunables.ksnd_keepalive_idle = &keepalive_idle;
+        ksocknal_tunables.ksnd_keepalive_count = &keepalive_count;
+        ksocknal_tunables.ksnd_keepalive_intvl = &keepalive_intvl;
+        ksocknal_tunables.ksnd_credits = &credits;
+        ksocknal_tunables.ksnd_peertxcredits = &peer_credits;
+        ksocknal_tunables.ksnd_peerrtrcredits = &peer_buffer_credits;
+        ksocknal_tunables.ksnd_peertimeout = &peer_timeout;
+        ksocknal_tunables.ksnd_enable_csum = &enable_csum;
+        ksocknal_tunables.ksnd_inject_csum_error = &inject_csum_error;
+        ksocknal_tunables.ksnd_nonblk_zcack = &nonblk_zcack;
+        ksocknal_tunables.ksnd_zc_min_payload = &zc_min_payload;
+        ksocknal_tunables.ksnd_zc_recv = &zc_recv;
+        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+        if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
+                CWARN("socklnd conns_per_peer is capped at %u.\n",
+                      (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
+        }
+        ksocknal_tunables.ksnd_conns_per_peer = &conns_per_peer;
+
+        if (enable_irq_affinity) {
+                CWARN("irq_affinity is removed from socklnd because modern "
+                      "computer always has fast CPUs and more cores than "
+                      "# NICs, although you still can set irq_affinity by "
+                      "another way, please check manual for details.\n");
+        }
+        ksocknal_tunables.ksnd_irq_affinity = &enable_irq_affinity;
+
 #ifdef SOCKNAL_BACKOFF
-        ksocknal_tunables.ksnd_backoff_init = &backoff_init;
-        ksocknal_tunables.ksnd_backoff_max = &backoff_max;
+        ksocknal_tunables.ksnd_backoff_init = &backoff_init;
+        ksocknal_tunables.ksnd_backoff_max = &backoff_max;
 #endif
 
 #if SOCKNAL_VERSION_DEBUG
-        ksocknal_tunables.ksnd_protocol = &protocol;
+        ksocknal_tunables.ksnd_protocol = &protocol;
 #endif
 
-#if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
-        ksocknal_tunables.ksnd_sysctl = NULL;
-#endif
+        if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
+                *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
 
-        if (*ksocknal_tunables.ksnd_zc_min_payload < (2 << 10))
-                *ksocknal_tunables.ksnd_zc_min_payload = (2 << 10);
+        /* When on a hypervisor set the minimum zero copy size
+         * above the maximum payload size
+         */
+        if (!is_native_host())
+                *ksocknal_tunables.ksnd_zc_min_payload = (16 << 20) + 1;
 
-        /* initialize platform-sepcific tunables */
-        return ksocknal_lib_tunables_init();
-};
+        return 0;
+}
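
The zero-copy threshold handling at the end of ksocknal_tunables_init() boils down to two rules: the configured zc_min_payload is floored at 2 KiB, and on a virtualized host the threshold is pushed above the maximum payload size so zero-copy sends are never attempted. A hedged, stand-alone distillation of that policy — the helper name is invented for illustration and does not exist in socklnd:

/* Effective zero-copy threshold given the module parameter and whether
 * the host is native (bare metal) or running under a hypervisor.
 */
static unsigned int effective_zc_min_payload(unsigned int requested,
                                             int native_host)
{
        unsigned int zc = requested;

        if (zc < (2 << 10))             /* never zero-copy tiny messages */
                zc = 2 << 10;
        if (!native_host)               /* hypervisor: beyond any payload */
                zc = (16 << 20) + 1;
        return zc;
}
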
-void ksocknal_tunables_fini(void)
+void ksocknal_tunables_setup(struct lnet_ni *ni)
 {
-        ksocknal_lib_tunables_fini();
+        struct lnet_ioctl_config_socklnd_tunables *tunables;
+        struct lnet_ioctl_config_lnd_cmn_tunables *net_tunables;
+
+        /* If no tunables specified, setup default tunables */
+        if (!ni->ni_lnd_tunables_set)
+                memcpy(&ni->ni_lnd_tunables.lnd_tun_u.lnd_sock,
+                       &ksock_default_tunables, sizeof(*tunables));
+
+        tunables = &ni->ni_lnd_tunables.lnd_tun_u.lnd_sock;
+
+        /* Current API version */
+        tunables->lnd_version = CURRENT_LND_VERSION;
+
+        net_tunables = &ni->ni_net->net_tunables;
+
+        if (net_tunables->lct_peer_timeout == -1)
+                net_tunables->lct_peer_timeout =
+                        *ksocknal_tunables.ksnd_peertimeout;
+
+        if (net_tunables->lct_max_tx_credits == -1)
+                net_tunables->lct_max_tx_credits =
+                        *ksocknal_tunables.ksnd_credits;
+
+        if (net_tunables->lct_peer_tx_credits == -1)
+                net_tunables->lct_peer_tx_credits =
+                        *ksocknal_tunables.ksnd_peertxcredits;
+
+        if (net_tunables->lct_peer_tx_credits >
+            net_tunables->lct_max_tx_credits)
+                net_tunables->lct_peer_tx_credits =
+                        net_tunables->lct_max_tx_credits;
+
+        if (net_tunables->lct_peer_rtr_credits == -1)
+                net_tunables->lct_peer_rtr_credits =
+                        *ksocknal_tunables.ksnd_peerrtrcredits;
+
+        if (!tunables->lnd_conns_per_peer)
+                tunables->lnd_conns_per_peer =
+                        ksocklnd_lookup_conns_per_peer(ni);
+
+        if (tunables->lnd_tos < 0)
+                tunables->lnd_tos = tos;
+
+        tunables->lnd_timeout = ksocknal_timeout();
 }
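
ksocknal_tunables_setup() fills ni->ni_lnd_tunables from whatever the LNet configuration supplied and falls back to the module defaults for anything left unset (-1 for the common tunables, 0 for conns_per_peer). In practice these per-NI values come from the LNet userspace tooling; as a rough, hedged illustration, lnetctl net show -v prints them for a tcp NI in roughly this shape (key names and defaults vary between Lustre releases, and the nid and numbers below are invented for the example):

    net:
        - net type: tcp
          local NI(s):
            - nid: 192.168.1.10@tcp
              tunables:
                  peer_timeout: 180
                  peer_credits: 8
                  peer_buffer_credits: 0
                  credits: 256
              lnd tunables:
                  conns_per_peer: 1
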