From 2b59dae54efc23066f33c4c19f945568de2ee3b2 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Fri, 19 Oct 2018 17:09:24 -0700 Subject: [PATCH] LU-11300 lnet: router sensitivity Introduce the router_sensitivity_percentage module parameter to control the sensitivity of routers to failures. It defaults to 100% which means a router interface needs to be fully healthy in order to be used. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I3e9333033f049918c1cdca58a72604c71884acbe Reviewed-on: https://review.whamcloud.com/33449 Tested-by: Jenkins Reviewed-by: Sebastien Buisson Reviewed-by: Chris Horn --- lnet/include/lnet/lib-lnet.h | 1 + lnet/lnet/router.c | 55 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 26c0e44..dd4b34e 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -573,6 +573,7 @@ extern unsigned int lnet_health_sensitivity; extern unsigned int lnet_recovery_interval; extern unsigned int lnet_peer_discovery_disabled; extern unsigned int lnet_drop_asym_route; +extern unsigned int router_sensitivity_percentage; extern int portal_rotor; void lnet_mt_event_handler(struct lnet_event *event); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 4aefdf1..ce2f12d 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -90,6 +90,61 @@ static int router_ping_timeout = 50; module_param(router_ping_timeout, int, 0644); MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query"); +/* + * A value between 0 and 100. 0 means that even if the router's interfaces + * have the worst health the gateway is still considered usable. + * 100 means that at least one interface on the route's remote net must be + * 100% healthy for the route to be considered alive. + * The default is set to 100 to ensure we maintain the original behavior.
+ */
+unsigned int router_sensitivity_percentage = 100;
+static int rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+static struct kernel_param_ops param_ops_rtr_sensitivity = {
+	.set = rtr_sensitivity_set,
+	.get = param_get_int,
+};
+#define param_check_rtr_sensitivity(name, p) \
+		__param_check(name, p, int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(router_sensitivity_percentage, rtr_sensitivity, S_IRUGO|S_IWUSR);
+#else
+module_param_call(router_sensitivity_percentage, rtr_sensitivity_set, param_get_int,
+		  &router_sensitivity_percentage, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(router_sensitivity_percentage,
+		"How healthy a gateway should be to be used in percent");
+
+/*
+ * Set handler for router_sensitivity_percentage: parse @val, store it via
+ * kp->arg. Returns 0, the kstrtoul() error, or -EINVAL if @val exceeds 100.
+ */
+static int
+rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	unsigned int *sen = kp->arg;
+	unsigned long value;
+	int rc;
+
+	rc = kstrtoul(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'router_sensitivity_percentage'\n");
+		return rc;
+	}
+
+	/* value is unsigned, so only the upper bound can be violated */
+	if (value > 100) {
+		CERROR("Invalid value: %lu for 'router_sensitivity_percentage'\n", value);
+		return -EINVAL;
+	}
+
+	/* hold ln_api_mutex so that the correct value ends up stored */
+	mutex_lock(&the_lnet.ln_api_mutex);
+	*sen = value;
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
 int
 lnet_peers_start_down(void)
 {
-- 
1.8.3.1