From: Amir Shehata Date: Wed, 19 Dec 2018 23:55:49 +0000 (-0800) Subject: LU-11816 lnet: setup health timeout defaults X-Git-Tag: 2.12.55~25^2~39 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8632e94aeb7e62da07f342a9897d15dfd8251148 LU-11816 lnet: setup health timeout defaults Enable health feature by default. Setup transaction timeout to a default 10 seconds and retry count to 3 when health is enabled. When health is disabled set default transaction timeout to 50. When toggling between health enabled/disabled the defaults will always kick in. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I153c2822898b44e33871ec827de7e61f153bb1db Reviewed-on: https://review.whamcloud.com/34252 Reviewed-by: Olaf Weber Tested-by: Jenkins Reviewed-by: Sebastien Buisson Reviewed-by: Chris Horn --- diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 07558df..3695d9f 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -80,10 +80,10 @@ MODULE_PARM_DESC(lnet_numa_range, /* * lnet_health_sensitivity determines by how much we decrement the health - * value on sending error. The value defaults to 0, which means health - * checking is turned off by default. + * value on sending error. The value defaults to 100, which means health + * interface health is decremented by 100 points every failure. */ -unsigned int lnet_health_sensitivity = 0; +unsigned int lnet_health_sensitivity = 100; static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS static struct kernel_param_ops param_ops_health_sensitivity = { @@ -179,7 +179,10 @@ module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, MODULE_PARM_DESC(lnet_drop_asym_route, "Set to 1 to drop asymmetrical route messages."); -unsigned lnet_transaction_timeout = 50; +#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 +#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10 + +unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS static struct kernel_param_ops param_ops_transaction_timeout = { @@ -197,7 +200,8 @@ module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, MODULE_PARM_DESC(lnet_transaction_timeout, "Maximum number of seconds to wait for a peer response."); -unsigned lnet_retry_count = 0; +#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3 +unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS static struct kernel_param_ops param_ops_retry_count = { @@ -252,11 +256,6 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - if (value > LNET_MAX_HEALTH_VALUE) { mutex_unlock(&the_lnet.ln_api_mutex); CERROR("Invalid health value. Maximum: %d value = %lu\n", @@ -264,6 +263,23 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) return -EINVAL; } + /* + * if we're turning on health then use the health timeout + * defaults. + */ + if (*sensitivity == 0 && value != 0) { + lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; + lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; + /* + * if we're turning off health then use the no health timeout + * default. + */ + } else if (*sensitivity != 0 && value == 0) { + lnet_transaction_timeout = + LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; + lnet_retry_count = 0; + } + *sensitivity = value; mutex_unlock(&the_lnet.ln_api_mutex); @@ -295,11 +311,6 @@ recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - *interval = value; mutex_unlock(&the_lnet.ln_api_mutex); @@ -408,11 +419,6 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - if (value < lnet_retry_count || value == 0) { mutex_unlock(&the_lnet.ln_api_mutex); CERROR("Invalid value for lnet_transaction_timeout (%lu). " @@ -456,9 +462,10 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); - if (the_lnet.ln_state != LNET_STATE_RUNNING) { + if (lnet_health_sensitivity == 0) { mutex_unlock(&the_lnet.ln_api_mutex); - return 0; + CERROR("Can not set retry_count when health feature is turned off\n"); + return -EINVAL; } if (value > lnet_transaction_timeout) { @@ -469,11 +476,6 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) return -EINVAL; } - if (value == *retry_count) { - mutex_unlock(&the_lnet.ln_api_mutex); - return 0; - } - *retry_count = value; if (value == 0)