From 2e0e446bcab61276f6bc3052f2f03a87a7346795 Mon Sep 17 00:00:00 2001 From: Oleg Drokin Date: Thu, 12 Sep 2019 18:04:55 +0000 Subject: [PATCH] Revert "LU-11816 lnet: setup health timeout defaults" This is causing frequent assertion failures like below: LNetError: 1701:0:(lib-move.c:3670:lnet_monitor_thr_stop()) ASSERTION( rc == 0 ) failed: [ 378.662897] LNetError: 1701:0:(lib-move.c:3670:lnet_monitor_thr_stop()) LBUG [ 378.665136] Pid: 1701, comm: rmmod 3.10.0-7.6-debug #1 SMP Fri Jul 12 02:40:17 EDT 2019 [ 378.667455] Call Trace: [ 378.668302] [] libcfs_call_trace+0x8c/0xc0 [libcfs] [ 378.670463] [] lbug_with_loc+0x4c/0xa0 [libcfs] [ 378.672398] [] lnet_monitor_thr_stop+0xe6/0x120 [lnet] [ 378.674727] [] LNetNIFini+0x6a/0x110 [lnet] [ 378.676532] [] ptlrpc_ni_fini+0x175/0x200 [ptlrpc] [ 378.678598] [] ptlrpc_exit_portals+0x13/0x20 [ptlrpc] [ 378.680850] [] ptlrpc_exit+0x22/0x678 [ptlrpc] [ 378.683338] [] SyS_delete_module+0x19b/0x300 [ 378.684809] [] system_call_fastpath+0x1c/0x21 [ 378.686727] [] 0xffffffffffffffff [ 378.688144] Kernel panic - not syncing: LBUG This reverts commit db81f3f293dbc0c9dba90ea1153f554b33fbb80b. Change-Id: Id12f9d3ec4af3ab37158b3e6049d2ea971d86913 Signed-off-by: Oleg Drokin Reviewed-on: https://review.whamcloud.com/36173 --- lnet/lnet/api-ni.c | 56 ++++++++++++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index bd614b3..9e53940 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -80,10 +80,10 @@ MODULE_PARM_DESC(lnet_numa_range, /* * lnet_health_sensitivity determines by how much we decrement the health - * value on sending error. The value defaults to 100, which means health - * interface health is decremented by 100 points every failure. + * value on sending error. The value defaults to 0, which means health + * checking is turned off by default. */ -unsigned int lnet_health_sensitivity = 100; +unsigned int lnet_health_sensitivity = 0; static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS static struct kernel_param_ops param_ops_health_sensitivity = { @@ -179,10 +179,7 @@ module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int, MODULE_PARM_DESC(lnet_drop_asym_route, "Set to 1 to drop asymmetrical route messages."); -#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50 -#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10 - -unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; +unsigned lnet_transaction_timeout = 50; static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS static struct kernel_param_ops param_ops_transaction_timeout = { @@ -200,8 +197,7 @@ module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int, MODULE_PARM_DESC(lnet_transaction_timeout, "Maximum number of seconds to wait for a peer response."); -#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3 -unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; +unsigned lnet_retry_count = 0; static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp); #ifdef HAVE_KERNEL_PARAM_OPS static struct kernel_param_ops param_ops_retry_count = { @@ -256,6 +252,11 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + if (value > LNET_MAX_HEALTH_VALUE) { mutex_unlock(&the_lnet.ln_api_mutex); CERROR("Invalid health value. Maximum: %d value = %lu\n", @@ -263,23 +264,6 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp) return -EINVAL; } - /* - * if we're turning on health then use the health timeout - * defaults. - */ - if (*sensitivity == 0 && value != 0) { - lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT; - lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT; - /* - * if we're turning off health then use the no health timeout - * default. - */ - } else if (*sensitivity != 0 && value == 0) { - lnet_transaction_timeout = - LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT; - lnet_retry_count = 0; - } - *sensitivity = value; mutex_unlock(&the_lnet.ln_api_mutex); @@ -311,6 +295,11 @@ recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + *interval = value; mutex_unlock(&the_lnet.ln_api_mutex); @@ -419,6 +408,11 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); + if (the_lnet.ln_state != LNET_STATE_RUNNING) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + if (value < lnet_retry_count || value == 0) { mutex_unlock(&the_lnet.ln_api_mutex); CERROR("Invalid value for lnet_transaction_timeout (%lu). " @@ -462,10 +456,9 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) */ mutex_lock(&the_lnet.ln_api_mutex); - if (lnet_health_sensitivity == 0) { + if (the_lnet.ln_state != LNET_STATE_RUNNING) { mutex_unlock(&the_lnet.ln_api_mutex); - CERROR("Can not set retry_count when health feature is turned off\n"); - return -EINVAL; + return 0; } if (value > lnet_transaction_timeout) { @@ -476,6 +469,11 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp) return -EINVAL; } + if (value == *retry_count) { + mutex_unlock(&the_lnet.ln_api_mutex); + return 0; + } + *retry_count = value; if (value == 0) -- 1.8.3.1