Whamcloud - gitweb
Revert "LU-11816 lnet: setup health timeout defaults" 73/36173/2
author: Oleg Drokin <green@whamcloud.com>
Thu, 12 Sep 2019 18:04:55 +0000 (18:04 +0000)
committer: Oleg Drokin <green@whamcloud.com>
Thu, 12 Sep 2019 18:05:10 +0000 (18:05 +0000)
The reverted commit is causing frequent assertion failures like the one below:
LNetError: 1701:0:(lib-move.c:3670:lnet_monitor_thr_stop()) ASSERTION( rc == 0 ) failed:
[  378.662897] LNetError: 1701:0:(lib-move.c:3670:lnet_monitor_thr_stop()) LBUG
[  378.665136] Pid: 1701, comm: rmmod 3.10.0-7.6-debug #1 SMP Fri Jul 12 02:40:17 EDT 2019
[  378.667455] Call Trace:
[  378.668302]  [<ffffffffa01927dc>] libcfs_call_trace+0x8c/0xc0 [libcfs]
[  378.670463]  [<ffffffffa019288c>] lbug_with_loc+0x4c/0xa0 [libcfs]
[  378.672398]  [<ffffffffa021d036>] lnet_monitor_thr_stop+0xe6/0x120 [lnet]
[  378.674727]  [<ffffffffa01fde8a>] LNetNIFini+0x6a/0x110 [lnet]
[  378.676532]  [<ffffffffa0622b15>] ptlrpc_ni_fini+0x175/0x200 [ptlrpc]
[  378.678598]  [<ffffffffa0622e53>] ptlrpc_exit_portals+0x13/0x20 [ptlrpc]
[  378.680850]  [<ffffffffa06b59aa>] ptlrpc_exit+0x22/0x678 [ptlrpc]
[  378.683338]  [<ffffffff81108aab>] SyS_delete_module+0x19b/0x300
[  378.684809]  [<ffffffff817c8e15>] system_call_fastpath+0x1c/0x21
[  378.686727]  [<ffffffffffffffff>] 0xffffffffffffffff
[  378.688144] Kernel panic - not syncing: LBUG

This reverts commit db81f3f293dbc0c9dba90ea1153f554b33fbb80b.

Change-Id: Id12f9d3ec4af3ab37158b3e6049d2ea971d86913
Signed-off-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/36173

lnet/lnet/api-ni.c

index bd614b3..9e53940 100644 (file)
@@ -80,10 +80,10 @@ MODULE_PARM_DESC(lnet_numa_range,
 
 /*
  * lnet_health_sensitivity determines by how much we decrement the health
- * value on sending error. The value defaults to 100, which means health
- * interface health is decremented by 100 points every failure.
+ * value on sending error. The value defaults to 0, which means health
+ * checking is turned off by default.
  */
-unsigned int lnet_health_sensitivity = 100;
+unsigned int lnet_health_sensitivity = 0;
 static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
 #ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_health_sensitivity = {
@@ -179,10 +179,7 @@ module_param_call(lnet_drop_asym_route, drop_asym_route_set, param_get_int,
 MODULE_PARM_DESC(lnet_drop_asym_route,
                 "Set to 1 to drop asymmetrical route messages.");
 
-#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50
-#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10
-
-unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
+unsigned lnet_transaction_timeout = 50;
 static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
 #ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_transaction_timeout = {
@@ -200,8 +197,7 @@ module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
 MODULE_PARM_DESC(lnet_transaction_timeout,
                "Maximum number of seconds to wait for a peer response.");
 
-#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3
-unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
+unsigned lnet_retry_count = 0;
 static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
 #ifdef HAVE_KERNEL_PARAM_OPS
 static struct kernel_param_ops param_ops_retry_count = {
@@ -256,6 +252,11 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
        if (value > LNET_MAX_HEALTH_VALUE) {
                mutex_unlock(&the_lnet.ln_api_mutex);
                CERROR("Invalid health value. Maximum: %d value = %lu\n",
@@ -263,23 +264,6 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
                return -EINVAL;
        }
 
-       /*
-        * if we're turning on health then use the health timeout
-        * defaults.
-        */
-       if (*sensitivity == 0 && value != 0) {
-               lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
-               lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
-       /*
-        * if we're turning off health then use the no health timeout
-        * default.
-        */
-       } else if (*sensitivity != 0 && value == 0) {
-               lnet_transaction_timeout =
-                       LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
-               lnet_retry_count = 0;
-       }
-
        *sensitivity = value;
 
        mutex_unlock(&the_lnet.ln_api_mutex);
@@ -311,6 +295,11 @@ recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
        *interval = value;
 
        mutex_unlock(&the_lnet.ln_api_mutex);
@@ -419,6 +408,11 @@ transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
        if (value < lnet_retry_count || value == 0) {
                mutex_unlock(&the_lnet.ln_api_mutex);
                CERROR("Invalid value for lnet_transaction_timeout (%lu). "
@@ -462,10 +456,9 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp)
         */
        mutex_lock(&the_lnet.ln_api_mutex);
 
-       if (lnet_health_sensitivity == 0) {
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
                mutex_unlock(&the_lnet.ln_api_mutex);
-               CERROR("Can not set retry_count when health feature is turned off\n");
-               return -EINVAL;
+               return 0;
        }
 
        if (value > lnet_transaction_timeout) {
@@ -476,6 +469,11 @@ retry_count_set(const char *val, cfs_kernel_param_arg_t *kp)
                return -EINVAL;
        }
 
+       if (value == *retry_count) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
        *retry_count = value;
 
        if (value == 0)