Whamcloud - gitweb
LU-11468 lnet: configure recovery interval 09/33309/9
author: Amir Shehata <ashehata@whamcloud.com>
Thu, 4 Oct 2018 00:01:38 +0000 (17:01 -0700)
committer: Oleg Drokin <green@whamcloud.com>
Tue, 6 Nov 2018 06:40:11 +0000 (06:40 +0000)
Added a module parameter to configure the interval between each
recovery ping. Some sites might not want to ping failed NIDs once
a second and might desire a longer interval. The interval defaults
to 1 second.
The monitor thread now wakes up at the smallest of the intervals
it needs to monitor.

Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ia96fa7dea0b3925686d785b4d4dde399742c86b7
Reviewed-on: https://review.whamcloud.com/33309
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Doug Oucharek <dougso@me.com>
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c

index 1c60b4d..0299a21 100644 (file)
@@ -556,6 +556,7 @@ extern unsigned lnet_transaction_timeout;
 extern unsigned lnet_retry_count;
 extern unsigned int lnet_numa_range;
 extern unsigned int lnet_health_sensitivity;
 extern unsigned lnet_retry_count;
 extern unsigned int lnet_numa_range;
 extern unsigned int lnet_health_sensitivity;
+extern unsigned int lnet_recovery_interval;
 extern unsigned int lnet_peer_discovery_disabled;
 extern int portal_rotor;
 
 extern unsigned int lnet_peer_discovery_disabled;
 extern int portal_rotor;
 
index d796e9c..b12ed66 100644 (file)
@@ -100,6 +100,27 @@ module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
 MODULE_PARM_DESC(lnet_health_sensitivity,
                "Value to decrement the health value by on error");
 
 MODULE_PARM_DESC(lnet_health_sensitivity,
                "Value to decrement the health value by on error");
 
+/*
+ * lnet_recovery_interval: seconds between recovery attempts on unhealthy
+ * interfaces. The ops table is only built when kernel_param_ops is usable.
+ */
+unsigned int lnet_recovery_interval = 1;
+static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_recovery_interval = {
+       .set = recovery_interval_set,
+       .get = param_get_uint,
+};
+#define param_check_recovery_interval(name, p) \
+               __param_check(name, p, unsigned int)
+module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_recovery_interval, recovery_interval_set,
+                 param_get_uint, &lnet_recovery_interval, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_recovery_interval,
+               "Interval to recover unhealthy interfaces in seconds");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
@@ -232,6 +253,42 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
 }
 
 static int
 }
 
 static int
+recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+       int rc;
+       unsigned *interval = (unsigned *)kp->arg;
+       unsigned long value;
+
+       rc = kstrtoul(val, 0, &value);
+       if (rc) {
+               CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n");
+               return rc;
+       }
+
+       if (value < 1) {
+               CERROR("lnet_recovery_interval must be at least 1 second\n");
+               return -EINVAL;
+       }
+
+       /*
+        * The purpose of locking the api_mutex here is to ensure that
+        * the correct value ends up stored properly.
+        */
+       mutex_lock(&the_lnet.ln_api_mutex);
+
+       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               return 0;
+       }
+
+       *interval = value;
+
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return 0;
+}
+
+static int
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
        int rc;
index 3f5b112..3e2624c 100644 (file)
@@ -3324,7 +3324,10 @@ lnet_recover_peer_nis(void)
 static int
 lnet_monitor_thread(void *arg)
 {
 static int
 lnet_monitor_thread(void *arg)
 {
-       int wakeup_counter = 0;
+       time64_t recovery_timeout = 0;
+       time64_t rsp_timeout = 0;
+       int interval;
+       time64_t now;
 
        /*
         * The monitor thread takes care of the following:
 
        /*
         * The monitor thread takes care of the following:
@@ -3339,20 +3342,23 @@ lnet_monitor_thread(void *arg)
        cfs_block_allsigs();
 
        while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
        cfs_block_allsigs();
 
        while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
+               now = ktime_get_real_seconds();
+
                if (lnet_router_checker_active())
                        lnet_check_routers();
 
                lnet_resend_pending_msgs();
 
                if (lnet_router_checker_active())
                        lnet_check_routers();
 
                lnet_resend_pending_msgs();
 
-               wakeup_counter++;
-               if (wakeup_counter >= lnet_transaction_timeout / 2) {
+               if (now >= rsp_timeout) {
                        lnet_finalize_expired_responses(false);
                        lnet_finalize_expired_responses(false);
-                       wakeup_counter = 0;
+                       rsp_timeout = now + (lnet_transaction_timeout / 2);
                }
 
                }
 
-               lnet_recover_local_nis();
-
-               lnet_recover_peer_nis();
+               if (now >= recovery_timeout) {
+                       lnet_recover_local_nis();
+                       lnet_recover_peer_nis();
+                       recovery_timeout = now + lnet_recovery_interval;
+               }
 
                /*
                 * TODO do we need to check if we should sleep without
 
                /*
                 * TODO do we need to check if we should sleep without
@@ -3363,9 +3369,11 @@ lnet_monitor_thread(void *arg)
                 * cases where we get a complaint that an idle thread
                 * is waking up unnecessarily.
                 */
                 * cases where we get a complaint that an idle thread
                 * is waking up unnecessarily.
                 */
+               interval = min(lnet_recovery_interval,
+                              lnet_transaction_timeout / 2);
                wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
                                                false,
                wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
                                                false,
-                                               cfs_time_seconds(1));
+                                               cfs_time_seconds(interval));
        }
 
        /* clean up the router checker */
        }
 
        /* clean up the router checker */