From: Amir Shehata
Date: Thu, 4 Oct 2018 00:01:38 +0000 (-0700)
Subject: LU-11468 lnet: configure recovery interval
X-Git-Tag: 2.12.0-RC1~108
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=refs%2Fchanges%2F09%2F33309%2F9;p=fs%2Flustre-release.git

LU-11468 lnet: configure recovery interval

Added a module parameter to configure the interval between each
recovery ping. Some sites might not want to ping failed NIDs once
a second and might desire a longer interval. The interval defaults
to 1 second.

The monitor thread now wakes up at the smallest interval it needs
to monitor.

Test-Parameters: trivial
Signed-off-by: Amir Shehata
Change-Id: Ia96fa7dea0b3925686d785b4d4dde399742c86b7
Reviewed-on: https://review.whamcloud.com/33309
Tested-by: Jenkins
Tested-by: Maloo
Reviewed-by: Doug Oucharek
Reviewed-by: Sonia Sharma
Reviewed-by: Oleg Drokin
---

diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
index 1c60b4d..0299a21 100644
--- a/lnet/include/lnet/lib-lnet.h
+++ b/lnet/include/lnet/lib-lnet.h
@@ -556,6 +556,7 @@ extern unsigned lnet_transaction_timeout;
 extern unsigned lnet_retry_count;
 extern unsigned int lnet_numa_range;
 extern unsigned int lnet_health_sensitivity;
+extern unsigned int lnet_recovery_interval;
 extern unsigned int lnet_peer_discovery_disabled;
 extern int portal_rotor;
 
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
index d796e9c..b12ed66 100644
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -100,6 +100,27 @@ module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
 MODULE_PARM_DESC(lnet_health_sensitivity,
 		"Value to decrement the health value by on error");
 
+/*
+ * lnet_recovery_interval determines how often we should perform recovery
+ * on unhealthy interfaces.
+ */
+unsigned int lnet_recovery_interval = 1;
+static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp);
+static struct kernel_param_ops param_ops_recovery_interval = {
+	.set = recovery_interval_set,
+	.get = param_get_uint,
+};
+#define param_check_recovery_interval(name, p) \
+		__param_check(name, p, unsigned int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_uint,
+		  &lnet_recovery_interval, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_recovery_interval,
+		"Interval to recover unhealthy interfaces in seconds");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
@@ -232,6 +253,49 @@ sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
 }
 
+/*
+ * Module-parameter setter for lnet_recovery_interval.
+ *
+ * Parses val as an unsigned number of seconds and stores it through
+ * kp->arg. Returns 0 on success, the kstrtouint() error code if val is
+ * not a number that fits in an unsigned int, or -EINVAL if it is 0.
+ */
+static int
+recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	unsigned int *interval = (unsigned int *)kp->arg;
+	unsigned int value;
+	int rc;
+
+	/*
+	 * Parse straight into an unsigned int. Going through an unsigned
+	 * long and then narrowing would let a value such as 2^32 wrap to
+	 * 0 on 64-bit and slip past the minimum check below.
+	 */
+	rc = kstrtouint(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n");
+		return rc;
+	}
+
+	if (value < 1) {
+		CERROR("lnet_recovery_interval must be at least 1 second\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * The purpose of locking the api_mutex here is to ensure that
+	 * the correct value ends up stored properly. The value is stored
+	 * whatever the_lnet.ln_state is, so a setting made while LNet is
+	 * not running is kept instead of being silently discarded.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+	*interval = value;
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
 static int
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
 	int rc;
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 3f5b112..3e2624c 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -3324,7 +3324,10 @@ lnet_recover_peer_nis(void)
 static int
 lnet_monitor_thread(void *arg)
 {
-	int wakeup_counter = 0;
+	time64_t recovery_timeout = 0;
+	time64_t rsp_timeout = 0;
+	int interval;
+	time64_t now;
 
 	/*
 	 * The monitor thread takes care of the following:
@@ -3339,20 +3342,23 @@ lnet_monitor_thread(void *arg)
 	cfs_block_allsigs();
 
 	while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
+		now = ktime_get_real_seconds();
+
 		if (lnet_router_checker_active())
 			lnet_check_routers();
 
 		lnet_resend_pending_msgs();
 
-		wakeup_counter++;
-		if (wakeup_counter >= lnet_transaction_timeout / 2) {
+		if (now >= rsp_timeout) {
 			lnet_finalize_expired_responses(false);
-			wakeup_counter = 0;
+			rsp_timeout = now + (lnet_transaction_timeout / 2);
 		}
 
-		lnet_recover_local_nis();
-
-		lnet_recover_peer_nis();
+		if (now >= recovery_timeout) {
+			lnet_recover_local_nis();
+			lnet_recover_peer_nis();
+			recovery_timeout = now + lnet_recovery_interval;
+		}
 
 		/*
 		 * TODO do we need to check if we should sleep without
@@ -3363,9 +3369,18 @@ lnet_monitor_thread(void *arg)
 		 * cases where we get a complaint that an idle thread
 		 * is waking up unnecessarily.
 		 */
+		interval = min(lnet_recovery_interval,
+			       lnet_transaction_timeout / 2);
+		/*
+		 * lnet_transaction_timeout / 2 is 0 when the timeout is
+		 * below 2 seconds. Sleep for at least 1 second so that
+		 * this loop never degenerates into a busy spin.
+		 */
+		if (interval < 1)
+			interval = 1;
 		wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
 						false,
-						cfs_time_seconds(1));
+						cfs_time_seconds(interval));
 	}
 
 	/* clean up the router checker */