From 63cf744d0fdf72fc5ac7e154ec60c4a08139acc4 Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Mon, 19 Feb 2018 15:35:58 -0800
Subject: [PATCH] LU-9120 lnet: add lnet_health_sensitivity

Add lnet_health_sensitivity value. This value determines the amount
the NI health value is decremented by. The value defaults to 0,
which turns off the health feature by default. The user needs
to explicitly turn on this feature. The assumption is that many sites
will only have one interface in their nodes. In this case the
health feature will not increase the resiliency of their system.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata
Change-Id: I23f70b00f270803e5d296033e36a3a09986fd3cf
Reviewed-on: https://review.whamcloud.com/32762
Reviewed-by: Olaf Weber
Reviewed-by: Sonia Sharma
Tested-by: Jenkins
Reviewed-by: Chris Horn
---
Review notes (not part of the commit message):
 * Fixed the "senstivity" typo in the description.
 * sensitivity_set(): the value is now stored regardless of the LNet
   state. Previously the handler returned 0 without storing anything
   when LNet was not running, so the setting was dropped while
   reporting success.
 * sensitivity_set(): values larger than UINT_MAX are rejected with
   -ERANGE instead of being truncated.
 * The parameter ops use param_get_uint / unsigned int to match the
   type of lnet_health_sensitivity.

 lnet/include/lnet/lib-lnet.h |  1 +
 lnet/lnet/api-ni.c           | 68 ++++++++++++++++++++++++++++++++++++++++++++
 lnet/lnet/lib-move.c         | 12 +++++++-
 3 files changed, 80 insertions(+), 1 deletion(-)

diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
index bed5244..2fd3292 100644
--- a/lnet/include/lnet/lib-lnet.h
+++ b/lnet/include/lnet/lib-lnet.h
@@ -534,6 +534,7 @@ void lnet_lib_exit(void);
 
 extern unsigned lnet_transaction_timeout;
 extern unsigned int lnet_numa_range;
+extern unsigned int lnet_health_sensitivity;
 extern unsigned int lnet_peer_discovery_disabled;
 extern int portal_rotor;
 
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
index d10ff58..8b3dbad 100644
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -78,6 +78,28 @@ module_param(lnet_numa_range, uint, 0444);
 MODULE_PARM_DESC(lnet_numa_range,
 		"NUMA range to consider during Multi-Rail selection");
 
+/*
+ * lnet_health_sensitivity determines by how much we decrement the health
+ * value on sending error. The value defaults to 0, which means health
+ * checking is turned off by default.
+ */
+unsigned int lnet_health_sensitivity = 0;
+static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+static struct kernel_param_ops param_ops_health_sensitivity = {
+	.set = sensitivity_set,
+	.get = param_get_uint,
+};
+#define param_check_health_sensitivity(name, p) \
+		__param_check(name, p, unsigned int)
+#ifdef HAVE_KERNEL_PARAM_OPS
+module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR);
+#else
+module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_uint,
+		  &lnet_health_sensitivity, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_health_sensitivity,
+		"Value to decrement the health value by on error");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
 
@@ -137,6 +159,52 @@
 static int lnet_discover(struct lnet_process_id id, __u32 force,
 			 struct lnet_process_id __user *ids, int n_ids);
 
+/*
+ * Set handler for the lnet_health_sensitivity module parameter.
+ *
+ * Parses val with kstrtoul() (base auto-detected) and stores the result
+ * in the unsigned int that kp->arg points to.
+ *
+ * Returns 0 on success, the kstrtoul() error if val cannot be parsed, or
+ * -ERANGE if the parsed value does not fit in an unsigned int.
+ */
+static int
+sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	int rc;
+	unsigned int *sensitivity = (unsigned int *)kp->arg;
+	unsigned long value;
+
+	rc = kstrtoul(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'lnet_health_sensitivity'\n");
+		return rc;
+	}
+
+	/*
+	 * The parameter is an unsigned int while kstrtoul() yields an
+	 * unsigned long; reject values that the store below would truncate.
+	 */
+	if (value > UINT_MAX) {
+		CERROR("Value %lu out of range for 'lnet_health_sensitivity'\n", value);
+		return -ERANGE;
+	}
+
+	/*
+	 * The purpose of locking the api_mutex here is to ensure that
+	 * the correct value ends up stored properly.
+	 *
+	 * The value is stored whatever the LNet state is, so that a value
+	 * supplied before LNet is running is not dropped while reporting
+	 * success.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+	*sensitivity = value;
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
 static int
 discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
 {
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 9d50948..c23c9ab5 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -1508,6 +1508,17 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 		 */
 		if (ni_healthv < best_healthv) {
 			continue;
+		} else if (ni_healthv > best_healthv) {
+			best_healthv = ni_healthv;
+			/*
+			 * If we're going to prefer this ni because it's
+			 * the healthiest, then we should set the
+			 * shortest_distance in the algorithm in case
+			 * there are multiple NIs with the same health but
+			 * different distances.
+			 */
+			if (distance < shortest_distance)
+				shortest_distance = distance;
 		} else if (distance > shortest_distance) {
 			continue;
 		} else if (distance < shortest_distance) {
@@ -1520,7 +1531,6 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
 		}
 		best_ni = ni;
 		best_credits = ni_credits;
-		best_healthv = ni_healthv;
 	}
 
 	CDEBUG(D_NET, "selected best_ni %s\n",
-- 
1.8.3.1