Whamcloud - gitweb
LU-11300 lnet: router sensitivity 49/33449/30
author Amir Shehata <ashehata@whamcloud.com>
Sat, 20 Oct 2018 00:09:24 +0000 (17:09 -0700)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:12:48 +0000 (18:12 +0000)
Introduce the router_sensitivity_percentage module parameter to
control the sensitivity of routers to failures. It defaults to 100%
which means a router interface needs to be fully healthy in order
to be used.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I3e9333033f049918c1cdca58a72604c71884acbe
Reviewed-on: https://review.whamcloud.com/33449
Tested-by: Jenkins
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Chris Horn <hornc@cray.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/router.c

index 26c0e44..dd4b34e 100644 (file)
@@ -573,6 +573,7 @@ extern unsigned int lnet_health_sensitivity;
 extern unsigned int lnet_recovery_interval;
 extern unsigned int lnet_peer_discovery_disabled;
 extern unsigned int lnet_drop_asym_route;
+extern unsigned int router_sensitivity_percentage;
 extern int portal_rotor;
 
 void lnet_mt_event_handler(struct lnet_event *event);
index 4aefdf1..ce2f12d 100644 (file)
@@ -90,6 +90,61 @@ static int router_ping_timeout = 50;
 module_param(router_ping_timeout, int, 0644);
 MODULE_PARM_DESC(router_ping_timeout, "Seconds to wait for the reply to a router health query");
 
+/*
+ * A value between 0 and 100. 0 means the gateway is still considered
+ * usable even if its interfaces have the worst health.
+ * 100 means that at least one interface on the route's remote net must be
+ * 100% healthy for the route to be considered alive.
+ * The default is set to 100 to ensure we maintain the original behavior.
+ */
+unsigned int router_sensitivity_percentage = 100;
+static int rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_rtr_sensitivity = {
+       .set = rtr_sensitivity_set,
+       .get = param_get_uint,
+};
+#define param_check_rtr_sensitivity(name, p) \
+               __param_check(name, p, unsigned int)
+module_param(router_sensitivity_percentage, rtr_sensitivity, S_IRUGO|S_IWUSR);
+#else
+module_param_call(router_sensitivity_percentage, rtr_sensitivity_set, param_get_uint,
+                 &router_sensitivity_percentage, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(router_sensitivity_percentage,
+               "How healthy a gateway should be to be used in percent");
+
+/* Setter for 'router_sensitivity_percentage': parses val, accepts 0..100
+ * (unsigned, so only the upper bound is checked); returns 0 or -errno. */
+static int
+rtr_sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+       int rc;
+       unsigned int *sen = (unsigned int *)kp->arg;
+       unsigned long value;
+
+       rc = kstrtoul(val, 0, &value);
+       if (rc) {
+               CERROR("Invalid module parameter value for 'router_sensitivity_percentage'\n");
+               return rc;
+       }
+
+       if (value > 100) {
+               CERROR("Invalid value: %lu for 'router_sensitivity_percentage'\n", value);
+               return -EINVAL;
+       }
+
+       /*
+        * The purpose of locking the api_mutex here is to ensure that
+        * the correct value ends up stored properly.
+        */
+       mutex_lock(&the_lnet.ln_api_mutex);
+       *sen = value;
+       mutex_unlock(&the_lnet.ln_api_mutex);
+
+       return 0;
+}
+
 int
 lnet_peers_start_down(void)
 {