Whamcloud - gitweb
LU-11300 lnet: configure lnet router sensitivity 55/33455/29
author Amir Shehata <ashehata@whamcloud.com>
Tue, 23 Oct 2018 04:25:33 +0000 (21:25 -0700)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 7 Jun 2019 18:13:11 +0000 (18:13 +0000)
Allow the configuration of router_sensitivity_percentage from the
user space utility lnetctl

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: If5440f30881361ebb06dafa9cadb7cbc2b934f93
Reviewed-on: https://review.whamcloud.com/33455
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Sebastien Buisson <sbuisson@ddn.com>
Reviewed-by: Chris Horn <hornc@cray.com>
Tested-by: Jenkins
lnet/utils/lnetconfig/liblnetconfig.c
lnet/utils/lnetconfig/liblnetconfig.h
lnet/utils/lnetctl.c

index bdcda35..b9bc07d 100644 (file)
@@ -2570,6 +2570,28 @@ int lustre_lnet_config_recov_intrv(int intrv, int seq_no, struct cYAML **err_rc)
        return rc;
 }
 
+/*
+ * lustre_lnet_config_rtr_sensitivity
+ *   Formats 'sen' as a decimal string, writes it to the
+ *   router_sensitivity_percentage entry under modparam_path and records
+ *   the outcome in err_rc through cYAML_build_error().
+ *
+ *   sen - router sensitivity percentage to configure
+ *   seq_no - sequence number of the request
+ *   err_rc - [OUT] struct cYAML tree describing the result
+ *
+ *   Returns the value returned by write_sysfs_file().
+ */
+int lustre_lnet_config_rtr_sensitivity(int sen, int seq_no,
+                                      struct cYAML **err_rc)
+{
+       char sen_str[LNET_MAX_STR_LEN];
+       char msg[LNET_MAX_STR_LEN];
+       int status;
+
+       snprintf(sen_str, sizeof(sen_str), "%d", sen);
+
+       /* the last argument counts the terminating NUL too (strlen + 1) */
+       status = write_sysfs_file(modparam_path,
+                                 "router_sensitivity_percentage", sen_str,
+                                 1, strlen(sen_str) + 1);
+       if (status == 0)
+               snprintf(msg, sizeof(msg), "\"success\"");
+       else
+               snprintf(msg, sizeof(msg),
+                        "\"cannot configure router health sensitivity: %s\"",
+                        strerror(errno));
+
+       cYAML_build_error(status, seq_no, ADD_CMD, "router_sensitivity",
+                         msg, err_rc);
+
+       return status;
+}
+
 int lustre_lnet_config_hsensitivity(int sen, int seq_no, struct cYAML **err_rc)
 {
        int rc = LUSTRE_CFG_RC_NO_ERR;
@@ -3498,6 +3520,31 @@ int lustre_lnet_show_hsensitivity(int seq_no, struct cYAML **show_rc,
                                       err_rc, l_errno);
 }
 
+/*
+ * lustre_lnet_show_rtr_sensitivity
+ *   Reads the router_sensitivity_percentage entry under modparam_path and
+ *   passes the parsed value (or -1 if the read failed) to
+ *   build_global_yaml_entry() under the "router_sensitivity" key.
+ *
+ *   seq_no - sequence number of the request
+ *   show_rc - [OUT] struct cYAML tree receiving the value
+ *   err_rc - [OUT] struct cYAML tree describing the error
+ *
+ *   Returns the value returned by build_global_yaml_entry().
+ */
+int lustre_lnet_show_rtr_sensitivity(int seq_no, struct cYAML **show_rc,
+                                    struct cYAML **err_rc)
+{
+       char err_str[LNET_MAX_STR_LEN];
+       char buf[LNET_MAX_STR_LEN];
+       int l_errno = 0;
+       int percent = -1;
+       int rc;
+
+       rc = read_sysfs_file(modparam_path, "router_sensitivity_percentage",
+                            buf, 1, sizeof(buf));
+       if (rc == 0) {
+               /*
+                * Default message handed to build_global_yaml_entry();
+                * presumably used if it fails to allocate - TODO confirm.
+                */
+               snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+               percent = atoi(buf);
+       } else {
+               /* errno is captured right after the failed read */
+               l_errno = -errno;
+               snprintf(err_str, sizeof(err_str),
+                        "\"cannot get router sensitivity percentage: %d\"", rc);
+       }
+
+       return build_global_yaml_entry(err_str, sizeof(err_str), seq_no,
+                                      "router_sensitivity", percent,
+                                      show_rc, err_rc, l_errno);
+}
+
 int lustre_lnet_show_transaction_to(int seq_no, struct cYAML **show_rc,
                                    struct cYAML **err_rc)
 {
@@ -4626,7 +4673,7 @@ static int handle_yaml_config_global_settings(struct cYAML *tree,
                                              struct cYAML **err_rc)
 {
        struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no,
-                    *sen, *recov, *drop_asym_route;
+                    *sen, *recov, *rsen, *drop_asym_route;
        int rc = 0;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
@@ -4686,6 +4733,13 @@ static int handle_yaml_config_global_settings(struct cYAML *tree,
                                                        : -1,
                                                    err_rc);
 
+       rsen = cYAML_get_object_item(tree, "router_sensitivity");
+       if (rsen)
+               rc = lustre_lnet_config_rtr_sensitivity(rsen->cy_valueint,
+                                                    seq_no ? seq_no->cy_valueint
+                                                       : -1,
+                                                    err_rc);
+
        return rc;
 }
 
@@ -4733,7 +4787,7 @@ static int handle_yaml_show_global_settings(struct cYAML *tree,
                                            struct cYAML **err_rc)
 {
        struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no,
-                    *sen, *recov, *drop_asym_route;
+                    *sen, *recov, *rsen, *drop_asym_route;
        int rc = 0;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
@@ -4785,6 +4839,12 @@ static int handle_yaml_show_global_settings(struct cYAML *tree,
                                                        : -1,
                                                  show_rc, err_rc);
 
+       /* "router_sensitivity" selects the router sensitivity show call */
+       rsen = cYAML_get_object_item(tree, "router_sensitivity");
+       if (rsen)
+               rc = lustre_lnet_show_rtr_sensitivity(
+                       seq_no ? seq_no->cy_valueint : -1, show_rc, err_rc);
+
        return rc;
 }
 
index 8a5e6ac..a564694 100644 (file)
@@ -279,6 +279,18 @@ int lustre_lnet_show_recov_intrv(int seq_no, struct cYAML **show_rc,
                                 struct cYAML **err_rc);
 
 /*
+ * lustre_lnet_config_rtr_sensitivity
+ *   sets the router sensitivity percentage. If the health percentage
+ *   of a router interface drops below this value, it is considered failed
+ *
+ *   sen - sensitivity value to configure
+ *   seq_no - sequence number of the request
+ *   err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *   caller
+ */
+int lustre_lnet_config_rtr_sensitivity(int sen, int seq_no, struct cYAML **err_rc);
+
+/*
  * lustre_lnet_config_hsensitivity
  *   sets the health sensitivity; the value by which to decrement the
  *   health value of a local or peer NI. If 0 then health is turned off
@@ -303,6 +315,18 @@ int lustre_lnet_show_hsensitivity(int seq_no, struct cYAML **show_rc,
                                  struct cYAML **err_rc);
 
 /*
+ * lustre_lnet_show_rtr_sensitivity
+ *   shows the router sensitivity percentage in the system
+ *
+ *   seq_no - sequence number of the request
+ *   show_rc - [OUT] struct cYAML tree containing router sensitivity info
+ *   err_rc - [OUT] struct cYAML tree describing the error. Freed by
+ *   caller
+ */
+int lustre_lnet_show_rtr_sensitivity(int seq_no, struct cYAML **show_rc,
+                                    struct cYAML **err_rc);
+
+/*
  * lustre_lnet_config_transaction_to
  *   sets the timeout after which a message expires or a timeout event is
  *   propagated for an expired response.
index c503223..00b3bc9 100644 (file)
@@ -57,6 +57,7 @@ static int jt_set_numa(int argc, char **argv);
 static int jt_set_retry_count(int argc, char **argv);
 static int jt_set_transaction_to(int argc, char **argv);
 static int jt_set_recov_intrv(int argc, char **argv);
+static int jt_set_rtr_sensitivity(int argc, char **argv);
 static int jt_set_hsensitivity(int argc, char **argv);
 static int jt_add_peer_nid(int argc, char **argv);
 static int jt_del_peer_nid(int argc, char **argv);
@@ -208,6 +209,9 @@ command_t set_cmds[] = {
         "\t>0 - sensitivity value not more than 1000\n"},
        {"recovery_interval", jt_set_recov_intrv, 0, "interval to ping in seconds (at least 1)\n"
         "\t>0 - time in seconds between pings\n"},
+       {"router_sensitivity", jt_set_rtr_sensitivity, 0, "router sensitivity %\n"
+        "\t100 - router interfaces need to be fully healthy to be used\n"
+        "\t<100 - router interfaces can be used even if not healthy\n"},
        { 0, 0, 0, NULL }
 };
 
@@ -394,6 +398,34 @@ static int jt_set_recov_intrv(int argc, char **argv)
        return rc;
 }
 
+/*
+ * jt_set_rtr_sensitivity
+ *   "set router_sensitivity" command handler: parses argv[1] as the new
+ *   router sensitivity percentage and hands it to
+ *   lustre_lnet_config_rtr_sensitivity().
+ *
+ *   Returns the non-zero result of check_cmd() if there is one, -1 if
+ *   argv[1] cannot be parsed or does not fit in an int, otherwise the
+ *   return code of lustre_lnet_config_rtr_sensitivity(). Failures are
+ *   printed to stderr with cYAML_print_tree2file().
+ */
+static int jt_set_rtr_sensitivity(int argc, char **argv)
+{
+       long int value;
+       int rc;
+       struct cYAML *err_rc = NULL;
+
+       rc = check_cmd(set_cmds, "set", "router_sensitivity", 2, argc, argv);
+       if (rc)
+               return rc;
+
+       rc = parse_long(argv[1], &value);
+       if (rc != 0) {
+               cYAML_build_error(-1, -1, "parser", "set",
+                                 "cannot parse router sensitivity value", &err_rc);
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+               return -1;
+       }
+
+       /*
+        * The library call takes an int. Refuse a value that the narrowing
+        * conversion would alter instead of silently configuring a
+        * different number than the one requested.
+        */
+       if (value != (long int)(int)value) {
+               cYAML_build_error(-1, -1, "parser", "set",
+                                 "router sensitivity value out of range", &err_rc);
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+               return -1;
+       }
+
+       rc = lustre_lnet_config_rtr_sensitivity((int)value, -1, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
 static int jt_set_hsensitivity(int argc, char **argv)
 {
        long int value;
@@ -1299,6 +1331,12 @@ static int jt_show_global(int argc, char **argv)
                goto out;
        }
 
+       rc = lustre_lnet_show_rtr_sensitivity(-1, &show_rc, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_print_tree2file(stderr, err_rc);
+               goto out;
+       }
+
        if (show_rc)
                cYAML_print_tree(show_rc);