Whamcloud - gitweb
LU-13569 lnet: Add lnet_recovery_limit to lnetctl 17/39717/14
author Chris Horn <chris.horn@hpe.com>
Fri, 21 Aug 2020 19:27:07 +0000 (14:27 -0500)
committer Oleg Drokin <green@whamcloud.com>
Wed, 9 Dec 2020 07:48:13 +0000 (07:48 +0000)
Allow setting/reading lnet_recovery_limit via lnetctl.

Test-Parameters: trivial
HPE-bug-id: LUS-9109
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I5aac297bad15e43a52d8b8531da08a1d3f559bea
Reviewed-on: https://review.whamcloud.com/39717
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Amir Shehata <ashehata@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/utils/lnetconfig/liblnetconfig.c
lnet/utils/lnetconfig/liblnetconfig.h
lnet/utils/lnetctl.c
lustre/doc/lnetctl.8

index c584c1d..a5a3eeb 100644 (file)
@@ -2421,6 +2421,36 @@ int lustre_lnet_config_response_tracking(int val, int seq_no,
        return rc;
 }
 
+/*
+ * lustre_lnet_config_recovery_limit
+ *   Set the lnet_recovery_limit module parameter.
+ *
+ *   val    - new limit; negative values are rejected with
+ *            LUSTRE_CFG_RC_BAD_PARAM and nothing is written
+ *   seq_no - sequence number forwarded to cYAML_build_error()
+ *   err_rc - [OUT] cYAML tree describing the outcome
+ *
+ *   Returns LUSTRE_CFG_RC_BAD_PARAM for a negative value, otherwise
+ *   whatever write_sysfs_file() returned.
+ */
+int lustre_lnet_config_recovery_limit(int val, int seq_no,
+                                     struct cYAML **err_rc)
+{
+       char value_buf[LNET_MAX_STR_LEN];
+       char msg[LNET_MAX_STR_LEN];
+       int status;
+
+       if (val < 0) {
+               status = LUSTRE_CFG_RC_BAD_PARAM;
+               snprintf(msg, sizeof(msg),
+                        "\"Must be greater than or equal to 0\"");
+               goto report;
+       }
+
+       /* The parameter is written as a decimal string, NUL included. */
+       snprintf(value_buf, sizeof(value_buf), "%d", val);
+
+       status = write_sysfs_file(modparam_path, "lnet_recovery_limit",
+                                 value_buf, 1, strlen(value_buf) + 1);
+       if (status)
+               snprintf(msg, sizeof(msg),
+                        "\"cannot configure recovery limit: %s\"",
+                        strerror(errno));
+       else
+               snprintf(msg, sizeof(msg), "\"success\"");
+
+report:
+       /* Success and failure are both reported through the error tree. */
+       cYAML_build_error(status, seq_no, ADD_CMD, "recovery_limit", msg,
+                         err_rc);
+
+       return status;
+}
+
+
 int lustre_lnet_config_max_intf(int max, int seq_no, struct cYAML **err_rc)
 {
        int rc = LUSTRE_CFG_RC_NO_ERR;
@@ -3511,6 +3541,31 @@ int lustre_lnet_show_response_tracking(int seq_no, struct cYAML **show_rc,
                                       show_rc, err_rc, l_errno);
 }
 
+/*
+ * lustre_lnet_show_recovery_limit
+ *   Read the lnet_recovery_limit module parameter and hand it to
+ *   build_global_yaml_entry() under the "recovery_limit" key.
+ *
+ *   seq_no  - sequence number forwarded to build_global_yaml_entry()
+ *   show_rc - [OUT] forwarded to build_global_yaml_entry()
+ *   err_rc  - [OUT] forwarded to build_global_yaml_entry()
+ *
+ *   Returns the result of build_global_yaml_entry(). If the read fails,
+ *   the value passed on is -1 together with the negated errno.
+ */
+int lustre_lnet_show_recovery_limit(int seq_no, struct cYAML **show_rc,
+                                   struct cYAML **err_rc)
+{
+       char msg[LNET_MAX_STR_LEN];
+       char raw[LNET_MAX_STR_LEN];
+       int saved_errno = 0;
+       int limit = -1;
+       int status;
+
+       /*
+        * Default message handed to build_global_yaml_entry(); it is only
+        * replaced below when the sysfs read itself fails.
+        */
+       snprintf(msg, sizeof(msg), "\"out of memory\"");
+
+       status = read_sysfs_file(modparam_path, "lnet_recovery_limit", raw,
+                                1, sizeof(raw));
+       if (!status) {
+               limit = atoi(raw);
+       } else {
+               saved_errno = -errno;
+               snprintf(msg, sizeof(msg),
+                        "\"cannot get lnet_recovery_limit value: %d\"",
+                        status);
+       }
+
+       return build_global_yaml_entry(msg, sizeof(msg), seq_no,
+                                      "recovery_limit", limit,
+                                      show_rc, err_rc, saved_errno);
+}
+
+
 int lustre_lnet_show_max_intf(int seq_no, struct cYAML **show_rc,
                              struct cYAML **err_rc)
 {
@@ -4498,7 +4553,8 @@ static int handle_yaml_config_global_settings(struct cYAML *tree,
                                              struct cYAML **err_rc)
 {
        struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no,
-                    *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking;
+                    *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking,
+                    *recov_limit;
        int rc = 0;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
@@ -4572,6 +4628,13 @@ static int handle_yaml_config_global_settings(struct cYAML *tree,
                                                        : -1,
                                                     err_rc);
 
+       recov_limit = cYAML_get_object_item(tree, "recovery_limit");
+       if (recov_limit)
+               rc = lustre_lnet_config_recovery_limit(recov_limit->cy_valueint,
+                                                      seq_no ? seq_no->cy_valueint
+                                                       : -1,
+                                                      err_rc);
+
        return rc;
 }
 
@@ -4619,7 +4682,8 @@ static int handle_yaml_show_global_settings(struct cYAML *tree,
                                            struct cYAML **err_rc)
 {
        struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no,
-                    *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking;
+                    *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking,
+                    *recov_limit;
        int rc = 0;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
@@ -4684,6 +4748,13 @@ static int handle_yaml_show_global_settings(struct cYAML *tree,
                                                        -1,
                                                        show_rc, err_rc);
 
+       recov_limit = cYAML_get_object_item(tree, "recovery_limit");
+       if (recov_limit)
+               rc = lustre_lnet_show_recovery_limit(seq_no ?
+                                                    seq_no->cy_valueint :
+                                                    -1,
+                                                    show_rc, err_rc);
+
        return rc;
 }
 
index b6ea68b..fe9ce5d 100644 (file)
@@ -403,6 +403,10 @@ int lustre_lnet_config_response_tracking(int count, int seq_no,
                                         struct cYAML **err_rc);
 int lustre_lnet_show_response_tracking(int seq_no, struct cYAML **show_rc,
                                       struct cYAML **err_rc);
+int lustre_lnet_config_recovery_limit(int val, int seq_no,
+                                     struct cYAML **err_rc);
+int lustre_lnet_show_recovery_limit(int seq_no, struct cYAML **show_rc,
+                                   struct cYAML **err_rc);
 
 /*
  * lustre_lnet_config_max_intf
index 33bf046..2328d00 100644 (file)
@@ -84,6 +84,7 @@ static int jt_set_ni_value(int argc, char **argv);
 static int jt_set_peer_ni_value(int argc, char **argv);
 static int jt_calc_service_id(int argc, char **argv);
 static int jt_set_response_tracking(int argc, char **argv);
+static int jt_set_recovery_limit(int argc, char **argv);
 
 command_t cmd_list[] = {
        {"lnet", jt_lnet, 0, "lnet {configure | unconfigure} [--all]"},
@@ -92,7 +93,10 @@ command_t cmd_list[] = {
        {"routing", jt_routing, 0, "routing {show | help}"},
        {"set", jt_set, 0, "set {tiny_buffers | small_buffers | large_buffers"
                           " | routing | numa_range | max_interfaces"
-                          " | discovery}"},
+                          " | discovery | drop_asym_route | retry_count"
+                          " | transaction_timeout | health_sensitivity"
+                          " | recovery_interval | router_sensitivity"
+                          " | response_tracking | recovery_limit}"},
        {"import", jt_import, 0, "import FILE.yaml"},
        {"export", jt_export, 0, "export FILE.yaml"},
        {"stats", jt_stats, 0, "stats {show | help}"},
@@ -225,6 +229,10 @@ command_t set_cmds[] = {
         "\t3 - Both PUTs and GETs are eligible for response tracking (default)\n"
         "\tNote: Regardless of the value of the response_tracking parameter LNet\n"
         "\t      pings and discovery pushes always utilize response tracking\n"},
+       {"recovery_limit", jt_set_recovery_limit, 0,
+        "Set how long LNet will attempt to recover unhealthy interfaces.\n"
+        "\t0 - Recover indefinitely (default)\n"
+        "\t>0 - Recover for the specified number of seconds.\n"},
        { 0, 0, 0, NULL }
 };
 
@@ -372,6 +380,35 @@ static int jt_set_response_tracking(int argc, char **argv)
        return rc;
 }
 
+/*
+ * Handler for "lnetctl set recovery_limit <value>".
+ *
+ * Validates the command line with check_cmd(), parses argv[1] as a long
+ * and passes it to lustre_lnet_config_recovery_limit(). Any error tree
+ * produced along the way is printed to stderr and then freed.
+ *
+ * Returns the check_cmd() result if that fails, -1 if the value cannot
+ * be parsed or does not fit in an int, otherwise the result of
+ * lustre_lnet_config_recovery_limit().
+ */
+static int jt_set_recovery_limit(int argc, char **argv)
+{
+       long int value;
+       int rc;
+       struct cYAML *err_rc = NULL;
+
+       rc = check_cmd(set_cmds, "set", "recovery_limit", 2, argc, argv);
+       if (rc)
+               return rc;
+
+       rc = parse_long(argv[1], &value);
+       /*
+        * The library call takes an int. Reject values that would change
+        * when narrowed (e.g. 4294967296 could become 0, which means
+        * "recover indefinitely") instead of silently truncating them.
+        */
+       if (rc != 0 || (long int)(int)value != value) {
+               cYAML_build_error(-1, -1, "parser", "set",
+                                 "cannot parse recovery_limit value",
+                                 &err_rc);
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+               return -1;
+       }
+
+       rc = lustre_lnet_config_recovery_limit((int)value, -1, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR)
+               cYAML_print_tree2file(stderr, err_rc);
+
+       cYAML_free_tree(err_rc);
+
+       return rc;
+}
+
+
 static int jt_set_max_intf(int argc, char **argv)
 {
        long int value;
@@ -1418,6 +1455,12 @@ static int jt_show_global(int argc, char **argv)
                goto out;
        }
 
+       rc = lustre_lnet_show_recovery_limit(-1, &show_rc, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_print_tree2file(stderr, err_rc);
+               goto out;
+       }
+
        if (show_rc)
                cYAML_print_tree(show_rc);
 
@@ -1754,6 +1797,13 @@ static int jt_export(int argc, char **argv)
                err_rc = NULL;
        }
 
+       rc = lustre_lnet_show_recovery_limit(-1, &show_rc, &err_rc);
+       if (rc != LUSTRE_CFG_RC_NO_ERR) {
+               cYAML_print_tree2file(stderr, err_rc);
+               cYAML_free_tree(err_rc);
+               err_rc = NULL;
+       }
+
        if (show_rc != NULL) {
                cYAML_print_tree2file(f, show_rc);
                cYAML_free_tree(show_rc);
index 6ad711f..6e04865 100644 (file)
@@ -261,6 +261,12 @@ Set the behavior of response tracking\.
   Note: Regardless of the value of the response_tracking parameter LNet
         pings and discovery pushes always utilize response tracking\.
 .
+.TP
+\fBlnetctl set\fR recovery_limit \fIvalue\fR
+Set how long LNet will attempt to recover unhealthy peer interfaces\.
+  0 - Recover indefinitely (default)\.
+  >0 - Recover for the specified number of seconds\.
+.
 .SS "Import and Export YAML Configuration Files"
 LNet configuration can be represented in YAML format\. A YAML configuration
 file can be passed to the lnetctl utility via the \fBimport\fR command\. The