From 3e5c6620fd0b0511498d14d38e8610d08f6da7b3 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Fri, 21 Aug 2020 14:27:07 -0500 Subject: [PATCH] LU-13569 lnet: Add lnet_recovery_limit to lnetctl Allow setting/reading lnet_recovery_limit via lnetctl. Test-Parameters: trivial HPE-bug-id: LUS-9109 Signed-off-by: Chris Horn Change-Id: I5aac297bad15e43a52d8b8531da08a1d3f559bea Reviewed-on: https://review.whamcloud.com/39717 Reviewed-by: Serguei Smirnov Reviewed-by: Amir Shehata Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- lnet/utils/lnetconfig/liblnetconfig.c | 75 ++++++++++++++++++++++++++++++++++- lnet/utils/lnetconfig/liblnetconfig.h | 4 ++ lnet/utils/lnetctl.c | 52 +++++++++++++++++++++++- lustre/doc/lnetctl.8 | 6 +++ 4 files changed, 134 insertions(+), 3 deletions(-) diff --git a/lnet/utils/lnetconfig/liblnetconfig.c b/lnet/utils/lnetconfig/liblnetconfig.c index c584c1d..a5a3eeb 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.c +++ b/lnet/utils/lnetconfig/liblnetconfig.c @@ -2421,6 +2421,36 @@ int lustre_lnet_config_response_tracking(int val, int seq_no, return rc; } +int lustre_lnet_config_recovery_limit(int val, int seq_no, + struct cYAML **err_rc) +{ + int rc = LUSTRE_CFG_RC_NO_ERR; + char err_str[LNET_MAX_STR_LEN]; + char val_str[LNET_MAX_STR_LEN]; + + if (val < 0) { + rc = LUSTRE_CFG_RC_BAD_PARAM; + snprintf(err_str, sizeof(err_str), + "\"Must be greater than or equal to 0\""); + } else { + snprintf(err_str, sizeof(err_str), "\"success\""); + + snprintf(val_str, sizeof(val_str), "%d", val); + + rc = write_sysfs_file(modparam_path, "lnet_recovery_limit", + val_str, 1, strlen(val_str) + 1); + if (rc) + snprintf(err_str, sizeof(err_str), + "\"cannot configure recovery limit: %s\"", + strerror(errno)); + } + + cYAML_build_error(rc, seq_no, ADD_CMD, "recovery_limit", err_str, + err_rc); + + return rc; +} + int lustre_lnet_config_max_intf(int max, int seq_no, struct cYAML **err_rc) { int rc = LUSTRE_CFG_RC_NO_ERR; @@ -3511,6 +3541,31 @@ int lustre_lnet_show_response_tracking(int seq_no, struct cYAML **show_rc, show_rc, err_rc, l_errno); } +int lustre_lnet_show_recovery_limit(int seq_no, struct cYAML **show_rc, + struct cYAML **err_rc) +{ + int rc = LUSTRE_CFG_RC_OUT_OF_MEM; + char val[LNET_MAX_STR_LEN]; + int recov_limit = -1, l_errno = 0; + char err_str[LNET_MAX_STR_LEN]; + + snprintf(err_str, sizeof(err_str), "\"out of memory\""); + + rc = read_sysfs_file(modparam_path, "lnet_recovery_limit", val, + 1, sizeof(val)); + if (rc) { + l_errno = -errno; + snprintf(err_str, sizeof(err_str), + "\"cannot get lnet_recovery_limit value: %d\"", rc); + } else { + recov_limit = atoi(val); + } + + return build_global_yaml_entry(err_str, sizeof(err_str), seq_no, + "recovery_limit", recov_limit, + show_rc, err_rc, l_errno); +} + int lustre_lnet_show_max_intf(int seq_no, struct cYAML **show_rc, struct cYAML **err_rc) { @@ -4498,7 +4553,8 @@ static int handle_yaml_config_global_settings(struct cYAML *tree, struct cYAML **err_rc) { struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no, - *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking; + *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking, + *recov_limit; int rc = 0; seq_no = cYAML_get_object_item(tree, "seq_no"); @@ -4572,6 +4628,13 @@ static int handle_yaml_config_global_settings(struct cYAML *tree, : -1, err_rc); + recov_limit = cYAML_get_object_item(tree, "recovery_limit"); + if (recov_limit) + rc = lustre_lnet_config_recovery_limit(recov_limit->cy_valueint, + seq_no ? seq_no->cy_valueint + : -1, + err_rc); + return rc; } @@ -4619,7 +4682,8 @@ static int handle_yaml_show_global_settings(struct cYAML *tree, struct cYAML **err_rc) { struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no, - *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking; + *sen, *recov, *rsen, *drop_asym_route, *rsp_tracking, + *recov_limit; int rc = 0; seq_no = cYAML_get_object_item(tree, "seq_no"); @@ -4684,6 +4748,13 @@ static int handle_yaml_show_global_settings(struct cYAML *tree, -1, show_rc, err_rc); + recov_limit = cYAML_get_object_item(tree, "recovery_limit"); + if (recov_limit) + rc = lustre_lnet_show_recovery_limit(seq_no ? + seq_no->cy_valueint : + -1, + show_rc, err_rc); + return rc; } diff --git a/lnet/utils/lnetconfig/liblnetconfig.h b/lnet/utils/lnetconfig/liblnetconfig.h index b6ea68b..fe9ce5d 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.h +++ b/lnet/utils/lnetconfig/liblnetconfig.h @@ -403,6 +403,10 @@ int lustre_lnet_config_response_tracking(int count, int seq_no, struct cYAML **err_rc); int lustre_lnet_show_response_tracking(int seq_no, struct cYAML **show_rc, struct cYAML **err_rc); +int lustre_lnet_config_recovery_limit(int val, int seq_no, + struct cYAML **err_rc); +int lustre_lnet_show_recovery_limit(int seq_no, struct cYAML **show_rc, + struct cYAML **err_rc); /* * lustre_lnet_config_max_intf diff --git a/lnet/utils/lnetctl.c b/lnet/utils/lnetctl.c index 33bf046..2328d00 100644 --- a/lnet/utils/lnetctl.c +++ b/lnet/utils/lnetctl.c @@ -84,6 +84,7 @@ static int jt_set_ni_value(int argc, char **argv); static int jt_set_peer_ni_value(int argc, char **argv); static int jt_calc_service_id(int argc, char **argv); static int jt_set_response_tracking(int argc, char **argv); +static int jt_set_recovery_limit(int argc, char **argv); command_t cmd_list[] = { {"lnet", jt_lnet, 0, "lnet {configure | unconfigure} [--all]"}, @@ -92,7 +93,10 @@ command_t cmd_list[] = { {"routing", jt_routing, 0, "routing {show | help}"}, {"set", jt_set, 0, "set {tiny_buffers | small_buffers | large_buffers" " | routing | numa_range | max_interfaces" - " | discovery}"}, + " | discovery | drop_asym_route | retry_count" + " | transaction_timeout | health_sensitivity" + " | recovery_interval | router_sensitivity" + " | response_tracking | recovery_limit}"}, {"import", jt_import, 0, "import FILE.yaml"}, {"export", jt_export, 0, "export FILE.yaml"}, {"stats", jt_stats, 0, "stats {show | help}"}, @@ -225,6 +229,10 @@ command_t set_cmds[] = { "\t3 - Both PUTs and GETs are eligible for response tracking (default)\n" "\tNote: Regardless of the value of the response_tracking parameter LNet\n" "\t pings and discovery pushes always utilize response tracking\n"}, + {"recovery_limit", jt_set_recovery_limit, 0, + "Set how long LNet will attempt to recover unhealthy interfaces.\n" + "\t0 - Recover indefinitely (default)\n" + "\t>0 - Recover for the specified number of seconds.\n"}, { 0, 0, 0, NULL } }; @@ -372,6 +380,35 @@ static int jt_set_response_tracking(int argc, char **argv) return rc; } +static int jt_set_recovery_limit(int argc, char **argv) +{ + long int value; + int rc; + struct cYAML *err_rc = NULL; + + rc = check_cmd(set_cmds, "set", "recovery_limit", 2, argc, argv); + if (rc) + return rc; + + rc = parse_long(argv[1], &value); + if (rc != 0) { + cYAML_build_error(-1, -1, "parser", "set", + "cannot parse recovery_limit value", + &err_rc); + cYAML_print_tree2file(stderr, err_rc); + cYAML_free_tree(err_rc); + return -1; + } + + rc = lustre_lnet_config_recovery_limit(value, -1, &err_rc); + if (rc != LUSTRE_CFG_RC_NO_ERR) + cYAML_print_tree2file(stderr, err_rc); + + cYAML_free_tree(err_rc); + + return rc; +} + static int jt_set_max_intf(int argc, char **argv) { long int value; @@ -1418,6 +1455,12 @@ static int jt_show_global(int argc, char **argv) goto out; } + rc = lustre_lnet_show_recovery_limit(-1, &show_rc, &err_rc); + if (rc != LUSTRE_CFG_RC_NO_ERR) { + cYAML_print_tree2file(stderr, err_rc); + goto out; + } + if (show_rc) cYAML_print_tree(show_rc); @@ -1754,6 +1797,13 @@ static int jt_export(int argc, char **argv) err_rc = NULL; } + rc = lustre_lnet_show_recovery_limit(-1, &show_rc, &err_rc); + if (rc != LUSTRE_CFG_RC_NO_ERR) { + cYAML_print_tree2file(stderr, err_rc); + cYAML_free_tree(err_rc); + err_rc = NULL; + } + if (show_rc != NULL) { cYAML_print_tree2file(f, show_rc); cYAML_free_tree(show_rc); diff --git a/lustre/doc/lnetctl.8 b/lustre/doc/lnetctl.8 index 6ad711f..6e04865 100644 --- a/lustre/doc/lnetctl.8 +++ b/lustre/doc/lnetctl.8 @@ -261,6 +261,12 @@ Set the behavior of response tracking\. Note: Regardless of the value of the response_tracking parameter LNet pings and discovery pushes always utilize response tracking\. . +.TP +\fBlnetctl set\fR recovery_limit \fIvalue\fR +Set how long LNet will attempt to recover unhealthy peer interfaces\. + 0 - Recover indefinitely (default)\. + >0 - Recover for the specified number of seconds\. +. .SS "Import and Export YAML Configuration Files" LNet configuration can be represented in YAML format\. A YAML configuration file can be passed to the lnetctl utility via the \fBimport\fR command\. The -- 1.8.3.1