extern unsigned int lnet_recovery_limit;
extern unsigned int lnet_peer_discovery_disabled;
extern unsigned int lnet_drop_asym_route;
+extern unsigned int lnet_max_recovery_ping_interval;
+extern unsigned int lnet_max_recovery_ping_count;
extern unsigned int router_sensitivity_percentage;
extern int alive_router_check_interval;
extern int live_router_check_interval;
return false;
}
-#define LNET_RECOVERY_INTERVAL_MAX 900
static inline unsigned int
lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now)
{
unsigned int interval;
- /* 2^9 = 512, 2^10 = 1024 */
- if (ping_count > 9)
- interval = LNET_RECOVERY_INTERVAL_MAX;
+ /* 2^lnet_max_recovery_ping_count <= lnet_max_recovery_ping_interval */
+ if (ping_count > lnet_max_recovery_ping_count)
+ interval = lnet_max_recovery_ping_interval;
else
interval = 1 << ping_count;
MODULE_PARM_DESC(lnet_recovery_limit,
"How long to attempt recovery of unhealthy peer interfaces in seconds. Set to 0 to allow indefinite recovery");
+/* Upper bound, in seconds, on the interval between recovery pings. */
+unsigned int lnet_max_recovery_ping_interval = 900;
+/* Largest ping count whose 2^count backoff still fits within the bound
+ * above (2^9 = 512 <= 900); recomputed by max_recovery_ping_interval_set().
+ */
+unsigned int lnet_max_recovery_ping_count = 9;
+static int max_recovery_ping_interval_set(const char *val,
+					  cfs_kernel_param_arg_t *kp);
+
+/* The backing variable is an unsigned int, so type-check it and print it
+ * as one rather than as a signed int.
+ */
+#define param_check_max_recovery_ping_interval(name, p) \
+		__param_check(name, p, unsigned int)
+
+#ifdef HAVE_KERNEL_PARAM_OPS
+static struct kernel_param_ops param_ops_max_recovery_ping_interval = {
+	.set = max_recovery_ping_interval_set,
+	.get = param_get_uint,
+};
+module_param(lnet_max_recovery_ping_interval, max_recovery_ping_interval, 0644);
+#else
+/* module_param_call() takes the setter function itself, not the suffix
+ * used to build the param_ops_* name.
+ */
+module_param_call(lnet_max_recovery_ping_interval,
+		  max_recovery_ping_interval_set, param_get_uint,
+		  &lnet_max_recovery_ping_interval, 0644);
+#endif
+MODULE_PARM_DESC(lnet_max_recovery_ping_interval,
+		 "The max interval between LNet recovery pings, in seconds");
+
static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
}
static int
+/*
+ * Module parameter setter for lnet_max_recovery_ping_interval.
+ *
+ * Parses \a val as an unsigned integer number of seconds, stores it as the
+ * new maximum recovery ping interval and derives
+ * lnet_max_recovery_ping_count = floor(log2(value)) from it.
+ *
+ * Returns 0 on success, the kstrtoul() error if \a val cannot be parsed,
+ * -EINVAL if the value is zero, or -ERANGE if it exceeds INT_MAX.
+ */
+max_recovery_ping_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	int rc;
+	unsigned long value;
+	unsigned long remaining;
+	unsigned int count = 0;
+
+	rc = kstrtoul(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'lnet_max_recovery_ping_interval'\n");
+		return rc;
+	}
+
+	if (!value) {
+		CERROR("Invalid max ping timeout. Must be strictly positive\n");
+		return -EINVAL;
+	}
+
+	/* The interval is stored in an unsigned int and the derived count is
+	 * used as a shift amount on an int (1 << ping_count). Bounding the
+	 * value by INT_MAX prevents truncation on store and keeps the count
+	 * at 30 or below, so that shift stays defined.
+	 */
+	if (value > INT_MAX) {
+		CERROR("Invalid max ping timeout. Must not exceed %d\n",
+		       INT_MAX);
+		return -ERANGE;
+	}
+
+	/* count = floor(log2(value)), i.e. 2^count <= value < 2^(count + 1) */
+	for (remaining = value >> 1; remaining; remaining >>= 1)
+		count++;
+
+	/* The purpose of locking the api_mutex here is to ensure that
+	 * the correct value ends up stored properly. The count is computed
+	 * above and published in one assignment so the shared variable is
+	 * never left at an intermediate value.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+	lnet_max_recovery_ping_interval = value;
+	lnet_max_recovery_ping_count = count;
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
+static int
discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
{
int rc;
return rc;
}
+/*
+ * Write a new maximum recovery ping interval to the
+ * lnet_max_recovery_ping_interval module parameter file.
+ *
+ * Non-positive intervals are rejected with LUSTRE_CFG_RC_BAD_PARAM without
+ * touching the parameter file. In every case the outcome is recorded
+ * through cYAML_build_error() and the same code is returned.
+ */
+int lustre_lnet_config_max_recovery_ping_interval(int interval, int seq_no,
+						  struct cYAML **err_rc)
+{
+	char interval_str[LNET_MAX_STR_LEN];
+	char err_str[LNET_MAX_STR_LEN] = "\"success\"";
+	int rc;
+
+	/* Only strictly positive intervals are accepted. */
+	if (interval <= 0) {
+		rc = LUSTRE_CFG_RC_BAD_PARAM;
+		snprintf(err_str, sizeof(err_str),
+			 "\"must be strictly positive\"");
+		goto out;
+	}
+
+	snprintf(interval_str, sizeof(interval_str), "%d", interval);
+
+	/* The length handed over includes the terminating NUL. */
+	rc = write_sysfs_file(modparam_path,
+			      "lnet_max_recovery_ping_interval",
+			      interval_str, 1,
+			      strlen(interval_str) + 1);
+	if (rc != 0)
+		snprintf(err_str, sizeof(err_str),
+			 "\"cannot configure maximum recovery ping interval: %s\"",
+			 strerror(errno));
+
+out:
+	cYAML_build_error(rc, seq_no, ADD_CMD, "maximum recovery ping interval",
+			  err_str, err_rc);
+
+	return rc;
+}
+
+
int lustre_lnet_show_routing(int seq_no, struct cYAML **show_rc,
struct cYAML **err_rc, bool backup)
{
show_rc, err_rc, l_errno);
}
+/*
+ * Read the lnet_max_recovery_ping_interval module parameter file and hand
+ * the value to build_global_yaml_entry() under the
+ * "max_recovery_ping_interval" key.
+ *
+ * If the read fails, the interval is passed on as -1 together with the
+ * negated errno and a message describing the failure.
+ */
+int lustre_lnet_show_max_recovery_ping_interval(int seq_no,
+						struct cYAML **show_rc,
+						struct cYAML **err_rc)
+{
+	char err_str[LNET_MAX_STR_LEN];
+	char val[LNET_MAX_STR_LEN];
+	int interval = -1;
+	int l_errno = 0;
+	int rc;
+
+	/* Default message; replaced below if the parameter read fails. */
+	snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+
+	rc = read_sysfs_file(modparam_path, "lnet_max_recovery_ping_interval",
+			     val, 1, sizeof(val));
+	if (rc == 0) {
+		interval = atoi(val);
+	} else {
+		l_errno = -errno;
+		snprintf(err_str, sizeof(err_str),
+			 "\"cannot get lnet_max_recovery_ping_interval value: %d\"",
+			 rc);
+	}
+
+	return build_global_yaml_entry(err_str, sizeof(err_str), seq_no,
+				       "max_recovery_ping_interval", interval,
+				       show_rc, err_rc, l_errno);
+}
+
+
int lustre_lnet_show_max_intf(int seq_no, struct cYAML **show_rc,
struct cYAML **err_rc)
{
struct cYAML **err_rc);
int lustre_lnet_show_recovery_limit(int seq_no, struct cYAML **show_rc,
struct cYAML **err_rc);
+int lustre_lnet_show_max_recovery_ping_interval(int seq_no,
+ struct cYAML **show_rc,
+ struct cYAML **err_rc);
/*
* lustre_lnet_config_max_intf
int seq_no, struct cYAML **err_rc);
/*
+ * lustre_lnet_config_max_recovery_ping_interval
+ * Set the maximum recovery ping interval.
+ *
+ * interval - interval value in seconds
+ * seq_no - sequence number of the request
+ * err_rc - [OUT] struct cYAML tree describing the error. Freed by caller
+ */
+int lustre_lnet_config_max_recovery_ping_interval(int interval, int seq_no,
+ struct cYAML **err_rc);
+
+/*
* lustre_lnet_show_routing
* Send down an IOCTL to dump buffers and routing status
* This function is used to dump buffers for all CPU partitions.
static int jt_set_recov_intrv(int argc, char **argv);
static int jt_set_rtr_sensitivity(int argc, char **argv);
static int jt_set_hsensitivity(int argc, char **argv);
+static int jt_set_max_recovery_ping_interval(int argc, char **argv);
static int jt_reset_stats(int argc, char **argv);
static int jt_add_peer_nid(int argc, char **argv);
static int jt_del_peer_nid(int argc, char **argv);
"Set how long LNet will attempt to recover unhealthy interfaces.\n"
"\t0 - Recover indefinitely (default)\n"
"\t>0 - Recover for the specified number of seconds.\n"},
+ {"max_recovery_ping_interval", jt_set_max_recovery_ping_interval, 0,
+ "maximum recovery ping interval\n"
+ "\t>0 - maximum recovery ping interval in seconds\n"},
{ 0, 0, 0, NULL }
};
return rc;
}
+/*
+ * Handler for "lnetctl set max_recovery_ping_interval <value>".
+ *
+ * Parses argv[1] as an integer and passes it to
+ * lustre_lnet_config_max_recovery_ping_interval(). Returns the check_cmd()
+ * result if that check fails, -1 if the value cannot be parsed or does not
+ * fit in an int, and otherwise the result of the config call. Any error
+ * tree is printed to stderr.
+ */
+static int jt_set_max_recovery_ping_interval(int argc, char **argv)
+{
+	long int value;
+	int rc;
+	struct cYAML *err_rc = NULL;
+
+	/* NOTE(review): the label passed here differs from the registered
+	 * command name "max_recovery_ping_interval" - confirm check_cmd()
+	 * does not need them to match.
+	 */
+	rc = check_cmd(set_cmds, "set", "maximum recovery_interval", 2,
+		       argc, argv);
+	if (rc)
+		return rc;
+
+	rc = parse_long(argv[1], &value);
+	/* The config API takes an int. A value that does not survive the
+	 * narrowing is reported as unparsable instead of being silently
+	 * truncated into a different, possibly valid, interval.
+	 */
+	if (rc != 0 || (long int)(int)value != value) {
+		cYAML_build_error(-1, -1, "parser", "set",
+				  "cannot parse maximum recovery interval value",
+				  &err_rc);
+		cYAML_print_tree2file(stderr, err_rc);
+		cYAML_free_tree(err_rc);
+		return -1;
+	}
+
+	rc = lustre_lnet_config_max_recovery_ping_interval((int)value, -1,
+							   &err_rc);
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+
+	cYAML_free_tree(err_rc);
+
+	return rc;
+}
+
+
static int jt_config_lnet(int argc, char **argv)
{
struct cYAML *err_rc = NULL;
goto out;
}
+ rc = lustre_lnet_show_max_recovery_ping_interval(-1, &show_rc, &err_rc);
+ if (rc != LUSTRE_CFG_RC_NO_ERR) {
+ cYAML_print_tree2file(stderr, err_rc);
+ goto out;
+ }
+
if (show_rc)
cYAML_print_tree(show_rc);
err_rc = NULL;
}
+ rc = lustre_lnet_show_max_recovery_ping_interval(-1, &show_rc, &err_rc);
+ if (rc != LUSTRE_CFG_RC_NO_ERR) {
+ cYAML_print_tree2file(stderr, err_rc);
+ cYAML_free_tree(err_rc);
+ err_rc = NULL;
+ }
+
rc = lustre_lnet_show_udsp(-1, -1, &show_rc, &err_rc);
if (rc != LUSTRE_CFG_RC_NO_ERR) {
cYAML_print_tree2file(stderr, err_rc);
0 - Recover indefinitely (default)\.
>0 - Recover for the specified number of seconds\.
.
+.TP
+\fBlnetctl set\fR max_recovery_ping_interval \fIvalue\fR
+Set the maximum recovery ping interval.
+The recovery ping mechanism doubles the delay before each successive recovery
+ping attempt until doubling again would exceed the value set; from then on
+every recovery ping is scheduled at the value set.
+The default value is 900 seconds.
+.
.SS "Import and Export YAML Configuration Files"
LNet configuration can be represented in YAML format\. A YAML configuration
file can be passed to the lnetctl utility via the \fBimport\fR command\. The
# If the recovery limit is 10 seconds, then when the 5th enqueue happens
# we expect the peer NI to have aged out, so it will not actually be
# queued.
+# If max_recovery_ping_interval is set to 2 then:
+# First enqueue happens at time 0.
+# 2nd at 0 + 2^0 = 1
+# 3rd at 1 + 2^1 = 3
+# 4th at 3 + 2^1 = 5
+# 5th at 5 + 2^1 = 7
+# 6th at 7 + 2^1 = 9
+# 7th at 9 + 2^1 = 11
+# e.g. after 4 seconds we would expect to have seen the 3rd enqueue,
+# (2 pings sent, 3rd about to happen), and the 4th enqueue is yet to happen
+# e.g. after 10 seconds we would expect to have seen the 6th enqueue,
+# (5 pings sent, 6th about to happen), and the 7th enqueue is yet to happen
check_ping_count() {
local queue="$1"
local expect="$2"
do_lnetctl discover $prim_nid ||
error "failed to discover myself"
+ local default=$($LNETCTL global show |
+ awk '/recovery_limit/{print $NF}')
# Set recovery limit to 10 seconds.
do_lnetctl set recovery_limit 10 ||
error "failed to set recovery_limit"
# Use local_error so LNet doesn't attempt to resend the discovery ping
$LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e local_error
$LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e local_error
- do_lnetctl discover $($LCTL list_nids | head -n 1) &&
+ do_lnetctl discover $prim_nid &&
error "Expected discovery to fail"
+ # See comment for check_ping_count()
sleep 5
- check_nid_in_recovq "-l" 1
+ check_nid_in_recovq "-l" "1"
check_ping_count "ni" "2"
sleep 5
- check_nid_in_recovq "-l" 1
+ check_nid_in_recovq "-l" "1"
check_ping_count "ni" "3"
$LCTL net_drop_del -a
+ reinit_dlc || return $?
+ add_net "tcp" "${INTERFACES[0]}" || return $?
+ add_net "tcp1" "${INTERFACES[0]}" || return $?
+
+ local prim_nid=$($LCTL list_nids | head -n 1)
+
+ do_lnetctl discover $prim_nid ||
+ error "failed to discover myself"
+
+ do_lnetctl set recovery_limit $default ||
+ error "failed to set recovery_limit"
+
+ default=$($LNETCTL global show |
+ awk '/max_recovery_ping_interval/{print $NF}')
+ do_lnetctl set max_recovery_ping_interval 2 ||
+ error "failed to set max_recovery_ping_interval"
+
+ $LCTL set_param debug=+net
+ # Use local_error so LNet doesn't attempt to resend the discovery ping
+ $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e local_error
+ $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e local_error
+ do_lnetctl discover $prim_nid &&
+ error "Expected discovery to fail"
+
+ # See comment for check_ping_count()
+ sleep 4
+ check_nid_in_recovq "-l" "1"
+ check_ping_count "ni" "2"
+
+ sleep 6
+ check_nid_in_recovq "-l" "1"
+ check_ping_count "ni" "5"
+
+ $LCTL net_drop_del -a
+
+ do_lnetctl set max_recovery_ping_interval $default ||
+ error "failed to set max_recovery_ping_interval"
+
return 0
}
run_test 210 "Local NI recovery checks"
do_lnetctl discover $prim_nid ||
error "failed to discover myself"
+ local default=$($LNETCTL global show |
+ awk '/recovery_limit/{print $NF}')
# Set recovery limit to 10 seconds.
do_lnetctl set recovery_limit 10 ||
error "failed to set recovery_limit"
check_nid_in_recovq "-p" 0
check_ping_count "peer_ni" "0"
+ reinit_dlc || return $?
+ add_net "tcp" "${INTERFACES[0]}" || return $?
+ add_net "tcp1" "${INTERFACES[0]}" || return $?
+
+ local prim_nid=$($LCTL list_nids | head -n 1)
+
+ do_lnetctl discover $prim_nid ||
+ error "failed to discover myself"
+
+ do_lnetctl set recovery_limit $default ||
+ error "failed to set recovery_limit"
+
+ default=$($LNETCTL global show |
+ awk '/max_recovery_ping_interval/{print $NF}')
+ do_lnetctl set max_recovery_ping_interval 2 ||
+ error "failed to set max_recovery_ping_interval"
+
+ $LCTL net_drop_add -s *@tcp -d *@tcp -m GET -r 1 -e remote_error
+ $LCTL net_drop_add -s *@tcp1 -d *@tcp1 -m GET -r 1 -e remote_error
+
+ # Set health to 0 on one interface. This forces it onto the recovery
+ # queue.
+ $LNETCTL peer set --nid $prim_nid --health 0
+
+ # See comment for check_ping_count()
+ sleep 4
+ check_nid_in_recovq "-p" "1"
+ check_ping_count "peer_ni" "2"
+
+ sleep 6
+ check_nid_in_recovq "-p" "1"
+ check_ping_count "peer_ni" "5"
+
+ $LCTL net_drop_del -a
+
+ do_lnetctl set max_recovery_ping_interval $default ||
+ error "failed to set max_recovery_ping_interval"
+
return 0
}
run_test 211 "Remote NI recovery checks"