*/
unsigned int lnet_health_sensitivity = 0;
static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
static struct kernel_param_ops param_ops_health_sensitivity = {
.set = sensitivity_set,
.get = param_get_int,
};
#define param_check_health_sensitivity(name, p) \
__param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
module_param(lnet_health_sensitivity, health_sensitivity, S_IRUGO|S_IWUSR);
#else
module_param_call(lnet_health_sensitivity, sensitivity_set, param_get_int,
MODULE_PARM_DESC(lnet_health_sensitivity,
"Value to decrement the health value by on error");
+/*
+ * lnet_recovery_interval determines how often we should perform recovery
+ * on unhealthy interfaces.
+ */
+unsigned int lnet_recovery_interval = 1;
+static int recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
+/*
+ * Kernels with kernel_param_ops: route writes to the sysfs parameter
+ * through recovery_interval_set() so the value is validated; reads use
+ * the stock param_get_int.
+ */
+static struct kernel_param_ops param_ops_recovery_interval = {
+	.set = recovery_interval_set,
+	.get = param_get_int,
+};
+#define param_check_recovery_interval(name, p) \
+		__param_check(name, p, int)
+module_param(lnet_recovery_interval, recovery_interval, S_IRUGO|S_IWUSR);
+#else
+/* Older kernels without kernel_param_ops: same setter via module_param_call(). */
+module_param_call(lnet_recovery_interval, recovery_interval_set, param_get_int,
+		  &lnet_recovery_interval, S_IRUGO|S_IWUSR);
+#endif
+MODULE_PARM_DESC(lnet_recovery_interval,
+		"Interval to recover unhealthy interfaces in seconds");
+
static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
static int intf_max_set(const char *val, cfs_kernel_param_arg_t *kp);
MODULE_PARM_DESC(lnet_peer_discovery_disabled,
"Set to 1 to disable peer discovery on this node.");
-unsigned lnet_transaction_timeout = 5;
+unsigned lnet_transaction_timeout = 50;
static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
static struct kernel_param_ops param_ops_transaction_timeout = {
.set = transaction_to_set,
.get = param_get_int,
#define param_check_transaction_timeout(name, p) \
__param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
module_param(lnet_transaction_timeout, transaction_timeout, S_IRUGO|S_IWUSR);
#else
module_param_call(lnet_transaction_timeout, transaction_to_set, param_get_int,
&lnet_transaction_timeout, S_IRUGO|S_IWUSR);
#endif
-MODULE_PARM_DESC(lnet_peer_discovery_disabled,
- "Set to 1 to disable peer discovery on this node.");
+MODULE_PARM_DESC(lnet_transaction_timeout,
+ "Maximum number of seconds to wait for a peer response.");
unsigned lnet_retry_count = 0;
static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
+#ifdef HAVE_KERNEL_PARAM_OPS
static struct kernel_param_ops param_ops_retry_count = {
.set = retry_count_set,
.get = param_get_int,
#define param_check_retry_count(name, p) \
__param_check(name, p, int)
-#ifdef HAVE_KERNEL_PARAM_OPS
module_param(lnet_retry_count, retry_count, S_IRUGO|S_IWUSR);
#else
module_param_call(lnet_retry_count, retry_count_set, param_get_int,
return 0;
}
- if (value == *sensitivity) {
+ if (value > LNET_MAX_HEALTH_VALUE) {
mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
+ CERROR("Invalid health value. Maximum: %d value = %lu\n",
+ LNET_MAX_HEALTH_VALUE, value);
+ return -EINVAL;
}
*sensitivity = value;
}
+/*
+ * recovery_interval_set - module-parameter setter for lnet_recovery_interval.
+ * @val: string value written to the parameter.
+ * @kp:  kernel param descriptor; kp->arg points at lnet_recovery_interval.
+ *
+ * Parses @val as an unsigned long and rejects 0 (the interval must be at
+ * least one second).  The store happens under ln_api_mutex; if LNet is not
+ * in LNET_STATE_RUNNING the write is accepted but silently dropped.
+ *
+ * Returns 0 on success (or when LNet is not running), negative errno on a
+ * parse failure or an invalid value.
+ */
static int
+recovery_interval_set(const char *val, cfs_kernel_param_arg_t *kp)
+{
+	int rc;
+	unsigned *interval = (unsigned *)kp->arg;
+	unsigned long value;
+
+	rc = kstrtoul(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'lnet_recovery_interval'\n");
+		return rc;
+	}
+
+	if (value < 1) {
+		CERROR("lnet_recovery_interval must be at least 1 second\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * The purpose of locking the api_mutex here is to ensure that
+	 * the correct value ends up stored properly.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+
+	/* LNet not running: nothing to update; report success anyway. */
+	if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+		mutex_unlock(&the_lnet.ln_api_mutex);
+		return 0;
+	}
+
+	*interval = value;
+
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
+
+static int
discovery_set(const char *val, cfs_kernel_param_arg_t *kp)
{
int rc;
EXPORT_SYMBOL(lnet_unregister_lnd);
+/*
+ * lnet_counters_get_common - aggregate the common LNet statistics.
+ * @common: caller-provided struct; zeroed here, then filled with totals.
+ *
+ * Sums the lct_common counters of every per-CPU partition in
+ * the_lnet.ln_counters while holding lnet_net_lock(LNET_LOCK_EX).
+ */
void
+lnet_counters_get_common(struct lnet_counters_common *common)
+{
+	struct lnet_counters *ctr;
+	int i;
+
+	memset(common, 0, sizeof(*common));
+
+	lnet_net_lock(LNET_LOCK_EX);
+
+	cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
+		common->lcc_msgs_max     += ctr->lct_common.lcc_msgs_max;
+		common->lcc_msgs_alloc   += ctr->lct_common.lcc_msgs_alloc;
+		common->lcc_errors       += ctr->lct_common.lcc_errors;
+		common->lcc_send_count   += ctr->lct_common.lcc_send_count;
+		common->lcc_recv_count   += ctr->lct_common.lcc_recv_count;
+		common->lcc_route_count  += ctr->lct_common.lcc_route_count;
+		common->lcc_drop_count   += ctr->lct_common.lcc_drop_count;
+		common->lcc_send_length  += ctr->lct_common.lcc_send_length;
+		common->lcc_recv_length  += ctr->lct_common.lcc_recv_length;
+		common->lcc_route_length += ctr->lct_common.lcc_route_length;
+		common->lcc_drop_length  += ctr->lct_common.lcc_drop_length;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+}
+EXPORT_SYMBOL(lnet_counters_get_common);
+
+
+/*
+ * lnet_counters_get - aggregate all LNet statistics for this node.
+ * @counters: caller-provided struct; zeroed here, then filled with totals.
+ *
+ * The common counters are accumulated by lnet_counters_get_common(); the
+ * health counters are summed here across every per-CPU partition of
+ * the_lnet.ln_counters under lnet_net_lock(LNET_LOCK_EX).
+ */
+void
lnet_counters_get(struct lnet_counters *counters)
{
	struct lnet_counters *ctr;
+	struct lnet_counters_health *health = &counters->lct_health;
	int i;
	memset(counters, 0, sizeof(*counters));
+	lnet_counters_get_common(&counters->lct_common);
+
	lnet_net_lock(LNET_LOCK_EX);
	cfs_percpt_for_each(ctr, i, the_lnet.ln_counters) {
-	counters->msgs_max += ctr->msgs_max;
-	counters->msgs_alloc += ctr->msgs_alloc;
-	counters->rst_alloc += ctr->rst_alloc;
-	counters->errors += ctr->errors;
-	counters->resend_count += ctr->resend_count;
-	counters->response_timeout_count += ctr->response_timeout_count;
-	counters->local_interrupt_count += ctr->local_interrupt_count;
-	counters->local_dropped_count += ctr->local_dropped_count;
-	counters->local_aborted_count += ctr->local_aborted_count;
-	counters->local_no_route_count += ctr->local_no_route_count;
-	counters->local_timeout_count += ctr->local_timeout_count;
-	counters->local_error_count += ctr->local_error_count;
-	counters->remote_dropped_count += ctr->remote_dropped_count;
-	counters->remote_error_count += ctr->remote_error_count;
-	counters->remote_timeout_count += ctr->remote_timeout_count;
-	counters->network_timeout_count += ctr->network_timeout_count;
-	counters->send_count += ctr->send_count;
-	counters->recv_count += ctr->recv_count;
-	counters->route_count += ctr->route_count;
-	counters->drop_count += ctr->drop_count;
-	counters->send_length += ctr->send_length;
-	counters->recv_length += ctr->recv_length;
-	counters->route_length += ctr->route_length;
-	counters->drop_length += ctr->drop_length;
-
+		/* Fold this CPU partition's health counters into the totals. */
+		health->lch_rst_alloc += ctr->lct_health.lch_rst_alloc;
+		health->lch_resend_count += ctr->lct_health.lch_resend_count;
+		health->lch_response_timeout_count +=
+				ctr->lct_health.lch_response_timeout_count;
+		health->lch_local_interrupt_count +=
+				ctr->lct_health.lch_local_interrupt_count;
+		health->lch_local_dropped_count +=
+				ctr->lct_health.lch_local_dropped_count;
+		health->lch_local_aborted_count +=
+				ctr->lct_health.lch_local_aborted_count;
+		health->lch_local_no_route_count +=
+				ctr->lct_health.lch_local_no_route_count;
+		health->lch_local_timeout_count +=
+				ctr->lct_health.lch_local_timeout_count;
+		health->lch_local_error_count +=
+				ctr->lct_health.lch_local_error_count;
+		health->lch_remote_dropped_count +=
+				ctr->lct_health.lch_remote_dropped_count;
+		health->lch_remote_error_count +=
+				ctr->lct_health.lch_remote_error_count;
+		health->lch_remote_timeout_count +=
+				ctr->lct_health.lch_remote_timeout_count;
+		health->lch_network_timeout_count +=
+				ctr->lct_health.lch_network_timeout_count;
	}
	lnet_net_unlock(LNET_LOCK_EX);
}
list_del_init(&ni->ni_netlist);
/* the ni should be in deleting state. If it's not it's
* a bug */
- LASSERT(ni->ni_state & LNET_NI_STATE_DELETING);
+ LASSERT(ni->ni_state == LNET_NI_STATE_DELETING);
cfs_percpt_for_each(ref, j, ni->ni_refs) {
if (*ref == 0)
continue;
lnet_net_lock(LNET_LOCK_EX);
lnet_ni_lock(ni);
- ni->ni_state |= LNET_NI_STATE_DELETING;
- ni->ni_state &= ~LNET_NI_STATE_ACTIVE;
+ ni->ni_state = LNET_NI_STATE_DELETING;
lnet_ni_unlock(ni);
lnet_ni_unlink_locked(ni);
lnet_incr_dlc_seq();
}
lnet_ni_lock(ni);
- ni->ni_state |= LNET_NI_STATE_ACTIVE;
- ni->ni_state &= ~LNET_NI_STATE_INIT;
+ ni->ni_state = LNET_NI_STATE_ACTIVE;
lnet_ni_unlock(ni);
/* We keep a reference on the loopback net through the loopback NI */