/*
* lnet_health_sensitivity determines by how much we decrement the health
- * value on sending error. The value defaults to 0, which means health
- * checking is turned off by default.
+ * value on sending error. The value defaults to 100, which means
+ * interface health is decremented by 100 points on every failure.
*/
-unsigned int lnet_health_sensitivity = 0;
+unsigned int lnet_health_sensitivity = 100;
static int sensitivity_set(const char *val, cfs_kernel_param_arg_t *kp);
#ifdef HAVE_KERNEL_PARAM_OPS
static struct kernel_param_ops param_ops_health_sensitivity = {
MODULE_PARM_DESC(lnet_drop_asym_route,
"Set to 1 to drop asymmetrical route messages.");
-unsigned lnet_transaction_timeout = 50;
+#define LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT 50
+#define LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT 10
+
+unsigned lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
static int transaction_to_set(const char *val, cfs_kernel_param_arg_t *kp);
#ifdef HAVE_KERNEL_PARAM_OPS
static struct kernel_param_ops param_ops_transaction_timeout = {
MODULE_PARM_DESC(lnet_transaction_timeout,
"Maximum number of seconds to wait for a peer response.");
-unsigned lnet_retry_count = 0;
+#define LNET_RETRY_COUNT_HEALTH_DEFAULT 3
+unsigned lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
static int retry_count_set(const char *val, cfs_kernel_param_arg_t *kp);
#ifdef HAVE_KERNEL_PARAM_OPS
static struct kernel_param_ops param_ops_retry_count = {
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
if (value > LNET_MAX_HEALTH_VALUE) {
mutex_unlock(&the_lnet.ln_api_mutex);
CERROR("Invalid health value. Maximum: %d value = %lu\n",
return -EINVAL;
}
+ /*
+ * if we're turning on health then use the health timeout
+ * defaults.
+ */
+ if (*sensitivity == 0 && value != 0) {
+ lnet_transaction_timeout = LNET_TRANSACTION_TIMEOUT_HEALTH_DEFAULT;
+ lnet_retry_count = LNET_RETRY_COUNT_HEALTH_DEFAULT;
+ /*
+ * if we're turning off health then use the no health timeout
+ * default.
+ */
+ } else if (*sensitivity != 0 && value == 0) {
+ lnet_transaction_timeout =
+ LNET_TRANSACTION_TIMEOUT_NO_HEALTH_DEFAULT;
+ lnet_retry_count = 0;
+ }
+
*sensitivity = value;
mutex_unlock(&the_lnet.ln_api_mutex);
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
*interval = value;
mutex_unlock(&the_lnet.ln_api_mutex);
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
if (value < lnet_retry_count || value == 0) {
mutex_unlock(&the_lnet.ln_api_mutex);
CERROR("Invalid value for lnet_transaction_timeout (%lu). "
*/
mutex_lock(&the_lnet.ln_api_mutex);
- if (the_lnet.ln_state != LNET_STATE_RUNNING) {
+ if (lnet_health_sensitivity == 0) {
mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
+ CERROR("Can not set retry_count when health feature is turned off\n");
+ return -EINVAL;
}
if (value > lnet_transaction_timeout) {
return -EINVAL;
}
- if (value == *retry_count) {
- mutex_unlock(&the_lnet.ln_api_mutex);
- return 0;
- }
-
*retry_count = value;
if (value == 0)
INIT_LIST_HEAD(&the_lnet.ln_mt_localNIRecovq);
INIT_LIST_HEAD(&the_lnet.ln_mt_peerNIRecovq);
init_waitqueue_head(&the_lnet.ln_dc_waitq);
+ LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
rc = lnet_descriptor_setup();
if (rc != 0)
static int
lnet_unprepare (void)
{
+ int rc;
+
/* NB no LNET_LOCK since this is the last reference. All LND instances
* have shut down already, so it is safe to unlink and free all
* descriptors, even those that appear committed to a network op (eg MD
LASSERT(list_empty(&the_lnet.ln_test_peers));
LASSERT(list_empty(&the_lnet.ln_nets));
+ if (!LNetEQHandleIsInvalid(the_lnet.ln_mt_eqh)) {
+ rc = LNetEQFree(the_lnet.ln_mt_eqh);
+ LNetInvalidateEQHandle(&the_lnet.ln_mt_eqh);
+ LASSERT(rc == 0);
+ }
+
lnet_portals_destroy();
if (the_lnet.ln_md_containers != NULL) {
return count;
}
+/*
+ * Byte-swap a ping buffer in place.
+ *
+ * The fixed header fields of pb_info (magic, features, pid, nnis) are
+ * swapped first, then the nid and status of each pi_ni[] entry. The
+ * number of entries walked is the swapped pi_nnis, clamped to
+ * pbuf->pb_nnis so the loop never goes past that bound.
+ */
+void
+lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf)
+{
+	struct lnet_ni_status *entry;
+	int count;
+	int idx = 0;
+
+	__swab32s(&pbuf->pb_info.pi_magic);
+	__swab32s(&pbuf->pb_info.pi_features);
+	__swab32s(&pbuf->pb_info.pi_pid);
+	/* pi_nnis has to be swapped before it is read as the entry count */
+	__swab32s(&pbuf->pb_info.pi_nnis);
+
+	count = pbuf->pb_info.pi_nnis;
+	if (count > pbuf->pb_nnis)
+		count = pbuf->pb_nnis;
+
+	while (idx < count) {
+		entry = &pbuf->pb_info.pi_ni[idx];
+		__swab64s(&entry->ns_nid);
+		__swab32s(&entry->ns_status);
+		idx++;
+	}
+}
+
int
lnet_ping_info_validate(struct lnet_ping_info *pinfo)
{
}
the_lnet.ln_refcount = 0;
- LNetInvalidateEQHandle(&the_lnet.ln_rc_eqh);
INIT_LIST_HEAD(&the_lnet.ln_lnds);
INIT_LIST_HEAD(&the_lnet.ln_net_zombie);
- INIT_LIST_HEAD(&the_lnet.ln_rcd_zombie);
INIT_LIST_HEAD(&the_lnet.ln_msg_resend);
- INIT_LIST_HEAD(&the_lnet.ln_rcd_deathrow);
/* The hash table size is the number of bits it takes to express the set
* ln_num_routes, minus 1 (better to under estimate than over so we
if (rc != 0)
goto err_shutdown_lndnis;
- rc = lnet_check_routes();
- if (rc != 0)
- goto err_destroy_routes;
-
rc = lnet_rtrpools_alloc(im_a_router);
if (rc != 0)
goto err_destroy_routes;
lnet_ping_target_update(pbuf, ping_mdh);
+ rc = LNetEQAlloc(0, lnet_mt_event_handler, &the_lnet.ln_mt_eqh);
+ if (rc != 0) {
+ CERROR("Can't allocate monitor thread EQ: %d\n", rc);
+ goto err_stop_ping;
+ }
+
rc = lnet_monitor_thr_start();
if (rc != 0)
goto err_stop_ping;
lnet_fault_fini();
- lnet_router_debugfs_init();
+ lnet_router_debugfs_fini();
lnet_peer_discovery_stop();
lnet_push_target_fini();
lnet_monitor_thr_stop();
config->cfg_nid,
config->cfg_config_u.cfg_route.
rtr_priority);
- if (rc == 0) {
- rc = lnet_check_routes();
- if (rc != 0)
- lnet_del_route(config->cfg_net,
- config->cfg_nid);
- }
mutex_unlock(&the_lnet.ln_api_mutex);
return rc;