return;
lnet_net_lock(0);
- /* the mt could've shutdown and cleaned up the queues */
- if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
- lnet_net_unlock(0);
- return;
- }
-
lnet_dec_healthv_locked(&lpni->lpni_healthv);
/*
* add the peer NI to the recovery queue if it's not already there
* value will not be reduced. In this case, there is no reason to
* invoke recovery
*/
- if (list_empty(&lpni->lpni_recovery) &&
- atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) {
- CERROR("lpni %s added to recovery queue. Health = %d\n",
- libcfs_nid2str(lpni->lpni_nid),
- atomic_read(&lpni->lpni_healthv));
- list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
- lnet_peer_ni_addref_locked(lpni);
- }
+ lnet_peer_ni_add_to_recoveryq_locked(lpni);
lnet_net_unlock(0);
}
+/*
+ * Bump the health statistic corresponding to @hstatus for @msg.
+ *
+ * Local failure statuses (LOCAL_*) increment a per-NI counter on the
+ * transmitting NI (msg->msg_txni); remote/network statuses increment a
+ * per-peer-NI counter on msg->msg_txpeer when one is attached.  Every
+ * failure status also bumps the matching aggregate counter in
+ * the_lnet.ln_counters[0].
+ *
+ * NOTE(review): the aggregate counters are plain (non-atomic) ++ on
+ * ln_counters[0], so this presumably must be called under
+ * lnet_net_lock(0) — the visible caller in the health-check path takes
+ * that lock; confirm any other callers do as well.
+ */
+static void
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{
+ struct lnet_ni *ni = msg->msg_txni;
+ struct lnet_peer_ni *lpni = msg->msg_txpeer;
+ struct lnet_counters *counters = the_lnet.ln_counters[0];
+
+ switch (hstatus) {
+ case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+ atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
+ counters->local_interrupt_count++;
+ break;
+ case LNET_MSG_STATUS_LOCAL_DROPPED:
+ atomic_inc(&ni->ni_hstats.hlt_local_dropped);
+ counters->local_dropped_count++;
+ break;
+ case LNET_MSG_STATUS_LOCAL_ABORTED:
+ atomic_inc(&ni->ni_hstats.hlt_local_aborted);
+ counters->local_aborted_count++;
+ break;
+ case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+ atomic_inc(&ni->ni_hstats.hlt_local_no_route);
+ counters->local_no_route_count++;
+ break;
+ case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+ atomic_inc(&ni->ni_hstats.hlt_local_timeout);
+ counters->local_timeout_count++;
+ break;
+ case LNET_MSG_STATUS_LOCAL_ERROR:
+ atomic_inc(&ni->ni_hstats.hlt_local_error);
+ counters->local_error_count++;
+ break;
+ /* remote statuses: the peer NI may be absent, so guard lpni */
+ case LNET_MSG_STATUS_REMOTE_DROPPED:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
+ counters->remote_dropped_count++;
+ break;
+ case LNET_MSG_STATUS_REMOTE_ERROR:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
+ counters->remote_error_count++;
+ break;
+ case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
+ counters->remote_timeout_count++;
+ break;
+ case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+ if (lpni)
+ atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
+ counters->network_timeout_count++;
+ break;
+ case LNET_MSG_STATUS_OK:
+ /* success: nothing to count */
+ break;
+ default:
+ /* an unknown health status is a logic error in the caller */
+ LBUG();
+ }
+}
+
/*
* Do a health check on the message:
* return -1 if we're not going to handle the error or
enum lnet_msg_hstatus hstatus = msg->msg_health_status;
bool lo = false;
- /* TODO: lnet_incr_hstats(hstatus); */
+ /* if we're shutting down no point in handling health. */
+ if (the_lnet.ln_state != LNET_STATE_RUNNING)
+ return -1;
LASSERT(msg->msg_txni);
ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
return -1;
- /* if we're shutting down no point in handling health. */
- if (the_lnet.ln_state != LNET_STATE_RUNNING)
- return -1;
+ /*
+ * stats are only incremented for errors so avoid wasting time
+ * incrementing statistics if there is no error.
+ */
+ if (hstatus != LNET_MSG_STATUS_OK) {
+ lnet_net_lock(0);
+ lnet_incr_hstats(msg, hstatus);
+ lnet_net_unlock(0);
+ }
CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
libcfs_nid2str(msg->msg_txni->ni_nid),
}
}
+/*
+ * Fault-injection hook for the send path: decide whether to simulate a
+ * health error for @msg.
+ *
+ * Returns false when @msg is NULL, when no drop rules are configured
+ * (the_lnet.ln_drop_rules is empty), or when no health drop rule
+ * matches the message header.  On a match it logs the simulated error
+ * and returns true; *hstatus is presumably filled in by
+ * lnet_drop_rule_match() on a match (it is logged below) — confirm
+ * against that function's contract.
+ */
+bool
+lnet_send_error_simulation(struct lnet_msg *msg,
+ enum lnet_msg_hstatus *hstatus)
+{
+ if (!msg)
+ return false;
+
+ /* fast path: no fault-injection rules configured at all */
+ if (list_empty(&the_lnet.ln_drop_rules))
+ return false;
+
+ /* match only health rules */
+ if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus))
+ return false;
+
+ CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n",
+ libcfs_nid2str(msg->msg_hdr.src_nid),
+ libcfs_nid2str(msg->msg_hdr.dest_nid),
+ lnet_msgtyp2str(msg->msg_type),
+ lnet_health_error2str(*hstatus));
+
+ return true;
+}
+EXPORT_SYMBOL(lnet_send_error_simulation);
+
void
lnet_finalize(struct lnet_msg *msg, int status)
{