return;
lnet_net_lock(0);
- /* the mt could've shutdown and cleaned up the queues */
- if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
- lnet_net_unlock(0);
- return;
- }
-
lnet_dec_healthv_locked(&lpni->lpni_healthv);
/*
* add the peer NI to the recovery queue if it's not already there
* value will not be reduced. In this case, there is no reason to
* invoke recovery
*/
- if (list_empty(&lpni->lpni_recovery) &&
- atomic_read(&lpni->lpni_healthv) < LNET_MAX_HEALTH_VALUE) {
- CERROR("lpni %s added to recovery queue. Health = %d\n",
- libcfs_nid2str(lpni->lpni_nid),
- atomic_read(&lpni->lpni_healthv));
- list_add_tail(&lpni->lpni_recovery, &the_lnet.ln_mt_peerNIRecovq);
- lnet_peer_ni_addref_locked(lpni);
- }
+ lnet_peer_ni_add_to_recoveryq_locked(lpni);
lnet_net_unlock(0);
}
+/*
+ * Account for the health status reported for @msg.
+ *
+ * LOCAL_* statuses bump the matching hlt_local_* counter on msg->msg_txni.
+ * REMOTE_* and NETWORK_TIMEOUT statuses bump the matching counter on
+ * msg->msg_txpeer, but only when that pointer is set.  In every error case
+ * the corresponding field of the_lnet.ln_counters[0] is incremented too.
+ * LNET_MSG_STATUS_OK records nothing; any other value hits LBUG().
+ *
+ * msg->msg_txni is dereferenced without a check here.  The fields of
+ * the_lnet.ln_counters[0] are plain (non-atomic) increments; the caller
+ * visible in this file takes lnet_net_lock(0) around the call.
+ */
+static void
+lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
+{
+	struct lnet_counters *stats = the_lnet.ln_counters[0];
+	struct lnet_peer_ni *txpeer = msg->msg_txpeer;
+	struct lnet_ni *txni = msg->msg_txni;
+
+	switch (hstatus) {
+	case LNET_MSG_STATUS_OK:
+		/* success: nothing to account for */
+		break;
+
+	/* failures charged to the local NI */
+	case LNET_MSG_STATUS_LOCAL_INTERRUPT:
+		atomic_inc(&txni->ni_hstats.hlt_local_interrupt);
+		stats->local_interrupt_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_DROPPED:
+		atomic_inc(&txni->ni_hstats.hlt_local_dropped);
+		stats->local_dropped_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_ABORTED:
+		atomic_inc(&txni->ni_hstats.hlt_local_aborted);
+		stats->local_aborted_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
+		atomic_inc(&txni->ni_hstats.hlt_local_no_route);
+		stats->local_no_route_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_TIMEOUT:
+		atomic_inc(&txni->ni_hstats.hlt_local_timeout);
+		stats->local_timeout_count++;
+		break;
+	case LNET_MSG_STATUS_LOCAL_ERROR:
+		atomic_inc(&txni->ni_hstats.hlt_local_error);
+		stats->local_error_count++;
+		break;
+
+	/*
+	 * failures charged to the peer NI; the global counter is bumped
+	 * even when there is no peer NI to charge
+	 */
+	case LNET_MSG_STATUS_REMOTE_DROPPED:
+		if (txpeer)
+			atomic_inc(&txpeer->lpni_hstats.hlt_remote_dropped);
+		stats->remote_dropped_count++;
+		break;
+	case LNET_MSG_STATUS_REMOTE_ERROR:
+		if (txpeer)
+			atomic_inc(&txpeer->lpni_hstats.hlt_remote_error);
+		stats->remote_error_count++;
+		break;
+	case LNET_MSG_STATUS_REMOTE_TIMEOUT:
+		if (txpeer)
+			atomic_inc(&txpeer->lpni_hstats.hlt_remote_timeout);
+		stats->remote_timeout_count++;
+		break;
+	case LNET_MSG_STATUS_NETWORK_TIMEOUT:
+		if (txpeer)
+			atomic_inc(&txpeer->lpni_hstats.hlt_network_timeout);
+		stats->network_timeout_count++;
+		break;
+
+	default:
+		/* unknown health status */
+		LBUG();
+	}
+}
+
/*
* Do a health check on the message:
- * return -1 if we're not going to handle the error
+ * return -1 if we're not going to handle the error or
+ * if we've reached the maximum number of retries.
* success case will return -1 as well
* return 0 if the message is requeued for send
*/
enum lnet_msg_hstatus hstatus = msg->msg_health_status;
bool lo = false;
- /* TODO: lnet_incr_hstats(hstatus); */
+ /* if we're shutting down no point in handling health. */
+ if (the_lnet.ln_state != LNET_STATE_RUNNING)
+ return -1;
LASSERT(msg->msg_txni);
ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
return -1;
- /* if we're shutting down no point in handling health. */
- if (the_lnet.ln_state != LNET_STATE_RUNNING)
- return -1;
+ /*
+ * stats are only incremented for errors so avoid wasting time
+ * incrementing statistics if there is no error.
+ */
+ if (hstatus != LNET_MSG_STATUS_OK) {
+ lnet_net_lock(0);
+ lnet_incr_hstats(msg, hstatus);
+ lnet_net_unlock(0);
+ }
CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
libcfs_nid2str(msg->msg_txni->ni_nid),
if (msg->msg_no_resend)
return -1;
+ /* check if the message has exceeded the number of retries */
+ if (msg->msg_retry_count >= lnet_retry_count)
+ return -1;
+ msg->msg_retry_count++;
+
lnet_net_lock(msg->msg_tx_cpt);
/*
}
}
+/*
+ * Decide whether a health error should be simulated for @msg.
+ *
+ * Returns false when @msg is NULL, when the_lnet.ln_drop_rules is empty,
+ * or when lnet_drop_rule_match() does not match the message header.
+ * Returns true on a match, after logging *@hstatus at D_NET.
+ *
+ * NOTE(review): *@hstatus is presumably filled in by
+ * lnet_drop_rule_match() on a match - confirm against that function.
+ * @hstatus itself is dereferenced unchecked on the match path.
+ */
+bool
+lnet_send_error_simulation(struct lnet_msg *msg,
+			   enum lnet_msg_hstatus *hstatus)
+{
+	/* no message, or no drop rules configured: nothing to simulate */
+	if (!msg || list_empty(&the_lnet.ln_drop_rules))
+		return false;
+
+	/* match only health rules */
+	if (lnet_drop_rule_match(&msg->msg_hdr, hstatus)) {
+		CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n",
+		       libcfs_nid2str(msg->msg_hdr.src_nid),
+		       libcfs_nid2str(msg->msg_hdr.dest_nid),
+		       lnet_msgtyp2str(msg->msg_type),
+		       lnet_health_error2str(*hstatus));
+		return true;
+	}
+
+	return false;
+}
+EXPORT_SYMBOL(lnet_send_error_simulation);
+
void
lnet_finalize(struct lnet_msg *msg, int status)
{
msg->msg_ev.status = status;
+ /*
+ * if this is an ACK or a REPLY then make sure to remove the
+ * response tracker.
+ */
+ if (msg->msg_ev.type == LNET_EVENT_REPLY ||
+ msg->msg_ev.type == LNET_EVENT_ACK) {
+ cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
+ lnet_detach_rsp_tracker(msg->msg_md, cpt);
+ }
+
/* if the message is successfully sent, no need to keep the MD around */
if (msg->msg_md != NULL && !status)
lnet_detach_md(msg, status);