static void
lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
- int status, bool unlink_event)
+ int status, bool send, bool unlink_event)
{
lnet_nid_t nid = ev_info->mt_nid;
return;
}
lnet_ni_lock(ni);
- ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
+ if (!send || (send && status != 0))
+ ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
if (status)
ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
lnet_ni_unlock(ni);
return;
}
spin_lock(&lpni->lpni_lock);
- lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+ if (!send || (send && status != 0))
+ lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
if (status)
lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
spin_unlock(&lpni->lpni_lock);
libcfs_nid2str(ev_info->mt_nid));
/* fallthrough */
case LNET_EVENT_REPLY:
- lnet_handle_recovery_reply(ev_info, event->status,
+ lnet_handle_recovery_reply(ev_info, event->status, false,
event->type == LNET_EVENT_UNLINK);
break;
case LNET_EVENT_SEND:
libcfs_nid2str(ev_info->mt_nid),
(event->status) ? "unsuccessfully" :
"successfully", event->status);
+ lnet_handle_recovery_reply(ev_info, event->status, true, false);
break;
default:
CERROR("Unexpected event: %d\n", event->type);
__u32 sensitivity = lnet_health_sensitivity;
__u32 lp_sensitivity;
- /* lpni could be NULL if we're in the LOLND case */
+ /*
+ * NO-OP if:
+ * 1. lpni is NULL, which can happen in the LOLND case
+ * 2. this is a recovery message
+ */
if (!lpni)
return;
case LNET_MSG_STATUS_LOCAL_ABORTED:
case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
case LNET_MSG_STATUS_LOCAL_TIMEOUT:
- lnet_handle_local_failure(ni);
+ /*
+ * don't further decrement the health value if the
+ * recovery message failed.
+ */
+ if (!msg->msg_recovery)
+ lnet_handle_local_failure(ni);
if (msg->msg_tx_committed)
/* add to the re-send queue */
return lnet_attempt_msg_resend(msg);
* finalize the message
*/
case LNET_MSG_STATUS_LOCAL_ERROR:
- lnet_handle_local_failure(ni);
+ /*
+ * don't further decrement the health value if the
+ * recovery message failed.
+ */
+ if (!msg->msg_recovery)
+ lnet_handle_local_failure(ni);
return -1;
/*
* attempt a resend safely.
*/
case LNET_MSG_STATUS_REMOTE_DROPPED:
- lnet_handle_remote_failure(lpni);
+ if (!msg->msg_recovery)
+ lnet_handle_remote_failure(lpni);
if (msg->msg_tx_committed)
return lnet_attempt_msg_resend(msg);
break;
case LNET_MSG_STATUS_REMOTE_ERROR:
case LNET_MSG_STATUS_REMOTE_TIMEOUT:
case LNET_MSG_STATUS_NETWORK_TIMEOUT:
- lnet_handle_remote_failure(lpni);
+ if (!msg->msg_recovery)
+ lnet_handle_remote_failure(lpni);
return -1;
default:
LBUG();