Don't decrement the interface's health value when a recovery
message fails. If we've already determined that an interface
is unhealthy, there is no need to continue decrementing
its health every 1 second. Otherwise it'll take too long to
come back into service when it becomes healthy.
Clean up where health is decremented so that the same failure
is not counted more than once. There is no need to decrement in
lnet_notify(): for the LND to call it, an existing transmit
must have failed. This means a message has already failed, which
will result in the health being decremented.
When a recovery send fails, make sure to flag the recovery as
failed, because no reply is expected in this case.
Test-parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ifb3500a77a5a5be51e7079269c8ddba85ed0c2a7
Reviewed-on: https://review.whamcloud.com/36921
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
static void
lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
static void
lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
- int status, bool unlink_event)
+ int status, bool send, bool unlink_event)
{
lnet_nid_t nid = ev_info->mt_nid;
{
lnet_nid_t nid = ev_info->mt_nid;
return;
}
lnet_ni_lock(ni);
return;
}
lnet_ni_lock(ni);
- ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
+ if (!send || (send && status != 0))
+ ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
if (status)
ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
lnet_ni_unlock(ni);
if (status)
ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
lnet_ni_unlock(ni);
return;
}
spin_lock(&lpni->lpni_lock);
return;
}
spin_lock(&lpni->lpni_lock);
- lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
+ if (!send || (send && status != 0))
+ lpni->lpni_state &= ~LNET_PEER_NI_RECOVERY_PENDING;
if (status)
lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
spin_unlock(&lpni->lpni_lock);
if (status)
lpni->lpni_state |= LNET_PEER_NI_RECOVERY_FAILED;
spin_unlock(&lpni->lpni_lock);
libcfs_nid2str(ev_info->mt_nid));
/* fallthrough */
case LNET_EVENT_REPLY:
libcfs_nid2str(ev_info->mt_nid));
/* fallthrough */
case LNET_EVENT_REPLY:
- lnet_handle_recovery_reply(ev_info, event->status,
+ lnet_handle_recovery_reply(ev_info, event->status, false,
event->type == LNET_EVENT_UNLINK);
break;
case LNET_EVENT_SEND:
event->type == LNET_EVENT_UNLINK);
break;
case LNET_EVENT_SEND:
libcfs_nid2str(ev_info->mt_nid),
(event->status) ? "unsuccessfully" :
"successfully", event->status);
libcfs_nid2str(ev_info->mt_nid),
(event->status) ? "unsuccessfully" :
"successfully", event->status);
+ lnet_handle_recovery_reply(ev_info, event->status, true, false);
break;
default:
CERROR("Unexpected event: %d\n", event->type);
break;
default:
CERROR("Unexpected event: %d\n", event->type);
__u32 sensitivity = lnet_health_sensitivity;
__u32 lp_sensitivity;
__u32 sensitivity = lnet_health_sensitivity;
__u32 lp_sensitivity;
- /* lpni could be NULL if we're in the LOLND case */
+ /*
+ * NO-OP if:
+ * 1. lpni could be NULL if we're in the LOLND case
+ * 2. this is a recovery message
+ */
case LNET_MSG_STATUS_LOCAL_ABORTED:
case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
case LNET_MSG_STATUS_LOCAL_TIMEOUT:
case LNET_MSG_STATUS_LOCAL_ABORTED:
case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
case LNET_MSG_STATUS_LOCAL_TIMEOUT:
- lnet_handle_local_failure(ni);
+ /*
+ * don't further decrement the health value if the
+ * recovery message failed.
+ */
+ if (!msg->msg_recovery)
+ lnet_handle_local_failure(ni);
if (msg->msg_tx_committed)
/* add to the re-send queue */
return lnet_attempt_msg_resend(msg);
if (msg->msg_tx_committed)
/* add to the re-send queue */
return lnet_attempt_msg_resend(msg);
* finalize the message
*/
case LNET_MSG_STATUS_LOCAL_ERROR:
* finalize the message
*/
case LNET_MSG_STATUS_LOCAL_ERROR:
- lnet_handle_local_failure(ni);
+ /*
+ * don't further decrement the health value if the
+ * recovery message failed.
+ */
+ if (!msg->msg_recovery)
+ lnet_handle_local_failure(ni);
* attempt a resend safely.
*/
case LNET_MSG_STATUS_REMOTE_DROPPED:
* attempt a resend safely.
*/
case LNET_MSG_STATUS_REMOTE_DROPPED:
- lnet_handle_remote_failure(lpni);
+ if (!msg->msg_recovery)
+ lnet_handle_remote_failure(lpni);
if (msg->msg_tx_committed)
return lnet_attempt_msg_resend(msg);
break;
if (msg->msg_tx_committed)
return lnet_attempt_msg_resend(msg);
break;
case LNET_MSG_STATUS_REMOTE_ERROR:
case LNET_MSG_STATUS_REMOTE_TIMEOUT:
case LNET_MSG_STATUS_NETWORK_TIMEOUT:
case LNET_MSG_STATUS_REMOTE_ERROR:
case LNET_MSG_STATUS_REMOTE_TIMEOUT:
case LNET_MSG_STATUS_NETWORK_TIMEOUT:
- lnet_handle_remote_failure(lpni);
+ if (!msg->msg_recovery)
+ lnet_handle_remote_failure(lpni);
return -1;
default:
LBUG();
return -1;
default:
LBUG();
(sensitivity) ? sensitivity :
lnet_health_sensitivity);
}
(sensitivity) ? sensitivity :
lnet_health_sensitivity);
}
- } else {
- lnet_handle_remote_failure_locked(lpni);
}
/* recalculate aliveness */
}
/* recalculate aliveness */