/* NB: returns 1 when alive, 0 when dead, negative when error;
* may drop the lnet_net_lock */
static int
-lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
+lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp,
+ struct lnet_msg *msg)
{
time64_t now = ktime_get_seconds();
return 1;
/*
+ * If this message is being resent, attempt to send it even if the
+ * peer is down, so that the resend quota on the message is
+ * fulfilled.
+ if (msg->msg_retry_count > 0)
+ return 1;
+
+ /*
* Peer appears dead, but we should avoid frequent NI queries (at
* most once per lnet_queryinterval seconds).
*/
/* NB 'lp' is always the next hop */
if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 &&
- lnet_peer_alive_locked(ni, lp) == 0) {
+ lnet_peer_alive_locked(ni, lp, msg) == 0) {
the_lnet.ln_counters[cpt]->drop_count++;
the_lnet.ln_counters[cpt]->drop_length += msg->msg_len;
lnet_net_unlock(cpt);
}
rcd = rtr->lpni_rcd;
- if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis)
+
+ /*
+ * The response to the router checker ping could have timed out and
+ * the MD handle (rcd_mdh) might have been invalidated, so we need
+ * to update it again.
+ */
+ if (!rcd || rcd->rcd_nnis > rcd->rcd_pingbuffer->pb_nnis ||
+ LNetMDHandleIsInvalid(rcd->rcd_mdh))
rcd = lnet_update_rc_data_locked(rtr);
if (rcd == NULL)
return;