__u32 send_case = sd->sd_send_case;
int rc;
__u32 routing = send_case & REMOTE_DST;
+ struct lnet_rsp_tracker *rspt;
/*
* Increment sequence number of the selected peer so that we
msg->msg_hdr.dest_nid = cpu_to_le64(msg->msg_txpeer->lpni_nid);
}
+ /*
+ * If the message's MD has a response tracker attached, record the
+ * next-hop NID in it.
+ */
+ if (msg->msg_md) {
+ rspt = msg->msg_md->md_rspt_ptr;
+ if (rspt) {
+ rspt->rspt_next_hop_nid = msg->msg_txpeer->lpni_nid;
+ CDEBUG(D_NET, "rspt_next_hop_nid = %s\n",
+ libcfs_nid2str(rspt->rspt_next_hop_nid));
+ }
+ }
+
rc = lnet_post_send_locked(msg, 0);
if (!rc)
if (ktime_compare(ktime_get(), rspt->rspt_deadline) >= 0 ||
force) {
+ struct lnet_peer_ni *lpni;
+ lnet_nid_t nid;
+
md = lnet_handle2md(&rspt->rspt_mdh);
if (!md) {
LNetInvalidateMDHandle(&rspt->rspt_mdh);
list_del_init(&rspt->rspt_on_list);
- CNETERR("Response timed out: md = %p\n", md);
+ nid = rspt->rspt_next_hop_nid;
+
+ CNETERR("Response timed out: md = %p: nid = %s\n",
+ md, libcfs_nid2str(nid));
LNetMDUnlink(rspt->rspt_mdh);
lnet_rspt_free(rspt, i);
+
+ /*
+ * If the response from the next hop timed out,
+ * look up the peer NI for that NID and
+ * decrement its health value so that we
+ * don't use it.
+ */
+ lnet_net_lock(0);
+ lpni = lnet_find_peer_ni_locked(nid);
+ if (lpni) {
+ lnet_handle_remote_failure_locked(lpni);
+ lnet_peer_ni_decref_locked(lpni);
+ }
+ lnet_net_unlock(0);
} else {
lnet_res_unlock(i);
break;
LNetInvalidateMDHandle(&recovery_mdh);
- if (ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING || force) {
+ if (ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING ||
+ force) {
recovery_mdh = ni->ni_ping_mdh;
LNetInvalidateMDHandle(&ni->ni_ping_mdh);
}
lnet_net_lock(0);
lnet_ni_lock(ni);
- if (!(ni->ni_state & LNET_NI_STATE_ACTIVE) ||
+ if (ni->ni_state != LNET_NI_STATE_ACTIVE ||
healthv == LNET_MAX_HEALTH_VALUE) {
list_del_init(&ni->ni_recovery);
lnet_unlink_ni_recovery_mdh_locked(ni, 0, false);
* But we want to keep the local_ni on the recovery queue
* so we can continue the attempts to recover it.
*/
- if (ni->ni_state & LNET_NI_STATE_RECOVERY_FAILED) {
+ if (ni->ni_recovery_state & LNET_NI_RECOVERY_FAILED) {
lnet_unlink_ni_recovery_mdh_locked(ni, 0, true);
- ni->ni_state &= ~LNET_NI_STATE_RECOVERY_FAILED;
+ ni->ni_recovery_state &= ~LNET_NI_RECOVERY_FAILED;
}
lnet_ni_unlock(ni);
libcfs_nid2str(ni->ni_nid));
lnet_ni_lock(ni);
- if (!(ni->ni_state & LNET_NI_STATE_RECOVERY_PENDING)) {
- ni->ni_state |= LNET_NI_STATE_RECOVERY_PENDING;
+ if (!(ni->ni_recovery_state & LNET_NI_RECOVERY_PENDING)) {
+ ni->ni_recovery_state |= LNET_NI_RECOVERY_PENDING;
lnet_ni_unlock(ni);
LIBCFS_ALLOC(ev_info, sizeof(*ev_info));
CERROR("out of memory. Can't recover %s\n",
libcfs_nid2str(ni->ni_nid));
lnet_ni_lock(ni);
- ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+ ni->ni_recovery_state &=
+ ~LNET_NI_RECOVERY_PENDING;
lnet_ni_unlock(ni);
continue;
}
lnet_ni_lock(ni);
if (rc)
- ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+ ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
}
lnet_ni_unlock(ni);
}
static int
lnet_monitor_thread(void *arg)
{
- int wakeup_counter = 0;
+ time64_t recovery_timeout = 0;
+ time64_t rsp_timeout = 0;
+ int interval;
+ time64_t now;
/*
* The monitor thread takes care of the following:
cfs_block_allsigs();
while (the_lnet.ln_mt_state == LNET_MT_STATE_RUNNING) {
+ now = ktime_get_real_seconds();
+
if (lnet_router_checker_active())
lnet_check_routers();
lnet_resend_pending_msgs();
- wakeup_counter++;
- if (wakeup_counter >= lnet_transaction_timeout / 2) {
+ if (now >= rsp_timeout) {
lnet_finalize_expired_responses(false);
- wakeup_counter = 0;
+ rsp_timeout = now + (lnet_transaction_timeout / 2);
}
- lnet_recover_local_nis();
-
- lnet_recover_peer_nis();
+ if (now >= recovery_timeout) {
+ lnet_recover_local_nis();
+ lnet_recover_peer_nis();
+ recovery_timeout = now + lnet_recovery_interval;
+ }
/*
* TODO do we need to check if we should sleep without
* cases where we get a complaint that an idle thread
* is waking up unnecessarily.
*/
+ interval = min(lnet_recovery_interval,
+ lnet_transaction_timeout / 2);
wait_event_interruptible_timeout(the_lnet.ln_mt_waitq,
false,
- cfs_time_seconds(1));
+ cfs_time_seconds(interval));
}
/* clean up the router checker */
return;
}
lnet_ni_lock(ni);
- ni->ni_state &= ~LNET_NI_STATE_RECOVERY_PENDING;
+ ni->ni_recovery_state &= ~LNET_NI_RECOVERY_PENDING;
if (status)
- ni->ni_state |= LNET_NI_STATE_RECOVERY_FAILED;
+ ni->ni_recovery_state |= LNET_NI_RECOVERY_FAILED;
lnet_ni_unlock(ni);
lnet_net_unlock(0);