return rc;
}
-/*
- * This function can be called from two paths:
- * 1. when sending a message
- * 2. when decommiting a message (lnet_msg_decommit_tx())
- * In both these cases the peer_ni should have it's reference count
- * acquired by the caller and therefore it is safe to drop the spin
- * lock before calling lnd_query()
- */
-static void
-lnet_ni_query_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp)
-{
- time64_t last_alive = 0;
- int cpt = lnet_cpt_of_nid_locked(lp->lpni_nid, ni);
-
- LASSERT(lnet_peer_aliveness_enabled(lp));
- LASSERT(ni->ni_net->net_lnd->lnd_query != NULL);
-
- lnet_net_unlock(cpt);
- (ni->ni_net->net_lnd->lnd_query)(ni, lp->lpni_nid, &last_alive);
- lnet_net_lock(cpt);
-
- lp->lpni_last_query = ktime_get_seconds();
-
- if (last_alive != 0) /* NI has updated timestamp */
- lp->lpni_last_alive = last_alive;
-}
-
-/* NB: always called with lnet_net_lock held */
-static inline int
-lnet_peer_is_alive(struct lnet_peer_ni *lp, time64_t now)
-{
- int alive;
- time64_t deadline;
-
- LASSERT (lnet_peer_aliveness_enabled(lp));
-
- /*
- * Trust lnet_notify() if it has more recent aliveness news, but
- * ignore the initial assumed death (see lnet_peers_start_down()).
- */
- spin_lock(&lp->lpni_lock);
- if (!lp->lpni_alive && lp->lpni_alive_count > 0 &&
- lp->lpni_timestamp >= lp->lpni_last_alive) {
- spin_unlock(&lp->lpni_lock);
- return 0;
- }
-
- deadline = lp->lpni_last_alive +
- lp->lpni_net->net_tunables.lct_peer_timeout;
- alive = deadline > now;
-
- /*
- * Update obsolete lp_alive except for routers assumed to be dead
- * initially, because router checker would update aliveness in this
- * case, and moreover lpni_last_alive at peer creation is assumed.
- */
- if (alive && !lp->lpni_alive &&
- !(lnet_isrouter(lp) && lp->lpni_alive_count == 0)) {
- spin_unlock(&lp->lpni_lock);
- lnet_notify_locked(lp, 0, 1, lp->lpni_last_alive);
- } else {
- spin_unlock(&lp->lpni_lock);
- }
-
- return alive;
-}
-
-
/* NB: returns 1 when alive, 0 when dead, negative when error;
* may drop the lnet_net_lock */
static int
-lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lp,
+lnet_peer_alive_locked(struct lnet_ni *ni, struct lnet_peer_ni *lpni,
struct lnet_msg *msg)
{
- time64_t now = ktime_get_seconds();
-
- if (!lnet_peer_aliveness_enabled(lp))
+ if (!lnet_peer_aliveness_enabled(lpni))
return -ENODEV;
- if (lnet_peer_is_alive(lp, now))
- return 1;
-
/*
* If we're resending a message, let's attempt to send it even if
* the peer is down to fulfill our resend quota on the message
if (msg->msg_retry_count > 0)
return 1;
- /*
- * Peer appears dead, but we should avoid frequent NI queries (at
- * most once per lnet_queryinterval seconds).
- */
- if (lp->lpni_last_query != 0) {
- static const int lnet_queryinterval = 1;
- time64_t next_query;
-
- next_query = lp->lpni_last_query + lnet_queryinterval;
-
- if (now < next_query) {
- if (lp->lpni_alive)
- CWARN("Unexpected aliveness of peer %s: "
- "%lld < %lld (%d/%d)\n",
- libcfs_nid2str(lp->lpni_nid),
- now, next_query,
- lnet_queryinterval,
- lp->lpni_net->net_tunables.lct_peer_timeout);
- return 0;
- }
- }
-
- /* query NI for latest aliveness news */
- lnet_ni_query_locked(ni, lp);
+ /* try and send recovery messages irregardless */
+ if (msg->msg_recovery)
+ return 1;
- if (lnet_peer_is_alive(lp, now))
+ /* always send any responses */
+ if (msg->msg_type == LNET_MSG_ACK ||
+ msg->msg_type == LNET_MSG_REPLY)
return 1;
- lnet_notify_locked(lp, 0, 0, lp->lpni_last_alive);
- return 0;
+ return lnet_is_peer_ni_alive(lpni);
}
/**
/* Multi-Rail: Primary NID of source. */
msg->msg_initiator = lnet_peer_primary_nid_locked(src_nid);
- if (lnet_isrouter(msg->msg_rxpeer)) {
- lnet_peer_set_alive(msg->msg_rxpeer);
- if (avoid_asym_router_failure &&
- LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) {
- /* received a remote message from router, update
- * remote NI status on this router.
- * NB: multi-hop routed message will be ignored.
- */
- lnet_router_ni_update_locked(msg->msg_rxpeer,
- LNET_NIDNET(src_nid));
- }
- }
+ /*
+ * mark the status of this lpni as UP since we received a message
+ * from it. The ping response reports back the ns_status which is
+ * marked on the remote as up or down and we cache it here.
+ */
+ msg->msg_rxpeer->lpni_ns_status = LNET_NI_STATUS_UP;
lnet_msg_commit(msg, cpt);
}
if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp))
- aliveness = lp->lpni_alive ? "up" : "down";
+ aliveness = (lnet_is_peer_ni_alive(lp)) ? "up" : "down";
CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n",
libcfs_nid2str(lp->lpni_nid), atomic_read(&lp->lpni_refcount),
if (lnet_isrouter(lp) ||
lnet_peer_aliveness_enabled(lp))
snprintf(aliveness, LNET_MAX_STR_LEN,
- lp->lpni_alive ? "up" : "down");
+ lnet_is_peer_ni_alive(lp) ? "up" : "down");
*nid = lp->lpni_nid;
*refcount = atomic_read(&lp->lpni_refcount);
if (lnet_isrouter(lpni) ||
lnet_peer_aliveness_enabled(lpni))
snprintf(lpni_info->cr_aliveness, LNET_MAX_STR_LEN,
- lpni->lpni_alive ? "up" : "down");
+ lnet_is_peer_ni_alive(lpni) ? "up" : "down");
lpni_info->cr_refcount = atomic_read(&lpni->lpni_refcount);
lpni_info->cr_ni_peer_tx_credits = (lpni->lpni_net != NULL) ?
lp->lpni_timestamp = when; /* update timestamp */
- if (lp->lpni_alive_count != 0 && /* got old news */
- (!lp->lpni_alive) == (!alive)) { /* new date for old news */
+ /* got old news */
+ if (lp->lpni_alive_count != 0 &&
+ /* new date for old news */
+ (!lnet_is_peer_ni_alive(lp)) == (!alive)) {
spin_unlock(&lp->lpni_lock);
CDEBUG(D_NET, "Old news\n");
return;
/* Flag that notification is outstanding */
lp->lpni_alive_count++;
- lp->lpni_alive = (alive) ? 1 : 0;
lp->lpni_notify = 1;
lp->lpni_notifylnd = notifylnd;
- if (lp->lpni_alive)
+ if (lnet_is_peer_ni_alive(lp))
lp->lpni_ping_feats = LNET_PING_FEAT_INVAL; /* reset */
spin_unlock(&lp->lpni_lock);
* lnet_notify_locked().
*/
while (lp->lpni_notify) {
- alive = lp->lpni_alive;
+ alive = lnet_is_peer_ni_alive(lp);
notifylnd = lp->lpni_notifylnd;
lp->lpni_notifylnd = 0;