From 339c7b2b784a528f41c432e9b90285d3445b7536 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Thu, 11 Dec 2014 15:53:31 -0500 Subject: [PATCH] LU-5570 lnet: check router aliveness timestamp In current LNet, the peer aliveness timestamp is only recorded for peers on routers. This patch changes this and also records the aliveness of routers on regular nodes, so lnet can check the aliveness timestamp of routers before sending a message and avoid choosing a router which has an old aliveness timestamp and may be dead or congested. Another change in this patch: when lnet receives a remote message from a router, it updates the remote NI status of that router, so lnet can get up-to-date aliveness information without waiting for the next RC ping. Signed-off-by: Liang Zhen Change-Id: I305ba684ced231a87656458382966aeb8ff7ba48 Reviewed-on: http://review.whamcloud.com/11748 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Doug Oucharek Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 25 ++++++++++ lnet/include/lnet/lib-types.h | 5 -- lnet/lnet/lib-move.c | 105 ++++++++++++++++++++++++++---------------- lnet/lnet/lib-msg.c | 10 ++++ lnet/lnet/peer.c | 5 +- lnet/lnet/router.c | 7 ++- lnet/lnet/router_proc.c | 9 ++-- 7 files changed, 109 insertions(+), 57 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 36b1a63..73a4d68 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -651,6 +651,31 @@ lnet_isrouter(lnet_peer_t *lp) return lp->lp_rtr_refcount != 0; } +/* check if it's a router checker ping */ +static inline int +lnet_msg_is_rc_ping(struct lnet_msg *msg) +{ + lnet_hdr_t *hdr = &msg->msg_hdr; + + return msg->msg_type == LNET_MSG_GET && + hdr->msg.get.ptl_index == cpu_to_le32(LNET_RESERVED_PORTAL) && + hdr->msg.get.match_bits == + cpu_to_le64(LNET_PROTO_PING_MATCHBITS); +} + +/* peer aliveness is enabled in a network where lnet_ni_t::ni_peertimeout has + * been set to a positive 
value, it's only valid for router peers or peers on + * routers. + */ +static inline int +lnet_peer_aliveness_enabled(struct lnet_peer *lp) +{ + if (lp->lp_ni->ni_peertimeout <= 0) + return 0; + + return the_lnet.ln_routing || lnet_isrouter(lp); +} + static inline void lnet_ni_addref_locked(lnet_ni_t *ni, int cpt) { diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 5199f84..c25241f 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -544,11 +544,6 @@ struct lnet_peer_table { struct list_head *pt_hash; /* NID->peer hash */ }; -/* peer aliveness is enabled only on routers for peers in a network where the - * lnet_ni_t::ni_peertimeout has been set to a positive value */ -#define lnet_peer_aliveness_enabled(lp) (the_lnet.ln_routing != 0 && \ - (lp)->lp_ni->ni_peertimeout > 0) - typedef struct { struct list_head lr_list; /* chain on net */ struct list_head lr_gwlist; /* chain on gateway */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 8973813..5a5e14d 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -746,8 +746,6 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) int alive; cfs_time_t deadline; - LASSERT (lnet_peer_aliveness_enabled(lp)); - /* Trust lnet_notify() if it has more recent aliveness news, but * ignore the initial assumed death (see lnet_peers_start_down()). */ @@ -776,37 +774,28 @@ lnet_peer_is_alive (lnet_peer_t *lp, cfs_time_t now) int lnet_peer_alive_locked (lnet_peer_t *lp) { - cfs_time_t now = cfs_time_current(); - - if (!lnet_peer_aliveness_enabled(lp)) - return -ENODEV; + cfs_time_t now = cfs_time_current(); + bool query; - if (lnet_peer_is_alive(lp, now)) - return 1; + if (!lnet_peer_aliveness_enabled(lp)) + return -ENODEV; - /* Peer appears dead, but we should avoid frequent NI queries (at - * most once per lnet_queryinterval seconds). 
*/ - if (lp->lp_last_query != 0) { - static const int lnet_queryinterval = 1; - - cfs_time_t next_query = - cfs_time_add(lp->lp_last_query, - cfs_time_seconds(lnet_queryinterval)); - - if (cfs_time_before(now, next_query)) { - if (lp->lp_alive) - CWARN("Unexpected aliveness of peer %s: " - "%d < %d (%d/%d)\n", - libcfs_nid2str(lp->lp_nid), - (int)now, (int)next_query, - lnet_queryinterval, - lp->lp_ni->ni_peertimeout); - return 0; - } + if (lp->lp_last_query == 0) { + query = true; + } else { + /* Peer appears dead, but we should avoid frequent NI queries + * (at most once per ni_query_interval seconds). */ + static const int ni_query_interval = 1; + cfs_time_t next_query; + + next_query = cfs_time_add(lp->lp_last_query, + cfs_time_seconds(ni_query_interval)); + query = cfs_time_aftereq(now, next_query); } /* query NI for latest aliveness news */ - lnet_ni_query_locked(lp->lp_ni, lp); + if (query) + lnet_ni_query_locked(lp->lp_ni, lp); if (lnet_peer_is_alive(lp, now)) return 1; @@ -841,7 +830,8 @@ lnet_post_send_locked(lnet_msg_t *msg, int do_send) /* NB 'lp' is always the next hop */ if ((msg->msg_target.pid & LNET_PID_USERFLAG) == 0 && - lnet_peer_alive_locked(lp) == 0) { + lnet_peer_alive_locked(lp) == 0 && + !lnet_msg_is_rc_ping(msg)) { /* send RC ping even for dead router */ the_lnet.ln_counters[cpt]->drop_count++; the_lnet.ln_counters[cpt]->drop_length += msg->msg_len; lnet_net_unlock(cpt); @@ -1194,6 +1184,30 @@ lnet_compare_routes(lnet_route_t *r1, lnet_route_t *r2) lnet_peer_t *p1 = r1->lr_gateway; lnet_peer_t *p2 = r2->lr_gateway; + if (p1->lp_ni->ni_peertimeout > 0 && + p2->lp_ni->ni_peertimeout > 0) { + /* if a router has queued bytes but no aliveness update for + * the last 10 seconds, it could be potentially dead or + * congested, so we prefer not to choose it even its status + * is still alive. 
+ */ + int router_slow = cfs_time_seconds(10); + bool r1_slow; + bool r2_slow; + cfs_time_t now = cfs_time_current(); + + r1_slow = p1->lp_txqnob != 0 && + cfs_time_aftereq(now, p1->lp_last_alive + router_slow); + r2_slow = p2->lp_txqnob != 0 && + cfs_time_aftereq(now, p2->lp_last_alive + router_slow); + + if (!r1_slow && r2_slow) + return 1; + + if (r1_slow && !r2_slow) + return -1; + } + if (r1->lr_priority < r2->lr_priority) return 1; @@ -1876,17 +1890,18 @@ int lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, void *private, int rdma_req) { - int rc = 0; - int cpt; - int for_me; struct lnet_msg *msg; - lnet_pid_t dest_pid; - lnet_nid_t dest_nid; - lnet_nid_t src_nid; - __u32 payload_length; - __u32 type; - - LASSERT (!in_interrupt ()); + lnet_peer_t *rxpeer; + lnet_pid_t dest_pid; + lnet_nid_t dest_nid; + lnet_nid_t src_nid; + __u32 payload_length; + __u32 type; + int for_me; + int cpt; + int rc = 0; + + LASSERT(!in_interrupt()); type = le32_to_cpu(hdr->type); src_nid = le64_to_cpu(hdr->src_nid); @@ -2069,6 +2084,18 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, } lnet_msg_commit(msg, cpt); + /* LND just notified me for incoming message from rxpeer, so assume + * it is alive */ + rxpeer = msg->msg_rxpeer; + rxpeer->lp_last_alive = rxpeer->lp_last_query = cfs_time_current(); + if (!rxpeer->lp_alive) + lnet_notify_locked(rxpeer, 0, 1, rxpeer->lp_last_alive); + + if (lnet_isrouter(msg->msg_rxpeer) && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + lnet_router_ni_update_locked(msg->msg_rxpeer, + LNET_NIDNET(src_nid)); + } /* message delay simulation */ if (unlikely(!list_empty(&the_lnet.ln_delay_rules) && diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index ca5df09..26bd68b 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -489,7 +489,17 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) * (finalize sending first then finalize receiving) */ cpt = msg->msg_tx_committed ? 
msg->msg_tx_cpt : msg->msg_rx_cpt; + lnet_net_lock(cpt); + if (msg->msg_tx_committed && msg->msg_txpeer != NULL && status == 0) { + lnet_peer_t *lp = msg->msg_txpeer; + + /* LND is finalising an outgoing message for txpeer, so assume + * it is alive */ + lp->lp_last_alive = lp->lp_last_query = cfs_time_current(); + if (!lp->lp_alive) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); + } container = the_lnet.ln_msg_containers[cpt]; list_add_tail(&msg->msg_list, &container->msc_finalizing); diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index 6521fbc..363d270 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -375,7 +375,7 @@ lnet_debug_peer(lnet_nid_t nid) return; } - if (lnet_isrouter(lp) || lnet_peer_aliveness_enabled(lp)) + if (lnet_peer_aliveness_enabled(lp)) aliveness = lp->lp_alive ? "up" : "down"; CDEBUG(D_WARNING, "%-24s %4d %5s %5d %5d %5d %5d %5d %ld\n", @@ -427,8 +427,7 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid, continue; snprintf(aliveness, LNET_MAX_STR_LEN, "NA"); - if (lnet_isrouter(lp) || - lnet_peer_aliveness_enabled(lp)) + if (lnet_peer_aliveness_enabled(lp)) snprintf(aliveness, LNET_MAX_STR_LEN, lp->lp_alive ? 
"up" : "down"); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 58526b5..015a2e4 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -827,6 +827,7 @@ lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net) { lnet_route_t *rte; + /* NB: this can't help for multi-hop routing */ if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) { list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) { if (rte->lr_net == net) { @@ -1137,11 +1138,9 @@ lnet_router_checker_start(void) /* EQ size doesn't matter; the callback is guaranteed to get every * event */ eqsz = 0; - rc = LNetEQAlloc(eqsz, lnet_router_checker_event, - &the_lnet.ln_rc_eqh); + rc = LNetEQAlloc(eqsz, lnet_router_checker_event, &the_lnet.ln_rc_eqh); #else - rc = LNetEQAlloc(eqsz, LNET_EQ_HANDLER_NONE, - &the_lnet.ln_rc_eqh); + rc = LNetEQAlloc(eqsz, LNET_EQ_HANDLER_NONE, &the_lnet.ln_rc_eqh); #endif if (rc != 0) { CERROR("Can't allocate EQ(%d): %d\n", eqsz, rc); diff --git a/lnet/lnet/router_proc.c b/lnet/lnet/router_proc.c index e39e1ee..95d34c9 100644 --- a/lnet/lnet/router_proc.c +++ b/lnet/lnet/router_proc.c @@ -507,14 +507,11 @@ proc_lnet_peers(struct ctl_table *table, int write, void __user *buffer, int minrtrcr = peer->lp_minrtrcredits; int txqnob = peer->lp_txqnob; - if (lnet_isrouter(peer) || - lnet_peer_aliveness_enabled(peer)) - aliveness = peer->lp_alive ? "up" : "down"; - if (lnet_peer_aliveness_enabled(peer)) { - cfs_time_t now = cfs_time_current(); - cfs_duration_t delta; + cfs_time_t now = cfs_time_current(); + cfs_duration_t delta; + aliveness = peer->lp_alive ? "up" : "down"; delta = cfs_time_sub(now, peer->lp_last_alive); lastalive = cfs_duration_sec(delta); -- 1.8.3.1