From fb259fe85813e0f28ac7f7410689e3856ef26316 Mon Sep 17 00:00:00 2001 From: Liang Zhen Date: Tue, 28 Oct 2014 18:04:51 +0800 Subject: [PATCH] LU-5485 lnet: peer aliveness status and NI status A couple of changes to improve aliveness detection: - When LNet receives a message, it can determine that the peer which sent this message is alive - When LNet receives a message from a remote network, it can determine that the router is alive and that the NI status on the router is UP. Signed-off-by: Liang Zhen Change-Id: I7133987c5c8728248cce7bc0a95048b26bc6611a Reviewed-on: http://review.whamcloud.com/12453 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: James Simmons Reviewed-by: Isaac Huang Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 10 ++++++++++ lnet/lnet/lib-move.c | 13 +++++++++++++ lnet/lnet/router.c | 17 ++++++++++++++++- 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index d0b4b51..a48a5d3 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -702,6 +702,7 @@ lnet_net2rnethash(__u32 net) } extern lnd_t the_lolnd; +extern int avoid_asym_router_failure; #ifndef __KERNEL__ /* unconditional registration */ @@ -989,6 +990,7 @@ int lnet_peer_buffer_credits(lnet_ni_t *ni); int lnet_router_checker_start(void); void lnet_router_checker_stop(void); +void lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net); void lnet_swap_pinginfo(lnet_ping_info_t *info); int lnet_parse_ip2nets(char **networksp, char *ip2nets); @@ -1010,6 +1012,14 @@ int lnet_get_peer_info(__u32 peer_index, __u64 *nid, __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, __u32 *peer_tx_qnob); +static inline void +lnet_peer_set_alive(lnet_peer_t *lp) +{ + lp->lp_last_alive = lp->lp_last_query = cfs_time_current(); + if (!lp->lp_alive) + lnet_notify_locked(lp, 0, 1, lp->lp_last_alive); +} + #ifndef __KERNEL__ static inline int lnet_parse_int_tunable(int *value, char *name) diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c 
index 4211e4f..17150a9 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2055,6 +2055,19 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, goto drop; } + if (lnet_isrouter(msg->msg_rxpeer)) { + lnet_peer_set_alive(msg->msg_rxpeer); + if (avoid_asym_router_failure && + LNET_NIDNET(src_nid) != LNET_NIDNET(from_nid)) { + /* received a remote message from router, update + * remote NI status on this router. + * NB: multi-hop routed message will be ignored. + */ + lnet_router_ni_update_locked(msg->msg_rxpeer, + LNET_NIDNET(src_nid)); + } + } + lnet_msg_commit(msg, cpt); /* message delay simulation */ diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 1c0c3af..091cfd9 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -87,7 +87,7 @@ static int check_routers_before_use = 0; CFS_MODULE_PARM(check_routers_before_use, "i", int, 0444, "Assume routers are down and ping them before use"); -static int avoid_asym_router_failure = 1; +int avoid_asym_router_failure = 1; CFS_MODULE_PARM(avoid_asym_router_failure, "i", int, 0644, "Avoid asymmetrical router failures (0 to disable)"); @@ -823,6 +823,21 @@ lnet_wait_known_routerstate(void) } void +lnet_router_ni_update_locked(lnet_peer_t *gw, __u32 net) +{ + lnet_route_t *rte; + + if ((gw->lp_ping_feats & LNET_PING_FEAT_NI_STATUS) != 0) { + list_for_each_entry(rte, &gw->lp_routes, lr_gwlist) { + if (rte->lr_net == net) { + rte->lr_downis = 0; + break; + } + } + } +} + +void lnet_update_ni_status_locked(void) { lnet_ni_t *ni; -- 1.8.3.1