From 728822f4b73cd9ca31f9800dfdd013d86cb44faf Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Tue, 19 Nov 2019 19:40:34 -0800 Subject: [PATCH] LU-13025 lnet: pick healthiest peer net When iterating over the peer nets, select the healthiest one. Node might be able to reach a peer over multiple nets, and therefore the health of these peer nets must be considered. Test-parameters: trivial Signed-off-by: Amir Shehata Change-Id: I155888dca358627fcb63c2ed0e51114bc49a9ff1 Reviewed-on: https://review.whamcloud.com/36912 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Serguei Smirnov Reviewed-by: Chris Horn Reviewed-by: Oleg Drokin --- lnet/include/lnet/lib-lnet.h | 32 ++++++++++++++++++++++++++++++-- lnet/include/lnet/lib-types.h | 3 +++ lnet/lnet/lib-move.c | 12 ++++++++++-- lnet/lnet/lib-msg.c | 17 ++++++++++++----- lnet/lnet/peer.c | 1 + lnet/lnet/router.c | 6 +++--- 6 files changed, 59 insertions(+), 12 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 45e5b6d..3199853 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -967,9 +967,37 @@ lnet_is_peer_ni_alive(struct lnet_peer_ni *lpni) } static inline void -lnet_set_healthv(atomic_t *healthv, int value) +lnet_update_peer_net_healthv(struct lnet_peer_ni *lpni) { - atomic_set(healthv, value); + struct lnet_peer_net *lpn; + int best_healthv = 0; + + lpn = lpni->lpni_peer_net; + + list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) { + int lpni_healthv = atomic_read(&lpni->lpni_healthv); + if (best_healthv < lpni_healthv) + best_healthv = lpni_healthv; + } + + lpn->lpn_healthv = best_healthv; +} + +static inline void +lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) +{ + if (atomic_read(&lpni->lpni_healthv) == value) + return; + atomic_set(&lpni->lpni_healthv, value); + lnet_update_peer_net_healthv(lpni); +} + +static inline void +lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni) +{ + /* only adjust the net health if the lpni health value changed */ + if (atomic_add_unless(&lpni->lpni_healthv, 1, LNET_MAX_HEALTH_VALUE)) + lnet_update_peer_net_healthv(lpni); } static inline void diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index f6aa0d7..d226dec 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -758,6 +758,9 @@ struct lnet_peer_net { /* Net ID */ __u32 lpn_net_id; + /* peer net health */ + int lpn_healthv; + /* time of last router net check attempt */ time64_t lpn_rtrcheck_timestamp; diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index d93374b..e78fe63 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -2197,6 +2197,7 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, { struct lnet_peer_net *peer_net = NULL; struct lnet_ni *best_ni = NULL; + int lpn_healthv = 0; /* * The peer can have multiple interfaces, some of them can be on @@ -2213,8 +2214,15 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt, */ if (!lnet_get_net_locked(peer_net->lpn_net_id)) continue; - best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, - peer_net, md_cpt, false); + + /* always select the lpn with the best health */ + if (lpn_healthv <= peer_net->lpn_healthv) + lpn_healthv = peer_net->lpn_healthv; + else + continue; + + best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net, + md_cpt, false); /* * if this is a discovery message and lp_disc_net_id is diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 225ae3e..a00ba31 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -503,6 +503,10 @@ lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni) sensitivity = lp_sensitivity; lnet_dec_healthv_locked(&lpni->lpni_healthv, sensitivity); + + /* update the peer_net's health value */ + lnet_update_peer_net_healthv(lpni); + /* * add the peer NI to the recovery queue if it's not already there * and it's health value is actually below the maximum. It's @@ -849,11 +853,14 @@ lnet_health_check(struct lnet_msg *msg) * I'm a router, then set that lpni's health to * maximum so we can commence communication */ - if (lnet_isrouter(lpni) || the_lnet.ln_routing) - lnet_set_healthv(&lpni->lpni_healthv, - LNET_MAX_HEALTH_VALUE); - else - lnet_inc_healthv(&lpni->lpni_healthv); + lnet_net_lock(0); + if (lnet_isrouter(lpni) || the_lnet.ln_routing) { + lnet_set_lpni_healthv_locked(lpni, + LNET_MAX_HEALTH_VALUE); + } else { + lnet_inc_lpni_healthv_locked(lpni); + } + lnet_net_unlock(0); } /* we can finalize this message */ diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c index cd2dd21..19f256b 100644 --- a/lnet/lnet/peer.c +++ b/lnet/lnet/peer.c @@ -1275,6 +1275,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp, /* Add peer_ni to peer_net */ lpni->lpni_peer_net = lpn; list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis); + lnet_update_peer_net_healthv(lpni); lnet_peer_net_addref_locked(lpn); /* Add peer_net to peer */ diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 4ab65f7..4ac28d5 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -1642,10 +1642,10 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, if (alive) { if (reset) - lnet_set_healthv(&lpni->lpni_healthv, - LNET_MAX_HEALTH_VALUE); + lnet_set_lpni_healthv_locked(lpni, + LNET_MAX_HEALTH_VALUE); else - lnet_inc_healthv(&lpni->lpni_healthv); + lnet_inc_lpni_healthv_locked(lpni); } else { lnet_handle_remote_failure_locked(lpni); } -- 1.8.3.1