From: Amir Shehata Date: Tue, 3 Dec 2019 01:09:07 +0000 (-0800) Subject: LU-12303 lnet: recover health at same rate as dec X-Git-Tag: 2.13.53~18 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=1d94a29dbc018fd00aa1c8a7a7ae343e0c9a4b83 LU-12303 lnet: recover health at same rate as dec When there is a failure to send to or over an interface, the interface's health value is decremented by lnet_health_sensitivity. Previously, when the interface recovered, its health value went up by only 1. This patch makes incrementing and decrementing the health value symmetrical: both now change it by lnet_health_sensitivity. This way, if a site wants to stop using a failing interface quickly and resume using it quickly once it recovers, it can set lnet_health_sensitivity to a value that is large relative to the maximum health value, which is hard-coded to 1000. For example, setting lnet_health_sensitivity to 500 will reduce the health value of an interface to 0 after two failed sends and restore it to the maximum health value after two successful sends. 
Test-parameters: trivial Signed-off-by: Amir Shehata Change-Id: Ib5cedb063d9ccf79c574edac291551f8c94bcce4 Reviewed-on: https://review.whamcloud.com/36920 Reviewed-by: Serguei Smirnov Reviewed-by: Chris Horn Tested-by: Maloo Tested-by: jenkins Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 87708bf..c7ec6d1 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -997,18 +997,50 @@ lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) lnet_update_peer_net_healthv(lpni); } +static inline bool +lnet_atomic_add_unless_max(atomic_t *v, int a, int u) +{ + int c = atomic_read(v); + bool mod = false; + int old; + int m; + + if (c == u) + return mod; + + for (;;) { + if (c + a >= u) + m = u; + else + m = c + a; + old = atomic_cmpxchg(v, c, m); + + if (old == u) + break; + + if (old == c) { + mod = true; + break; + } + c = old; + } + + return mod; +} + static inline void -lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni) +lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value) { /* only adjust the net health if the lpni health value changed */ - if (atomic_add_unless(&lpni->lpni_healthv, 1, LNET_MAX_HEALTH_VALUE)) + if (lnet_atomic_add_unless_max(&lpni->lpni_healthv, value, + LNET_MAX_HEALTH_VALUE)) lnet_update_peer_net_healthv(lpni); } static inline void -lnet_inc_healthv(atomic_t *healthv) +lnet_inc_healthv(atomic_t *healthv, int value) { - atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE); + lnet_atomic_add_unless_max(healthv, value, LNET_MAX_HEALTH_VALUE); } void lnet_incr_stats(struct lnet_element_stats *stats, diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 92e19c5..17deed5 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -3653,7 +3653,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, * In the peer case, it'll naturally be incremented */ if (!unlink_event) - lnet_inc_healthv(&ni->ni_healthv); + 
lnet_inc_healthv(&ni->ni_healthv, + lnet_health_sensitivity); } else { struct lnet_peer_ni *lpni; int cpt; diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index d1d3835..e92b5c8 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -838,7 +838,7 @@ lnet_health_check(struct lnet_msg *msg) * increment the local ni health weather we successfully * received or sent a message on it. */ - lnet_inc_healthv(&ni->ni_healthv); + lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity); /* * It's possible msg_txpeer is NULL in the LOLND * case. Only increment the peer's health if we're @@ -858,7 +858,12 @@ lnet_health_check(struct lnet_msg *msg) lnet_set_lpni_healthv_locked(lpni, LNET_MAX_HEALTH_VALUE); } else { - lnet_inc_lpni_healthv_locked(lpni); + __u32 sensitivity = lpni->lpni_peer_net-> + lpn_peer->lp_health_sensitivity; + + lnet_inc_lpni_healthv_locked(lpni, + (sensitivity) ? sensitivity : + lnet_health_sensitivity); } lnet_net_unlock(0); } diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 0a306087..8fa1dc1 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -1738,11 +1738,17 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset, } if (alive) { - if (reset) + if (reset) { lnet_set_lpni_healthv_locked(lpni, LNET_MAX_HEALTH_VALUE); - else - lnet_inc_lpni_healthv_locked(lpni); + } else { + __u32 sensitivity = lpni->lpni_peer_net-> + lpn_peer->lp_health_sensitivity; + + lnet_inc_lpni_healthv_locked(lpni, + (sensitivity) ? sensitivity : + lnet_health_sensitivity); + } } else { lnet_handle_remote_failure_locked(lpni); }