Whamcloud - gitweb
LU-12303 lnet: recover health at same rate as dec 20/36920/9
author Amir Shehata <ashehata@whamcloud.com>
Tue, 3 Dec 2019 01:09:07 +0000 (17:09 -0800)
committer Oleg Drokin <green@whamcloud.com>
Tue, 31 Mar 2020 07:00:06 +0000 (07:00 +0000)
When a send to or over an interface fails, the interface's health
value is decremented by lnet_health_sensitivity. Previously, when
the interface recovered, its health value went up by only 1.
This patch makes incrementing and decrementing the health value
symmetrical: both now change it by lnet_health_sensitivity.

This way, a site that wants to stop using a failing interface quickly,
and to resume using it quickly once it recovers, can set
lnet_health_sensitivity to a value that is large relative to the
maximum health value, which is hard-coded to 1000.
For example, setting lnet_health_sensitivity to 500 reduces the
health value of an interface to 0 after two failed sends and
restores it to the maximum health value after two successful sends.

Test-parameters: trivial

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ib5cedb063d9ccf79c574edac291551f8c94bcce4
Reviewed-on: https://review.whamcloud.com/36920
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/router.c

index 87708bf..c7ec6d1 100644 (file)
@@ -997,18 +997,50 @@ lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value)
        lnet_update_peer_net_healthv(lpni);
 }
 
        lnet_update_peer_net_healthv(lpni);
 }
 
+/*
+ * Atomically add 'a' to the counter 'v', saturating at the ceiling 'u'.
+ *
+ * The update is a compare-and-swap retry loop: each pass computes the
+ * new value from the last value observed in 'v' and retries if another
+ * updater changed 'v' in between.
+ *
+ * v: counter to update
+ * a: amount to add; adding 0 is a no-op
+ * u: ceiling; 'v' is set to exactly 'u' if the sum would reach or
+ *    pass it
+ *
+ * Returns true if this call stored a new value in 'v'. Returns false if
+ * this call left 'v' untouched: 'a' is 0, or 'v' was observed at 'u'.
+ */
+static inline bool
+lnet_atomic_add_unless_max(atomic_t *v, int a, int u)
+{
+       int c;
+       int old;
+       int m;
+
+       /* adding nothing changes nothing: do not report a modification */
+       if (a == 0)
+               return false;
+
+       c = atomic_read(v);
+
+       while (c != u) {
+               /*
+                * Same test as "c + a >= u", written so that no sum is
+                * formed that could overflow for a large 'a'.
+                */
+               if (a >= u - c)
+                       m = u;
+               else
+                       m = c + a;
+
+               old = atomic_cmpxchg(v, c, m);
+               if (old == c)
+                       return true;
+
+               /* lost a race: retry from the value that is there now */
+               c = old;
+       }
+
+       return false;
+}
+
 static inline void
 static inline void
-lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni)
+lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value)
 {
        /* only adjust the net health if the lpni health value changed */
 {
        /* only adjust the net health if the lpni health value changed */
-       if (atomic_add_unless(&lpni->lpni_healthv, 1, LNET_MAX_HEALTH_VALUE))
+       if (lnet_atomic_add_unless_max(&lpni->lpni_healthv, value,
+                                      LNET_MAX_HEALTH_VALUE))
                lnet_update_peer_net_healthv(lpni);
 }
 
 static inline void
                lnet_update_peer_net_healthv(lpni);
 }
 
 static inline void
-lnet_inc_healthv(atomic_t *healthv)
+lnet_inc_healthv(atomic_t *healthv, int value)
 {
 {
-       atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE);
+       lnet_atomic_add_unless_max(healthv, value, LNET_MAX_HEALTH_VALUE);
 }
 
 void lnet_incr_stats(struct lnet_element_stats *stats,
 }
 
 void lnet_incr_stats(struct lnet_element_stats *stats,
index 92e19c5..17deed5 100644 (file)
@@ -3653,7 +3653,8 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info,
                 * In the peer case, it'll naturally be incremented
                 */
                if (!unlink_event)
                 * In the peer case, it'll naturally be incremented
                 */
                if (!unlink_event)
-                       lnet_inc_healthv(&ni->ni_healthv);
+                       lnet_inc_healthv(&ni->ni_healthv,
+                                        lnet_health_sensitivity);
        } else {
                struct lnet_peer_ni *lpni;
                int cpt;
        } else {
                struct lnet_peer_ni *lpni;
                int cpt;
index d1d3835..e92b5c8 100644 (file)
@@ -838,7 +838,7 @@ lnet_health_check(struct lnet_msg *msg)
                 * increment the local ni health whether we successfully
                 * received or sent a message on it.
                 */
                 * increment the local ni health whether we successfully
                 * received or sent a message on it.
                 */
-               lnet_inc_healthv(&ni->ni_healthv);
+               lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity);
                /*
                 * It's possible msg_txpeer is NULL in the LOLND
                 * case. Only increment the peer's health if we're
                /*
                 * It's possible msg_txpeer is NULL in the LOLND
                 * case. Only increment the peer's health if we're
@@ -858,7 +858,12 @@ lnet_health_check(struct lnet_msg *msg)
                                lnet_set_lpni_healthv_locked(lpni,
                                        LNET_MAX_HEALTH_VALUE);
                        } else {
                                lnet_set_lpni_healthv_locked(lpni,
                                        LNET_MAX_HEALTH_VALUE);
                        } else {
-                               lnet_inc_lpni_healthv_locked(lpni);
+                               __u32 sensitivity = lpni->lpni_peer_net->
+                                       lpn_peer->lp_health_sensitivity;
+
+                               lnet_inc_lpni_healthv_locked(lpni,
+                                       (sensitivity) ? sensitivity :
+                                       lnet_health_sensitivity);
                        }
                        lnet_net_unlock(0);
                }
                        }
                        lnet_net_unlock(0);
                }
index 0a30608..8fa1dc1 100644 (file)
@@ -1738,11 +1738,17 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
        }
 
        if (alive) {
        }
 
        if (alive) {
-               if (reset)
+               if (reset) {
                        lnet_set_lpni_healthv_locked(lpni,
                                                     LNET_MAX_HEALTH_VALUE);
                        lnet_set_lpni_healthv_locked(lpni,
                                                     LNET_MAX_HEALTH_VALUE);
-               else
-                       lnet_inc_lpni_healthv_locked(lpni);
+               } else {
+                       __u32 sensitivity = lpni->lpni_peer_net->
+                                       lpn_peer->lp_health_sensitivity;
+
+                       lnet_inc_lpni_healthv_locked(lpni,
+                                       (sensitivity) ? sensitivity :
+                                       lnet_health_sensitivity);
+               }
        } else {
                lnet_handle_remote_failure_locked(lpni);
        }
        } else {
                lnet_handle_remote_failure_locked(lpni);
        }