When a send to or over an interface fails, the interface's health
value is decremented by lnet_health_sensitivity.
Previously, when the interface recovered, the health value went up by only 1.
This patch makes incrementing and decrementing the health value
symmetrical: both now change it by lnet_health_sensitivity.
This way, a site that wants to stop using, and later resume using, an
interface quickly can set lnet_health_sensitivity to a value that is large
relative to the maximum health value, which is hard-coded to 1000.
For example, setting lnet_health_sensitivity to 500 will reduce the
health value of an interface to 0 after two failed sends and
restore it to the maximum health value after two successful sends.
Test-parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: Ib5cedb063d9ccf79c574edac291551f8c94bcce4
Reviewed-on: https://review.whamcloud.com/36920
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet_update_peer_net_healthv(lpni);
}
lnet_update_peer_net_healthv(lpni);
}
+/**
+ * Atomically add @a to @v, clamping the result at @u.
+ *
+ * An addition that would reach or overshoot @u stores @u instead.
+ *
+ * \param v	atomic counter to update
+ * \param a	amount to add
+ * \param u	upper bound for the stored value
+ *
+ * \retval true		the stored value was changed by this call
+ * \retval false	nothing changed: @a is 0, @v already equals @u,
+ *			or @v was found equal to @u while retrying
+ */
+static inline bool
+lnet_atomic_add_unless_max(atomic_t *v, int a, int u)
+{
+	int c = atomic_read(v);
+	int old;
+	int m;
+
+	/*
+	 * A zero increment can never change the value, so do not report
+	 * a modification (callers use the result to decide whether
+	 * dependent state needs to be recomputed).
+	 */
+	if (a == 0 || c == u)
+		return false;
+
+	for (;;) {
+		/* same test as c + a >= u, but cannot overflow on a large a */
+		m = (a >= u - c) ? u : c + a;
+		old = atomic_cmpxchg(v, c, m);
+
+		/* the exchange took effect: the value is now m */
+		if (old == c)
+			return true;
+
+		/* the value is already at the limit; nothing left to add */
+		if (old == u)
+			return false;
+
+		/* the value changed under us; retry from what we observed */
+		c = old;
+	}
+}
+
+
-lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni)
+lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value)
{
/* only adjust the net health if the lpni health value changed */
{
/* only adjust the net health if the lpni health value changed */
- if (atomic_add_unless(&lpni->lpni_healthv, 1, LNET_MAX_HEALTH_VALUE))
+ if (lnet_atomic_add_unless_max(&lpni->lpni_healthv, value,
+ LNET_MAX_HEALTH_VALUE))
lnet_update_peer_net_healthv(lpni);
}
static inline void
lnet_update_peer_net_healthv(lpni);
}
static inline void
-lnet_inc_healthv(atomic_t *healthv)
+lnet_inc_healthv(atomic_t *healthv, int value)
- atomic_add_unless(healthv, 1, LNET_MAX_HEALTH_VALUE);
+ lnet_atomic_add_unless_max(healthv, value, LNET_MAX_HEALTH_VALUE);
}
void lnet_incr_stats(struct lnet_element_stats *stats,
}
void lnet_incr_stats(struct lnet_element_stats *stats,
* In the peer case, it'll naturally be incremented
*/
if (!unlink_event)
* In the peer case, it'll naturally be incremented
*/
if (!unlink_event)
- lnet_inc_healthv(&ni->ni_healthv);
+ lnet_inc_healthv(&ni->ni_healthv,
+ lnet_health_sensitivity);
} else {
struct lnet_peer_ni *lpni;
int cpt;
} else {
struct lnet_peer_ni *lpni;
int cpt;
* increment the local ni health whether we successfully
* received or sent a message on it.
*/
* increment the local ni health whether we successfully
* received or sent a message on it.
*/
- lnet_inc_healthv(&ni->ni_healthv);
+ lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity);
/*
* It's possible msg_txpeer is NULL in the LOLND
* case. Only increment the peer's health if we're
/*
* It's possible msg_txpeer is NULL in the LOLND
* case. Only increment the peer's health if we're
lnet_set_lpni_healthv_locked(lpni,
LNET_MAX_HEALTH_VALUE);
} else {
lnet_set_lpni_healthv_locked(lpni,
LNET_MAX_HEALTH_VALUE);
} else {
- lnet_inc_lpni_healthv_locked(lpni);
+ __u32 sensitivity = lpni->lpni_peer_net->
+ lpn_peer->lp_health_sensitivity;
+
+ lnet_inc_lpni_healthv_locked(lpni,
+ (sensitivity) ? sensitivity :
+ lnet_health_sensitivity);
lnet_set_lpni_healthv_locked(lpni,
LNET_MAX_HEALTH_VALUE);
lnet_set_lpni_healthv_locked(lpni,
LNET_MAX_HEALTH_VALUE);
- else
- lnet_inc_lpni_healthv_locked(lpni);
+ } else {
+ __u32 sensitivity = lpni->lpni_peer_net->
+ lpn_peer->lp_health_sensitivity;
+
+ lnet_inc_lpni_healthv_locked(lpni,
+ (sensitivity) ? sensitivity :
+ lnet_health_sensitivity);
+ }
} else {
lnet_handle_remote_failure_locked(lpni);
}
} else {
lnet_handle_remote_failure_locked(lpni);
}