When iterating over the peer nets, select the healthiest one.
A node might be able to reach a peer over multiple nets, and therefore
the health of each of these peer nets must be considered.
Test-parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I155888dca358627fcb63c2ed0e51114bc49a9ff1
Reviewed-on: https://review.whamcloud.com/36912
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
}
static inline void
-lnet_set_healthv(atomic_t *healthv, int value)
+lnet_update_peer_net_healthv(struct lnet_peer_ni *lpni)
{
- atomic_set(healthv, value);
+ struct lnet_peer_net *lpn;
+ int best_healthv = 0;
+
+ lpn = lpni->lpni_peer_net;
+
+ list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) {
+ int lpni_healthv = atomic_read(&lpni->lpni_healthv);
+ if (best_healthv < lpni_healthv)
+ best_healthv = lpni_healthv;
+ }
+
+ lpn->lpn_healthv = best_healthv;
+}
+
+static inline void
+lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value)
+{
+ if (atomic_read(&lpni->lpni_healthv) == value)
+ return;
+ atomic_set(&lpni->lpni_healthv, value);
+ lnet_update_peer_net_healthv(lpni);
+}
+
+static inline void
+lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni)
+{
+ /* only adjust the net health if the lpni health value changed */
+ if (atomic_add_unless(&lpni->lpni_healthv, 1, LNET_MAX_HEALTH_VALUE))
+ lnet_update_peer_net_healthv(lpni);
}
static inline void
/* Net ID */
__u32 lpn_net_id;
+ /* peer net health */
+ int lpn_healthv;
+
/* time of last router net check attempt */
time64_t lpn_rtrcheck_timestamp;
{
struct lnet_peer_net *peer_net = NULL;
struct lnet_ni *best_ni = NULL;
+ int lpn_healthv = 0;
/*
* The peer can have multiple interfaces, some of them can be on
*/
if (!lnet_get_net_locked(peer_net->lpn_net_id))
continue;
- best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer,
- peer_net, md_cpt, false);
+
+ /* always select the lpn with the best health */
+ if (lpn_healthv <= peer_net->lpn_healthv)
+ lpn_healthv = peer_net->lpn_healthv;
+ else
+ continue;
+
+ best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net,
+ md_cpt, false);
/*
* if this is a discovery message and lp_disc_net_id is
sensitivity = lp_sensitivity;
lnet_dec_healthv_locked(&lpni->lpni_healthv, sensitivity);
+
+ /* update the peer_net's health value */
+ lnet_update_peer_net_healthv(lpni);
+
/*
* add the peer NI to the recovery queue if it's not already there
* and it's health value is actually below the maximum. It's
* I'm a router, then set that lpni's health to
* maximum so we can commence communication
*/
- if (lnet_isrouter(lpni) || the_lnet.ln_routing)
- lnet_set_healthv(&lpni->lpni_healthv,
- LNET_MAX_HEALTH_VALUE);
- else
- lnet_inc_healthv(&lpni->lpni_healthv);
+ lnet_net_lock(0);
+ if (lnet_isrouter(lpni) || the_lnet.ln_routing) {
+ lnet_set_lpni_healthv_locked(lpni,
+ LNET_MAX_HEALTH_VALUE);
+ } else {
+ lnet_inc_lpni_healthv_locked(lpni);
+ }
+ lnet_net_unlock(0);
}
/* we can finalize this message */
/* Add peer_ni to peer_net */
lpni->lpni_peer_net = lpn;
list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis);
+ lnet_update_peer_net_healthv(lpni);
lnet_peer_net_addref_locked(lpn);
/* Add peer_net to peer */
if (alive) {
if (reset)
- lnet_set_healthv(&lpni->lpni_healthv,
- LNET_MAX_HEALTH_VALUE);
+ lnet_set_lpni_healthv_locked(lpni,
+ LNET_MAX_HEALTH_VALUE);
else
- lnet_inc_healthv(&lpni->lpni_healthv);
+ lnet_inc_lpni_healthv_locked(lpni);
} else {
lnet_handle_remote_failure_locked(lpni);
}