From d54afb86116c0640d7a201571b337042c87a3e40 Mon Sep 17 00:00:00 2001 From: Amir Shehata Date: Fri, 16 Feb 2018 14:10:33 -0800 Subject: [PATCH 1/1] LU-9120 lnet: add health value per ni Add a health value per local network interface. The health value reflects the health of the NI. It is initialized to 1000. 1000 is chosen to be able to granularly decrement the health value on error. If the NI is absolutely not healthy that will be indicated by an LND event, which will flag that the NI is down and should never be used. Test-Parameters: forbuildonly Signed-off-by: Amir Shehata Change-Id: I0fb362a84c110f482633fb86a81c4d7b26c3ecba Reviewed-on: https://review.whamcloud.com/32761 Tested-by: Jenkins Reviewed-by: Sonia Sharma Reviewed-by: Olaf Weber Reviewed-by: Chris Horn --- lnet/include/lnet/lib-lnet.h | 1 - lnet/include/lnet/lib-types.h | 15 +++++++++++++++ lnet/lnet/api-ni.c | 1 + lnet/lnet/lib-move.c | 17 +++++++++++------ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index f8a208d..bed5244 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -897,7 +897,6 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid, __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis, __u32 *peer_tx_qnob); - static inline bool lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni) { diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index e96b544..7528c3c 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -55,6 +55,12 @@ #define LNET_MAX_IOV (LNET_MAX_PAYLOAD >> PAGE_SHIFT) +/* + * This is the maximum health value. + * All local and peer NIs created have their health default to this value. + */ +#define LNET_MAX_HEALTH_VALUE 1000 + /* forward refs */ struct lnet_libmd; @@ -410,6 +416,15 @@ struct lnet_ni { __u32 ni_seq; /* + * health value + * initialized to LNET_MAX_HEALTH_VALUE + * Value is decremented every time we fail to send a message over + * this NI because of a NI specific failure. + * Value is incremented if we successfully send a message. + */ + atomic_t ni_healthv; + + /* * equivalent interfaces to use * This is an array because socklnd bonding can still be configured */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index f42ca74..d10ff58 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -1833,6 +1833,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun) atomic_set(&ni->ni_tx_credits, lnet_ni_tq_credits(ni) * ni->ni_ncpts); + atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE); CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n", libcfs_nid2str(ni->ni_nid), diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 383cc82..9d50948 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -1452,6 +1452,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, struct lnet_ni *ni = NULL; unsigned int shortest_distance; int best_credits; + int best_healthv; /* * If there is no peer_ni that we can send to on this network, @@ -1463,20 +1464,21 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, if (best_ni == NULL) { shortest_distance = UINT_MAX; best_credits = INT_MIN; + best_healthv = 0; } else { shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt, best_ni->ni_dev_cpt); best_credits = atomic_read(&best_ni->ni_tx_credits); + best_healthv = atomic_read(&best_ni->ni_healthv); } while ((ni = lnet_get_next_ni_locked(local_net, ni))) { unsigned int distance; int ni_credits; - - if (!lnet_is_ni_healthy_locked(ni)) - continue; + int ni_healthv; ni_credits = atomic_read(&ni->ni_tx_credits); + ni_healthv = atomic_read(&ni->ni_healthv); /* * calculate the distance from the CPT on which @@ -1501,21 +1503,24 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni, distance = lnet_numa_range; /* - * Select on shorter distance, then available + * Select on health, shorter distance, available * credits, then round-robin. */ - if (distance > shortest_distance) { + if (ni_healthv < best_healthv) { + continue; + } else if (distance > shortest_distance) { continue; } else if (distance < shortest_distance) { shortest_distance = distance; } else if (ni_credits < best_credits) { continue; } else if (ni_credits == best_credits) { - if (best_ni && (best_ni)->ni_seq <= ni->ni_seq) + if (best_ni && best_ni->ni_seq <= ni->ni_seq) continue; } best_ni = ni; best_credits = ni_credits; + best_healthv = ni_healthv; } CDEBUG(D_NET, "selected best_ni %s\n", -- 1.8.3.1