summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
4e48761)
Add a health value per local network interface (NI). The health value
reflects the health of the NI. It is initialized to 1000; this value
is chosen so that the health value can be decremented in fine-grained
steps on error. If the NI is completely unhealthy, that is indicated
by an LND event, which flags that the NI is down and should never
be used.
Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I0fb362a84c110f482633fb86a81c4d7b26c3ecba
Reviewed-on: https://review.whamcloud.com/32761
Tested-by: Jenkins
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Chris Horn <hornc@cray.com>
__u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
__u32 *peer_tx_qnob);
__u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
__u32 *peer_tx_qnob);
static inline bool
lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni)
{
static inline bool
lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni)
{
#define LNET_MAX_IOV (LNET_MAX_PAYLOAD >> PAGE_SHIFT)
#define LNET_MAX_IOV (LNET_MAX_PAYLOAD >> PAGE_SHIFT)
+/*
+ * This is the maximum health value.
+ * All local and peer NIs created have their health default to this value.
+ */
+#define LNET_MAX_HEALTH_VALUE 1000
+
/* forward refs */
struct lnet_libmd;
/* forward refs */
struct lnet_libmd;
+ * health value
+ * initialized to LNET_MAX_HEALTH_VALUE
+ * Value is decremented every time we fail to send a message over
+ * this NI because of an NI-specific failure.
+ * Value is incremented if we successfully send a message.
+ */
+ atomic_t ni_healthv;
+
+ /*
* equivalent interfaces to use
* This is an array because socklnd bonding can still be configured
*/
* equivalent interfaces to use
* This is an array because socklnd bonding can still be configured
*/
atomic_set(&ni->ni_tx_credits,
lnet_ni_tq_credits(ni) * ni->ni_ncpts);
atomic_set(&ni->ni_tx_credits,
lnet_ni_tq_credits(ni) * ni->ni_ncpts);
+ atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE);
CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
libcfs_nid2str(ni->ni_nid),
CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
libcfs_nid2str(ni->ni_nid),
struct lnet_ni *ni = NULL;
unsigned int shortest_distance;
int best_credits;
struct lnet_ni *ni = NULL;
unsigned int shortest_distance;
int best_credits;
/*
* If there is no peer_ni that we can send to on this network,
/*
* If there is no peer_ni that we can send to on this network,
if (best_ni == NULL) {
shortest_distance = UINT_MAX;
best_credits = INT_MIN;
if (best_ni == NULL) {
shortest_distance = UINT_MAX;
best_credits = INT_MIN;
} else {
shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
best_ni->ni_dev_cpt);
best_credits = atomic_read(&best_ni->ni_tx_credits);
} else {
shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
best_ni->ni_dev_cpt);
best_credits = atomic_read(&best_ni->ni_tx_credits);
+ best_healthv = atomic_read(&best_ni->ni_healthv);
}
while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
unsigned int distance;
int ni_credits;
}
while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
unsigned int distance;
int ni_credits;
-
- if (!lnet_is_ni_healthy_locked(ni))
- continue;
ni_credits = atomic_read(&ni->ni_tx_credits);
ni_credits = atomic_read(&ni->ni_tx_credits);
+ ni_healthv = atomic_read(&ni->ni_healthv);
/*
* calculate the distance from the CPT on which
/*
* calculate the distance from the CPT on which
distance = lnet_numa_range;
/*
distance = lnet_numa_range;
/*
- * Select on shorter distance, then available
+ * Select on health, shorter distance, available
* credits, then round-robin.
*/
* credits, then round-robin.
*/
- if (distance > shortest_distance) {
+ if (ni_healthv < best_healthv) {
+ continue;
+ } else if (distance > shortest_distance) {
continue;
} else if (distance < shortest_distance) {
shortest_distance = distance;
} else if (ni_credits < best_credits) {
continue;
} else if (ni_credits == best_credits) {
continue;
} else if (distance < shortest_distance) {
shortest_distance = distance;
} else if (ni_credits < best_credits) {
continue;
} else if (ni_credits == best_credits) {
- if (best_ni && (best_ni)->ni_seq <= ni->ni_seq)
+ if (best_ni && best_ni->ni_seq <= ni->ni_seq)
continue;
}
best_ni = ni;
best_credits = ni_credits;
continue;
}
best_ni = ni;
best_credits = ni_credits;
+ best_healthv = ni_healthv;
}
CDEBUG(D_NET, "selected best_ni %s\n",
}
CDEBUG(D_NET, "selected best_ni %s\n",