Whamcloud - gitweb
LU-9120 lnet: add health value per ni 61/32761/10
author Amir Shehata <amir.shehata@intel.com>
Fri, 16 Feb 2018 22:10:33 +0000 (14:10 -0800)
committer Amir Shehata <ashehata@whamcloud.com>
Fri, 17 Aug 2018 19:52:54 +0000 (19:52 +0000)
Add a health value per local network interface (NI). The health value
reflects the health of the NI and is initialized to 1000. The value 1000
was chosen so that health can be decremented in fine-grained steps on error.

If the NI is absolutely not healthy, that will be indicated by an
LND event, which will flag that the NI is down and should never
be used.

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I0fb362a84c110f482633fb86a81c4d7b26c3ecba
Reviewed-on: https://review.whamcloud.com/32761
Tested-by: Jenkins
Reviewed-by: Sonia Sharma <sharmaso@whamcloud.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Chris Horn <hornc@cray.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/api-ni.c
lnet/lnet/lib-move.c

index f8a208d..bed5244 100644 (file)
@@ -897,7 +897,6 @@ int lnet_get_peer_ni_info(__u32 peer_index, __u64 *nid,
                          __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
                          __u32 *peer_tx_qnob);
 
                          __u32 *peer_rtr_credits, __u32 *peer_min_rtr_credtis,
                          __u32 *peer_tx_qnob);
 
-
 static inline bool
 lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni)
 {
 static inline bool
 lnet_is_peer_ni_healthy_locked(struct lnet_peer_ni *lpni)
 {
index e96b544..7528c3c 100644 (file)
 
 #define LNET_MAX_IOV           (LNET_MAX_PAYLOAD >> PAGE_SHIFT)
 
 
 #define LNET_MAX_IOV           (LNET_MAX_PAYLOAD >> PAGE_SHIFT)
 
+/*
+ * This is the maximum health value.
+ * All local and peer NIs created have their health default to this value.
+ */
+#define LNET_MAX_HEALTH_VALUE 1000
+
 /* forward refs */
 struct lnet_libmd;
 
 /* forward refs */
 struct lnet_libmd;
 
@@ -410,6 +416,15 @@ struct lnet_ni {
        __u32                   ni_seq;
 
        /*
        __u32                   ni_seq;
 
        /*
+        * health value
+        *      initialized to LNET_MAX_HEALTH_VALUE
+        * Value is decremented every time we fail to send a message over
+        * this NI because of a NI specific failure.
+        * Value is incremented if we successfully send a message.
+        */
+       atomic_t                ni_healthv;
+
+       /*
         * equivalent interfaces to use
         * This is an array because socklnd bonding can still be configured
         */
         * equivalent interfaces to use
         * This is an array because socklnd bonding can still be configured
         */
index f42ca74..d10ff58 100644 (file)
@@ -1833,6 +1833,7 @@ lnet_startup_lndni(struct lnet_ni *ni, struct lnet_lnd_tunables *tun)
 
        atomic_set(&ni->ni_tx_credits,
                   lnet_ni_tq_credits(ni) * ni->ni_ncpts);
 
        atomic_set(&ni->ni_tx_credits,
                   lnet_ni_tq_credits(ni) * ni->ni_ncpts);
+       atomic_set(&ni->ni_healthv, LNET_MAX_HEALTH_VALUE);
 
        CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
                libcfs_nid2str(ni->ni_nid),
 
        CDEBUG(D_LNI, "Added LNI %s [%d/%d/%d/%d]\n",
                libcfs_nid2str(ni->ni_nid),
index 383cc82..9d50948 100644 (file)
@@ -1452,6 +1452,7 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
        struct lnet_ni *ni = NULL;
        unsigned int shortest_distance;
        int best_credits;
        struct lnet_ni *ni = NULL;
        unsigned int shortest_distance;
        int best_credits;
+       int best_healthv;
 
        /*
         * If there is no peer_ni that we can send to on this network,
 
        /*
         * If there is no peer_ni that we can send to on this network,
@@ -1463,20 +1464,21 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
        if (best_ni == NULL) {
                shortest_distance = UINT_MAX;
                best_credits = INT_MIN;
        if (best_ni == NULL) {
                shortest_distance = UINT_MAX;
                best_credits = INT_MIN;
+               best_healthv = 0;
        } else {
                shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
                                                     best_ni->ni_dev_cpt);
                best_credits = atomic_read(&best_ni->ni_tx_credits);
        } else {
                shortest_distance = cfs_cpt_distance(lnet_cpt_table(), md_cpt,
                                                     best_ni->ni_dev_cpt);
                best_credits = atomic_read(&best_ni->ni_tx_credits);
+               best_healthv = atomic_read(&best_ni->ni_healthv);
        }
 
        while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
                unsigned int distance;
                int ni_credits;
        }
 
        while ((ni = lnet_get_next_ni_locked(local_net, ni))) {
                unsigned int distance;
                int ni_credits;
-
-               if (!lnet_is_ni_healthy_locked(ni))
-                       continue;
+               int ni_healthv;
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
 
                ni_credits = atomic_read(&ni->ni_tx_credits);
+               ni_healthv = atomic_read(&ni->ni_healthv);
 
                /*
                 * calculate the distance from the CPT on which
 
                /*
                 * calculate the distance from the CPT on which
@@ -1501,21 +1503,24 @@ lnet_get_best_ni(struct lnet_net *local_net, struct lnet_ni *best_ni,
                        distance = lnet_numa_range;
 
                /*
                        distance = lnet_numa_range;
 
                /*
-                * Select on shorter distance, then available
+                * Select on health, shorter distance, available
                 * credits, then round-robin.
                 */
                 * credits, then round-robin.
                 */
-               if (distance > shortest_distance) {
+               if (ni_healthv < best_healthv) {
+                       continue;
+               } else if (distance > shortest_distance) {
                        continue;
                } else if (distance < shortest_distance) {
                        shortest_distance = distance;
                } else if (ni_credits < best_credits) {
                        continue;
                } else if (ni_credits == best_credits) {
                        continue;
                } else if (distance < shortest_distance) {
                        shortest_distance = distance;
                } else if (ni_credits < best_credits) {
                        continue;
                } else if (ni_credits == best_credits) {
-                       if (best_ni && (best_ni)->ni_seq <= ni->ni_seq)
+                       if (best_ni && best_ni->ni_seq <= ni->ni_seq)
                                continue;
                }
                best_ni = ni;
                best_credits = ni_credits;
                                continue;
                }
                best_ni = ni;
                best_credits = ni_credits;
+               best_healthv = ni_healthv;
        }
 
        CDEBUG(D_NET, "selected best_ni %s\n",
        }
 
        CDEBUG(D_NET, "selected best_ni %s\n",