Whamcloud - gitweb
LU-13025 lnet: pick healthiest peer net 12/36912/6
author Amir Shehata <ashehata@whamcloud.com>
Wed, 20 Nov 2019 03:40:34 +0000 (19:40 -0800)
committer Oleg Drokin <green@whamcloud.com>
Sun, 1 Mar 2020 05:35:52 +0000 (05:35 +0000)
When iterating over the peer nets, select the healthiest one.
A node might be able to reach a peer over multiple nets, and therefore
the health of each of these peer nets must be considered.

Test-parameters: trivial

Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I155888dca358627fcb63c2ed0e51114bc49a9ff1
Reviewed-on: https://review.whamcloud.com/36912
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c
lnet/lnet/router.c

index 45e5b6d..3199853 100644 (file)
@@ -967,9 +967,37 @@ lnet_is_peer_ni_alive(struct lnet_peer_ni *lpni)
 }
 
 static inline void
-lnet_set_healthv(atomic_t *healthv, int value)
+lnet_update_peer_net_healthv(struct lnet_peer_ni *lpni)
 {
-       atomic_set(healthv, value);
+       struct lnet_peer_net *lpn;
+       int best_healthv = 0;
+
+       lpn = lpni->lpni_peer_net;
+
+       list_for_each_entry(lpni, &lpn->lpn_peer_nis, lpni_peer_nis) {
+               int lpni_healthv = atomic_read(&lpni->lpni_healthv);
+               if (best_healthv < lpni_healthv)
+                       best_healthv = lpni_healthv;
+       }
+
+       lpn->lpn_healthv = best_healthv;
+}
+
+static inline void
+lnet_set_lpni_healthv_locked(struct lnet_peer_ni *lpni, int value)
+{
+       if (atomic_read(&lpni->lpni_healthv) == value)
+               return;
+       atomic_set(&lpni->lpni_healthv, value);
+       lnet_update_peer_net_healthv(lpni);
+}
+
+static inline void
+lnet_inc_lpni_healthv_locked(struct lnet_peer_ni *lpni)
+{
+       /* only adjust the net health if the lpni health value changed */
+       if (atomic_add_unless(&lpni->lpni_healthv, 1, LNET_MAX_HEALTH_VALUE))
+               lnet_update_peer_net_healthv(lpni);
 }
 
 static inline void
index f6aa0d7..d226dec 100644 (file)
@@ -758,6 +758,9 @@ struct lnet_peer_net {
        /* Net ID */
        __u32                   lpn_net_id;
 
+       /* peer net health */
+       int                     lpn_healthv;
+
        /* time of last router net check attempt */
        time64_t                lpn_rtrcheck_timestamp;
 
index d93374b..e78fe63 100644 (file)
@@ -2197,6 +2197,7 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
 {
        struct lnet_peer_net *peer_net = NULL;
        struct lnet_ni *best_ni = NULL;
+       int lpn_healthv = 0;
 
        /*
         * The peer can have multiple interfaces, some of them can be on
@@ -2213,8 +2214,15 @@ lnet_find_best_ni_on_local_net(struct lnet_peer *peer, int md_cpt,
                 */
                if (!lnet_get_net_locked(peer_net->lpn_net_id))
                        continue;
-               best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer,
-                                                  peer_net, md_cpt, false);
+
+               /* always select the lpn with the best health */
+               if (lpn_healthv <= peer_net->lpn_healthv)
+                       lpn_healthv = peer_net->lpn_healthv;
+               else
+                       continue;
+
+               best_ni = lnet_find_best_ni_on_spec_net(best_ni, peer, peer_net,
+                                                       md_cpt, false);
 
                /*
                 * if this is a discovery message and lp_disc_net_id is
index 225ae3e..a00ba31 100644 (file)
@@ -503,6 +503,10 @@ lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni)
                sensitivity = lp_sensitivity;
 
        lnet_dec_healthv_locked(&lpni->lpni_healthv, sensitivity);
+
+       /* update the peer_net's health value */
+       lnet_update_peer_net_healthv(lpni);
+
        /*
         * add the peer NI to the recovery queue if it's not already there
         * and it's health value is actually below the maximum. It's
@@ -849,11 +853,14 @@ lnet_health_check(struct lnet_msg *msg)
                         * I'm a router, then set that lpni's health to
                         * maximum so we can commence communication
                         */
-                       if (lnet_isrouter(lpni) || the_lnet.ln_routing)
-                               lnet_set_healthv(&lpni->lpni_healthv,
-                                                LNET_MAX_HEALTH_VALUE);
-                       else
-                               lnet_inc_healthv(&lpni->lpni_healthv);
+                       lnet_net_lock(0);
+                       if (lnet_isrouter(lpni) || the_lnet.ln_routing) {
+                               lnet_set_lpni_healthv_locked(lpni,
+                                       LNET_MAX_HEALTH_VALUE);
+                       } else {
+                               lnet_inc_lpni_healthv_locked(lpni);
+                       }
+                       lnet_net_unlock(0);
                }
 
                /* we can finalize this message */
index cd2dd21..19f256b 100644 (file)
@@ -1275,6 +1275,7 @@ lnet_peer_attach_peer_ni(struct lnet_peer *lp,
        /* Add peer_ni to peer_net */
        lpni->lpni_peer_net = lpn;
        list_add_tail(&lpni->lpni_peer_nis, &lpn->lpn_peer_nis);
+       lnet_update_peer_net_healthv(lpni);
        lnet_peer_net_addref_locked(lpn);
 
        /* Add peer_net to peer */
index 4ab65f7..4ac28d5 100644 (file)
@@ -1642,10 +1642,10 @@ lnet_notify(struct lnet_ni *ni, lnet_nid_t nid, bool alive, bool reset,
 
        if (alive) {
                if (reset)
-                       lnet_set_healthv(&lpni->lpni_healthv,
-                                        LNET_MAX_HEALTH_VALUE);
+                       lnet_set_lpni_healthv_locked(lpni,
+                                                    LNET_MAX_HEALTH_VALUE);
                else
-                       lnet_inc_healthv(&lpni->lpni_healthv);
+                       lnet_inc_lpni_healthv_locked(lpni);
        } else {
                lnet_handle_remote_failure_locked(lpni);
        }