Whamcloud - gitweb
LU-13785 lnet: Use lr_hops for avoid_asym_router_failure 62/39362/7
author Chris Horn <chris.horn@hpe.com>
Tue, 14 Jul 2020 04:08:28 +0000 (23:08 -0500)
committer Oleg Drokin <green@whamcloud.com>
Thu, 15 Apr 2021 06:30:15 +0000 (06:30 +0000)
In order for the asymmetric route failure avoidance feature to work
properly, it needs to know what the hop count of a route should be.
This information is defined by the lr_hops field of the lnet_route.
The lr_single_hop field records what discovery determined the hop
count to actually be (single or multi), based on the last ping reply.
If a remote interface on a router goes missing, the route may be
classified as multi-hop by discovery, but it should be considered
single-hop for the purposes of avoiding asymmetric route failure.

HPE-bug-id: LUS-9099
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I9c255f9a2175d964661850277808dae96ff7735c
Reviewed-on: https://review.whamcloud.com/39362
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/lnet/router.c

index 3d06b45..fb60c56 100644 (file)
@@ -326,7 +326,8 @@ bool lnet_is_route_alive(struct lnet_route *route)
         * that the remote net must exist on the gateway. For multi-hop
         * routes the next-hop will not have the remote net.
         */
-       if (avoid_asym_router_failure && route->lr_single_hop) {
+       if (avoid_asym_router_failure &&
+           (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) {
                rlpn = lnet_peer_get_net_locked(gw, route->lr_net);
                if (!rlpn)
                        return false;
@@ -377,7 +378,8 @@ lnet_consolidate_routes_locked(struct lnet_peer *orig_lp,
 static inline void
 lnet_check_route_inconsistency(struct lnet_route *route)
 {
-       if (!route->lr_single_hop && (int)route->lr_hops <= 1) {
+       if (!route->lr_single_hop &&
+           (route->lr_hops == 1 || route->lr_hops == LNET_UNDEFINED_HOPS)) {
                CWARN("route %s->%s is detected to be multi-hop but hop count is set to %d\n",
                        libcfs_net2str(route->lr_net),
                        libcfs_nid2str(route->lr_gateway->lp_primary_nid),
@@ -493,7 +495,9 @@ lnet_router_discovery_ping_reply(struct lnet_peer *lp)
                }
 
                route->lr_single_hop = single_hop;
-               if (avoid_asym_router_failure && single_hop)
+               if (avoid_asym_router_failure &&
+                   (route->lr_hops == 1 ||
+                    route->lr_hops == LNET_UNDEFINED_HOPS))
                        lnet_set_route_aliveness(route, net_up);
                else
                        lnet_set_route_aliveness(route, true);