Whamcloud - gitweb
LU-13912 lnet: Correct the router ping interval calculation 94/39694/7
author: Chris Horn <chris.horn@hpe.com>
Mon, 17 Aug 2020 21:02:10 +0000 (16:02 -0500)
committer: Oleg Drokin <green@whamcloud.com>
Tue, 11 May 2021 22:54:02 +0000 (22:54 +0000)
The router ping interval is being divided by the number of local nets,
which results in sending pings more frequently than defined by
alive_router_check_interval. In addition, the current code is structured
such that we may not find a peer net in need of a ping until after
inspecting the router list multiple times. Re-work the code so that the
loop that inspects a router's peer nets will look at all of them until
it either loops back around the list or it finds one that actually
needs to be pinged.

We also move the check of LNET_PEER_RTR_DISCOVERY so that we avoid the
work of inspecting the router's peer nets if the router is already being
discovered.

Test-Parameters: trivial
HPE-bug-id: LUS-9237
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: I5a4733002f29c0ade6aee62b4424313d5d245556
Reviewed-on: https://review.whamcloud.com/39694
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Neil Brown <neilb@suse.de>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/lnet/router.c

index 51afa78..ed52e0a 100644 (file)
@@ -810,8 +810,8 @@ struct lnet_peer_net {
        /* peer net health */
        int                     lpn_healthv;
 
-       /* time of last router net check attempt */
-       time64_t                lpn_rtrcheck_timestamp;
+       /* time of next router ping on this net */
+       time64_t                lpn_next_ping;
 
        /* selection sequence number */
        __u32                   lpn_seq;
index e5dd77b..50441b3 100644 (file)
@@ -619,6 +619,7 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route)
        unsigned int offset = 0;
        unsigned int len = 0;
        struct list_head *e;
+       time64_t now;
 
        lnet_shuffle_seed();
 
@@ -641,9 +642,10 @@ lnet_add_route_to_rnet(struct lnet_remotenet *rnet, struct lnet_route *route)
         * force a router check on the gateway to make sure the route is
         * alive
         */
+       now = ktime_get_real_seconds();
        list_for_each_entry(lpn, &route->lr_gateway->lp_peer_nets,
                            lpn_peer_nets) {
-               lpn->lpn_rtrcheck_timestamp = 0;
+               lpn->lpn_next_ping = now;
        }
 
        the_lnet.ln_remote_nets_version++;
@@ -1137,12 +1139,13 @@ bool lnet_router_checker_active(void)
 void
 lnet_check_routers(void)
 {
-       struct lnet_peer_net *first_lpn = NULL;
+       struct lnet_peer_net *first_lpn;
        struct lnet_peer_net *lpn;
        struct lnet_peer_ni *lpni;
        struct list_head *entry;
        struct lnet_peer *rtr;
        bool push = false;
+       bool needs_ping;
        bool found_lpn;
        __u64 version;
        __u32 net_id;
@@ -1158,15 +1161,18 @@ rescan:
                rtr = list_entry(entry, struct lnet_peer,
                                 lp_rtr_list);
 
+               /* If we're currently discovering the peer then don't
+                * issue another discovery
+                */
+               if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY)
+                       continue;
+
                now = ktime_get_real_seconds();
 
-               /*
-                * only discover the router if we've passed
-                * alive_router_check_interval seconds. Some of the router
-                * interfaces could be down and in that case they would be
-                * undergoing recovery separately from this discovery.
-                */
-               /* find next peer net which is also local */
+               /* find the next local peer net which needs to be ping'd */
+               needs_ping = false;
+               first_lpn = NULL;
+               found_lpn = false;
                net_id = rtr->lp_disc_net_id;
                do {
                        lpn = lnet_get_next_peer_net_locked(rtr, net_id);
@@ -1175,13 +1181,27 @@ rescan:
                                libcfs_nid2str(rtr->lp_primary_nid));
                                break;
                        }
+
+                       /* We looped back to the first peer net */
                        if (first_lpn == lpn)
                                break;
                        if (!first_lpn)
                                first_lpn = lpn;
-                       found_lpn = lnet_islocalnet_locked(lpn->lpn_net_id);
+
                        net_id = lpn->lpn_net_id;
-               } while (!found_lpn);
+                       if (!lnet_islocalnet_locked(net_id))
+                               continue;
+
+                       found_lpn = true;
+
+                       CDEBUG(D_NET, "rtr %s(%p) %s(%p) next ping %lld\n",
+                              libcfs_nid2str(rtr->lp_primary_nid), rtr,
+                              libcfs_net2str(net_id), lpn,
+                              lpn->lpn_next_ping);
+
+                       needs_ping = now >= lpn->lpn_next_ping;
+
+               } while (!needs_ping);
 
                if (!found_lpn || !lpn) {
                        CERROR("no local network found for gateway %s\n",
@@ -1189,19 +1209,10 @@ rescan:
                        continue;
                }
 
-               if (now - lpn->lpn_rtrcheck_timestamp <
-                   alive_router_check_interval / lnet_current_net_count)
-                      continue;
+               if (!needs_ping)
+                       continue;
 
-               /*
-                * If we're currently discovering the peer then don't
-                * issue another discovery
-                */
                spin_lock(&rtr->lp_lock);
-               if (rtr->lp_state & LNET_PEER_RTR_DISCOVERY) {
-                       spin_unlock(&rtr->lp_lock);
-                       continue;
-               }
                /* make sure we fully discover the router */
                rtr->lp_state &= ~LNET_PEER_NIDS_UPTODATE;
                rtr->lp_state |= LNET_PEER_FORCE_PING | LNET_PEER_FORCE_PUSH |
@@ -1225,16 +1236,16 @@ rescan:
                       libcfs_nid2str(lpni->lpni_nid), cpt);
                rc = lnet_discover_peer_locked(lpni, cpt, false);
 
-               /* decrement ref count acquired by find_peer_ni_locked() */
+               /* drop ref taken above */
                lnet_peer_ni_decref_locked(lpni);
 
                if (!rc)
-                       lpn->lpn_rtrcheck_timestamp = now;
+                       lpn->lpn_next_ping = now + alive_router_check_interval;
                else
                        CERROR("Failed to discover router %s\n",
                               libcfs_nid2str(rtr->lp_primary_nid));
 
-               /* NB dropped lock */
+               /* NB cpt lock was dropped in lnet_discover_peer_locked() */
                if (version != the_lnet.ln_routers_version) {
                        /* the routers list has changed */
                        goto rescan;