Whamcloud - gitweb
LU-13569 lnet: Recover peer NI w/exponential backoff interval 20/39720/15
author Chris Horn <chris.horn@hpe.com>
Sun, 23 Aug 2020 15:16:18 +0000 (10:16 -0500)
committer Oleg Drokin <green@whamcloud.com>
Tue, 30 Mar 2021 04:16:05 +0000 (04:16 +0000)
Perform LNet recovery pings of peer NIs with an exponential backoff
interval.
 - The interval is 2^(number of failed pings) seconds, up to a maximum
   of 900 seconds (15 minutes).
 - When a message is received the count of failed pings for the
   associated peer NI is reset to 0 so that recovery can happen more
   quickly.

Test-Parameters: trivial
HPE-bug-id: LUS-9109
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ic7e60455015a0236a96010c07fc0ddd02078cf92
Reviewed-on: https://review.whamcloud.com/39720
Reviewed-by: Neil Brown <neilb@suse.de>
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/lnet/lib-move.c
lnet/lnet/lib-msg.c
lnet/lnet/peer.c

index bb5eed0..1358f6f 100644 (file)
@@ -986,6 +986,28 @@ lnet_peer_needs_push(struct lnet_peer *lp)
        return false;
 }
 
+/* Longest delay, in seconds, allowed between two recovery pings. */
+#define LNET_RECOVERY_INTERVAL_MAX 900
+
+/*
+ * Compute the time at which the next recovery ping should be sent.
+ *
+ * ping_count: number of recovery pings counted so far; selects the
+ *             backoff step.
+ * now:        current time, in seconds.
+ *
+ * Returns now + 2^ping_count while ping_count <= 9, and
+ * now + LNET_RECOVERY_INTERVAL_MAX for any larger count.
+ *
+ * The result is a timestamp derived from a time64_t, so it is returned
+ * as time64_t; returning unsigned int would drop its upper bits.
+ */
+static inline time64_t
+lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now)
+{
+       unsigned int interval;
+
+       /* 2^9 = 512 still fits under the cap while 2^10 = 1024 does not,
+        * so every count above 9 is clamped to the maximum interval.
+        */
+       if (ping_count > 9)
+               interval = LNET_RECOVERY_INTERVAL_MAX;
+       else
+               interval = 1U << ping_count;
+
+       return now + interval;
+}
+
+/*
+ * Record in lpni->lpni_next_ping when the next recovery ping for this
+ * peer NI is due, based on its current lpni_ping_count and the time now.
+ */
+static inline void
+lnet_peer_ni_set_next_ping(struct lnet_peer_ni *lpni, time64_t now)
+{
+       unsigned int pings = lpni->lpni_ping_count;
+
+       lpni->lpni_next_ping = lnet_get_next_recovery_ping(pings, now);
+}
+
 /*
  * A peer NI is alive if it satisfies the following two conditions:
  *  1. peer NI health >= LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage
index 2bf7039..010cf32 100644 (file)
@@ -586,6 +586,12 @@ struct lnet_peer_ni {
        atomic_t                lpni_healthv;
        /* recovery ping mdh */
        struct lnet_handle_md   lpni_recovery_ping_mdh;
+       /* When to send the next recovery ping */
+       time64_t                lpni_next_ping;
+       /* How many pings sent during current recovery period did not receive
+        * a reply. NB: reset whenever _any_ message arrives from this peer NI
+        */
+       unsigned int            lpni_ping_count;
        /* CPT this peer attached on */
        int                     lpni_cpt;
        /* state flags -- protected by lpni_lock */
index 5987b1b..0cc7767 100644 (file)
@@ -3653,6 +3653,12 @@ lnet_recover_peer_nis(void)
                }
 
                spin_unlock(&lpni->lpni_lock);
+
+               if (now < lpni->lpni_next_ping) {
+                       lnet_net_unlock(0);
+                       continue;
+               }
+
                lnet_net_unlock(0);
 
                /*
@@ -3702,6 +3708,8 @@ lnet_recover_peer_nis(void)
                                continue;
                        }
 
+                       lpni->lpni_ping_count++;
+
                        lpni->lpni_recovery_ping_mdh = mdh;
 
                        lnet_peer_ni_add_to_recoveryq_locked(lpni,
index 94f2a78..3015fd8 100644 (file)
@@ -867,8 +867,11 @@ lnet_health_check(struct lnet_msg *msg)
        switch (hstatus) {
        case LNET_MSG_STATUS_OK:
                /*
-                * increment the local ni health weather we successfully
+                * increment the local ni health whether we successfully
                 * received or sent a message on it.
+                *
+                * Ping counts are reset to 0 as appropriate to allow for
+                * faster recovery.
                 */
                lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity);
                /*
@@ -880,6 +883,7 @@ lnet_health_check(struct lnet_msg *msg)
                 * as indication that the router is fully healthy.
                 */
                if (lpni && msg->msg_rx_committed) {
+                       lpni->lpni_ping_count = 0;
                        /*
                         * If we're receiving a message from the router or
                         * I'm a router, then set that lpni's health to
index 903bf78..04765cf 100644 (file)
@@ -4029,14 +4029,22 @@ lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni,
                CDEBUG(D_NET, "lpni %s aged out last alive %lld\n",
                       libcfs_nid2str(lpni->lpni_nid),
                       lpni->lpni_last_alive);
+               /* Reset the ping count so that if this peer NI is added back to
+                * the recovery queue we will send the first ping right away.
+                */
+               lpni->lpni_ping_count = 0;
                return;
        }
 
        /* This peer NI is going on the recovery queue, so take a ref on it */
        lnet_peer_ni_addref_locked(lpni);
 
-       CDEBUG(D_NET, "%s added to recovery queue. last alive: %lld health: %d\n",
+       lnet_peer_ni_set_next_ping(lpni, now);
+
+       CDEBUG(D_NET, "%s added to recovery queue. ping count: %u next ping: %lld last alive: %lld health: %d\n",
               libcfs_nid2str(lpni->lpni_nid),
+              lpni->lpni_ping_count,
+              lpni->lpni_next_ping,
               lpni->lpni_last_alive,
               atomic_read(&lpni->lpni_healthv));