From 917553c537a8860f57a50dc9752e3ac69d06c11c Mon Sep 17 00:00:00 2001
From: Chris Horn
Date: Sun, 23 Aug 2020 10:16:18 -0500
Subject: [PATCH] LU-13569 lnet: Recover peer NI w/exponential backoff interval

Perform LNet recovery pings of peer NIs with an exponential backoff
interval.
- The interval is equal to 2^(number failed pings) up to a maximum
  of 900 seconds (15 minutes).
- When a message is received the count of failed pings for the
  associated peer NI is reset to 0 so that recovery can happen more
  quickly.

Test-Parameters: trivial
HPE-bug-id: LUS-9109
Signed-off-by: Chris Horn
Change-Id: Ic7e60455015a0236a96010c07fc0ddd02078cf92
Reviewed-on: https://review.whamcloud.com/39720
Reviewed-by: Neil Brown
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Alexander Boyko
Reviewed-by: Serguei Smirnov
Reviewed-by: Oleg Drokin
---
 lnet/include/lnet/lib-lnet.h  | 22 ++++++++++++++++++++++
 lnet/include/lnet/lib-types.h |  6 ++++++
 lnet/lnet/lib-move.c          |  8 ++++++++
 lnet/lnet/lib-msg.c           |  6 +++++-
 lnet/lnet/peer.c              | 10 +++++++++-
 5 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h
index bb5eed0..1358f6f 100644
--- a/lnet/include/lnet/lib-lnet.h
+++ b/lnet/include/lnet/lib-lnet.h
@@ -986,6 +986,28 @@ lnet_peer_needs_push(struct lnet_peer *lp)
 	return false;
 }
 
+#define LNET_RECOVERY_INTERVAL_MAX 900	/* seconds (15 minutes) */
+static inline time64_t
+lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now)
+{
+	unsigned int interval;
+
+	/* 2^9 = 512, 2^10 = 1024 > max: cap interval once count exceeds 9 */
+	if (ping_count > 9)
+		interval = LNET_RECOVERY_INTERVAL_MAX;
+	else
+		interval = 1 << ping_count;
+
+	return now + interval;
+}
+
+static inline void
+lnet_peer_ni_set_next_ping(struct lnet_peer_ni *lpni, time64_t now)
+{
+	lpni->lpni_next_ping =
+		lnet_get_next_recovery_ping(lpni->lpni_ping_count, now);
+}
+
 /*
  * A peer NI is alive if it satisfies the following two conditions:
  * 1. peer NI health >= LNET_MAX_HEALTH_VALUE * router_sensitivity_percentage
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index 2bf7039..010cf32 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -586,6 +586,12 @@ struct lnet_peer_ni {
 	atomic_t		lpni_healthv;
 	/* recovery ping mdh */
 	struct lnet_handle_md	lpni_recovery_ping_mdh;
+	/* When to send the next recovery ping */
+	time64_t		lpni_next_ping;
+	/* How many pings sent during current recovery period did not receive
+	 * a reply. NB: reset whenever _any_ message arrives from this peer NI
+	 */
+	unsigned int		lpni_ping_count;
 	/* CPT this peer attached on */
 	int			lpni_cpt;
 	/* state flags -- protected by lpni_lock */
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 5987b1b..0cc7767 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -3653,6 +3653,12 @@ lnet_recover_peer_nis(void)
 		}
 
 		spin_unlock(&lpni->lpni_lock);
+
+		if (now < lpni->lpni_next_ping) {
+			lnet_net_unlock(0);
+			continue;
+		}
+
 		lnet_net_unlock(0);
 
 		/*
@@ -3702,6 +3708,8 @@ lnet_recover_peer_nis(void)
 				continue;
 			}
 
+			lpni->lpni_ping_count++;
+
 			lpni->lpni_recovery_ping_mdh = mdh;
 
 			lnet_peer_ni_add_to_recoveryq_locked(lpni,
diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c
index 94f2a78..3015fd8 100644
--- a/lnet/lnet/lib-msg.c
+++ b/lnet/lnet/lib-msg.c
@@ -867,8 +867,11 @@ lnet_health_check(struct lnet_msg *msg)
 	switch (hstatus) {
 	case LNET_MSG_STATUS_OK:
 		/*
-		 * increment the local ni health weather we successfully
+		 * increment the local ni health whether we successfully
 		 * received or sent a message on it.
+		 *
+		 * Ping counts are reset to 0 as appropriate to allow for
+		 * faster recovery.
 		 */
 		lnet_inc_healthv(&ni->ni_healthv, lnet_health_sensitivity);
 		/*
@@ -880,6 +883,7 @@ lnet_health_check(struct lnet_msg *msg)
 		 * as indication that the router is fully healthy.
 		 */
 		if (lpni && msg->msg_rx_committed) {
+			lpni->lpni_ping_count = 0;
 			/*
 			 * If we're receiving a message from the router or
 			 * I'm a router, then set that lpni's health to
diff --git a/lnet/lnet/peer.c b/lnet/lnet/peer.c
index 903bf78..04765cf 100644
--- a/lnet/lnet/peer.c
+++ b/lnet/lnet/peer.c
@@ -4029,14 +4029,22 @@ lnet_peer_ni_add_to_recoveryq_locked(struct lnet_peer_ni *lpni,
 		CDEBUG(D_NET, "lpni %s aged out last alive %lld\n",
 		       libcfs_nid2str(lpni->lpni_nid),
 		       lpni->lpni_last_alive);
+		/* Reset the ping count so that if this peer NI is added back to
+		 * the recovery queue we will send the first ping right away.
+		 */
+		lpni->lpni_ping_count = 0;
 		return;
 	}
 
 	/* This peer NI is going on the recovery queue, so take a ref on it */
 	lnet_peer_ni_addref_locked(lpni);
 
-	CDEBUG(D_NET, "%s added to recovery queue. last alive: %lld health: %d\n",
+	lnet_peer_ni_set_next_ping(lpni, now);
+
+	CDEBUG(D_NET, "%s added to recovery queue. ping count: %u next ping: %lld last alive: %lld health: %d\n",
 	       libcfs_nid2str(lpni->lpni_nid),
+	       lpni->lpni_ping_count,
+	       lpni->lpni_next_ping,
 	       lpni->lpni_last_alive,
 	       atomic_read(&lpni->lpni_healthv));
 
-- 
1.8.3.1