From de2536850ed2ecc2169dec4ccc458589314b2896 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Mon, 22 Aug 2022 13:43:36 -0600 Subject: [PATCH] LU-16214 kfilnd: Proactively handshake old peers If asked to send a message to a peer that we haven't communicated with for some time, then we run the risk of that peer having a stale (or missing) peer entry for us. This can result in the target peer silently dropping our message. To reduce the chance of this happening, proactively handshake any peer we haven't talked to within twice the LND timeout. Note that kfilnd_peer_needs_hello() is called on both the send and receive paths. We only want to proactively handshake on the send path, so an argument is added to this function so it can distinguish between the two situations. HPE-bug-id: LUS-11125 Test-Parameters: trivial Signed-off-by: Chris Horn Change-Id: Iaacb48e5c45305869bd22335ce112b21cf67e848 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48786 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Ian Ziemba Reviewed-by: Oleg Drokin Reviewed-by: Ron Gredvig --- lnet/klnds/kfilnd/kfilnd.c | 2 +- lnet/klnds/kfilnd/kfilnd.h | 12 +++++++++++- lnet/klnds/kfilnd/kfilnd_tn.c | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lnet/klnds/kfilnd/kfilnd.c b/lnet/klnds/kfilnd/kfilnd.c index e4efdd4..5f8d688 100644 --- a/lnet/klnds/kfilnd/kfilnd.c +++ b/lnet/klnds/kfilnd/kfilnd.c @@ -159,7 +159,7 @@ static int kfilnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *msg) return rc; } - if (kfilnd_peer_needs_hello(tn->tn_kp)) { + if (kfilnd_peer_needs_hello(tn->tn_kp, true)) { rc = kfilnd_send_hello_request(dev, cpt, tn->tn_kp); if (rc && kfilnd_peer_is_new_peer(tn->tn_kp)) { /* Only fail the send if this is a new peer. 
Otherwise diff --git a/lnet/klnds/kfilnd/kfilnd.h b/lnet/klnds/kfilnd/kfilnd.h index 4c21532..b8243d8 100644 --- a/lnet/klnds/kfilnd/kfilnd.h +++ b/lnet/klnds/kfilnd/kfilnd.h @@ -273,14 +273,24 @@ static inline bool kfilnd_peer_is_new_peer(struct kfilnd_peer *kp) /* Peer needs hello if it is not up to date and there is not already a hello * in flight. * + * Called from the send path and the receive path. When called from send path + * we additionally consider the peer's last alive value, and proactively + * handshake peers that we haven't talked to in a while. + * * If hello was sent more than LND timeout seconds ago, and we never received a * response, then send another one. */ -static inline bool kfilnd_peer_needs_hello(struct kfilnd_peer *kp) +static inline bool kfilnd_peer_needs_hello(struct kfilnd_peer *kp, + bool proactive_handshake) { if (atomic_read(&kp->kp_hello_pending) == 0) { if (atomic_read(&kp->kp_state) != KP_STATE_UPTODATE) return true; + else if (proactive_handshake && + ktime_before(kp->kp_last_alive + + lnet_get_lnd_timeout() * 2, + ktime_get_seconds())) + return true; } else if (ktime_before(kp->kp_hello_ts + lnet_get_lnd_timeout(), ktime_get_seconds())) { /* Sent hello but never received reply */ diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c index 9518672..505577e 100644 --- a/lnet/klnds/kfilnd/kfilnd_tn.c +++ b/lnet/klnds/kfilnd/kfilnd_tn.c @@ -728,7 +728,7 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn, break; case TN_EVENT_RX_OK: - if (kfilnd_peer_needs_hello(tn->tn_kp)) { + if (kfilnd_peer_needs_hello(tn->tn_kp, false)) { rc = kfilnd_send_hello_request(tn->tn_ep->end_dev, tn->tn_ep->end_cpt, tn->tn_kp); -- 1.8.3.1