Whamcloud - gitweb
LU-16214 kfilnd: Proactively handshake old peers 86/48786/4
author Chris Horn <chris.horn@hpe.com>
Mon, 22 Aug 2022 19:43:36 +0000 (13:43 -0600)
committer Oleg Drokin <green@whamcloud.com>
Thu, 19 Jan 2023 15:30:52 +0000 (15:30 +0000)
If asked to send a message to a peer that we haven't communicated with
for some time, then we run the risk of that peer having a stale
(or missing) peer entry for us. This can result in the target peer
silently dropping our message. To reduce the chance of this happening,
proactively handshake any peer we haven't talked to within the last two
LND timeout intervals.

Note that kfilnd_peer_needs_hello() is called on both the send and receive
paths. We only want to proactively handshake on the send path, so an
argument is added to this function so it can distinguish between the
two situations.

HPE-bug-id: LUS-11125
Test-Parameters: trivial
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Iaacb48e5c45305869bd22335ce112b21cf67e848
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/48786
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Ian Ziemba <ian.ziemba@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
Reviewed-by: Ron Gredvig <ron.gredvig@hpe.com>
lnet/klnds/kfilnd/kfilnd.c
lnet/klnds/kfilnd/kfilnd.h
lnet/klnds/kfilnd/kfilnd_tn.c

index e4efdd4..5f8d688 100644 (file)
@@ -159,7 +159,7 @@ static int kfilnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *msg)
                return rc;
        }
 
-       if (kfilnd_peer_needs_hello(tn->tn_kp)) {
+       if (kfilnd_peer_needs_hello(tn->tn_kp, true)) {
                rc = kfilnd_send_hello_request(dev, cpt, tn->tn_kp);
                if (rc && kfilnd_peer_is_new_peer(tn->tn_kp)) {
                        /* Only fail the send if this is a new peer. Otherwise
index 4c21532..b8243d8 100644 (file)
@@ -273,14 +273,24 @@ static inline bool kfilnd_peer_is_new_peer(struct kfilnd_peer *kp)
 /* Peer needs hello if it is not up to date and there is not already a hello
  * in flight.
  *
+ * Called from the send path and the receive path. When called from send path
+ * we additionally consider the peer's last alive value, and proactively
+ * handshake peers that we haven't talked to in a while.
+ *
  * If hello was sent more than LND timeout seconds ago, and we never received a
  * response, then send another one.
  */
-static inline bool kfilnd_peer_needs_hello(struct kfilnd_peer *kp)
+static inline bool kfilnd_peer_needs_hello(struct kfilnd_peer *kp,
+                                          bool proactive_handshake)
 {
        if (atomic_read(&kp->kp_hello_pending) == 0) {
                if (atomic_read(&kp->kp_state) != KP_STATE_UPTODATE)
                        return true;
+               else if (proactive_handshake &&
+                        ktime_before(kp->kp_last_alive +
+                                     lnet_get_lnd_timeout() * 2,
+                                     ktime_get_seconds()))
+                       return true;
        } else if (ktime_before(kp->kp_hello_ts + lnet_get_lnd_timeout(),
                                ktime_get_seconds())) {
                /* Sent hello but never received reply */
index 9518672..505577e 100644 (file)
@@ -728,7 +728,7 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn,
                break;
 
        case TN_EVENT_RX_OK:
-               if (kfilnd_peer_needs_hello(tn->tn_kp)) {
+               if (kfilnd_peer_needs_hello(tn->tn_kp, false)) {
                        rc = kfilnd_send_hello_request(tn->tn_ep->end_dev,
                                                       tn->tn_ep->end_cpt,
                                                       tn->tn_kp);