From c64484bb87eab9834e825927fbf18e658a2a8d57 Mon Sep 17 00:00:00 2001
From: Chris Horn
Date: Fri, 28 Oct 2022 16:27:17 -0600
Subject: [PATCH] LU-16451 kfilnd: Throttle traffic to down peers

If a transaction fails with -EHOSTUNREACH then this suggests the
target is actually down. We want to avoid consuming resources on the
local NIC by trying to send messages to down peers, so we require a
hello handshake before injecting any other new messages to a peer we
suspect is down.

Introduce a new kfilnd_peer state, KP_STATE_DOWN. Peers in either
KP_STATE_UPTODATE or KP_STATE_STALE can transition to KP_STATE_DOWN.
We transition a peer to KP_STATE_DOWN if we fail a transaction with it
and the errno we get is either -EHOSTUNREACH or -ENOTCONN.
kfilnd_peer_down() transitions a peer to KP_STATE_DOWN as appropriate.

As with stale peers, if we continue to fail transactions with down
peers then we eventually want to purge them from the peer cache. This
purge logic is moved out of kfilnd_peer_stale() into a new function,
kfilnd_peer_purge_old_peer(), which is called by both
kfilnd_peer_stale() and kfilnd_peer_down().

Introduce kfilnd_peer_needs_throttle(), which determines whether we
should queue a message for future replay pending a successful
handshake. Integrate this into kfilnd_tn_state_idle() so that we
queue messages for peers in KP_STATE_DOWN in addition to peers in
KP_STATE_NEW. Modify the debug statements in this area to remove
redundant info and to reflect that we can hit these conditions for
down peers, not just new peers.

Introduce kfilnd_peer_tn_failed() to interpret the errno for a
transaction failure and call kfilnd_peer_down() or kfilnd_peer_stale()
as appropriate. This function replaces all existing calls to
kfilnd_peer_stale(), which is now a static function.

HPE-bug-id: LUS-11314
Test-Parameters: trivial
Signed-off-by: Chris Horn
Change-Id: I206075c3a1b2836715dc79b49b098dab51c6bb94
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/49591
Reviewed-by: Ron Gredvig
Reviewed-by: Ian Ziemba
Reviewed-by: Oleg Drokin
Tested-by: jenkins
Tested-by: Maloo
---
 lnet/klnds/kfilnd/kfilnd.h      |  9 +++++
 lnet/klnds/kfilnd/kfilnd_peer.c | 75 ++++++++++++++++++++++++++++++++++-------
 lnet/klnds/kfilnd/kfilnd_peer.h |  2 +-
 lnet/klnds/kfilnd/kfilnd_tn.c   | 35 +++++++++----------
 4 files changed, 88 insertions(+), 33 deletions(-)

diff --git a/lnet/klnds/kfilnd/kfilnd.h b/lnet/klnds/kfilnd/kfilnd.h
index b8243d8..3173959 100644
--- a/lnet/klnds/kfilnd/kfilnd.h
+++ b/lnet/klnds/kfilnd/kfilnd.h
@@ -227,6 +227,8 @@ struct kfilnd_ep {
 #define KP_STATE_UPTODATE 0x2
 /* Peer experienced some sort of network failure */
 #define KP_STATE_STALE 0x3
+/* We suspect this peer is actually down or otherwise unreachable */
+#define KP_STATE_DOWN 0x4
 
 struct kfilnd_peer {
 	struct rhash_head kp_node;
@@ -270,6 +272,13 @@ static inline bool kfilnd_peer_is_new_peer(struct kfilnd_peer *kp)
 	return atomic_read(&kp->kp_state) == KP_STATE_NEW;
 }
 
+static inline bool kfilnd_peer_needs_throttle(struct kfilnd_peer *kp)
+{
+	unsigned int kp_state = atomic_read(&kp->kp_state);
+
+	return (kp_state == KP_STATE_NEW || kp_state == KP_STATE_DOWN);
+}
+
 /* Peer needs hello if it is not up to date and there is not already a hello
  * in flight.
 *
diff --git a/lnet/klnds/kfilnd/kfilnd_peer.c b/lnet/klnds/kfilnd/kfilnd_peer.c
index d65a21b..412d2ee 100644
--- a/lnet/klnds/kfilnd/kfilnd_peer.c
+++ b/lnet/klnds/kfilnd/kfilnd_peer.c
@@ -56,20 +56,14 @@ static void kfilnd_peer_free(void *ptr, void *arg)
 }
 
 /**
- * kfilnd_peer_stale() - Mark a peer as stale.
- * @kp: Peer to be marked stale
- * Note: only "up-to-date" peers can be marked stale. If we haven't completed
- * a transaction with this peer within 5 LND timeouts then delete this peer.
+ * kfilnd_peer_purge_old_peer() - Delete the specified peer from the cache
+ * if we haven't heard from it within 5x LND timeouts.
+ * @kp: The peer to be checked or purged
  */
-void kfilnd_peer_stale(struct kfilnd_peer *kp)
+static void kfilnd_peer_purge_old_peer(struct kfilnd_peer *kp)
 {
-	if (atomic_cmpxchg(&kp->kp_state,
-			   KP_STATE_UPTODATE,
-			   KP_STATE_STALE) == KP_STATE_UPTODATE) {
-		CDEBUG(D_NET, "%s(%p):0x%llx is stale\n",
-		       libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
-	} else if (ktime_before(kp->kp_last_alive + lnet_get_lnd_timeout() * 5,
-				ktime_get_seconds())) {
+	if (ktime_after(ktime_get_seconds(),
+			kp->kp_last_alive + (lnet_get_lnd_timeout() * 5))) {
 		CDEBUG(D_NET,
 		       "Haven't heard from %s(%p):0x%llx in %lld seconds\n",
 		       libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr,
@@ -79,6 +73,63 @@ void kfilnd_peer_stale(struct kfilnd_peer *kp)
 }
 
 /**
+ * kfilnd_peer_stale() - Mark a peer as stale. If the peer is already stale then
+ * check whether it should be deleted.
+ * @kp: Peer to be marked stale
+ * Note: only "up-to-date" peers can be marked stale.
+ */
+static void kfilnd_peer_stale(struct kfilnd_peer *kp)
+{
+	if (atomic_cmpxchg(&kp->kp_state,
+			   KP_STATE_UPTODATE,
+			   KP_STATE_STALE) == KP_STATE_UPTODATE) {
+		CDEBUG(D_NET, "%s(%p):0x%llx uptodate -> stale\n",
+		       libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
+	} else {
+		kfilnd_peer_purge_old_peer(kp);
+	}
+}
+
+/**
+ * kfilnd_peer_down() - Mark a peer as down. If the peer is already down then
+ * check whether it should be deleted.
+ * @kp: Peer to be marked down
+ * Note: Only peers that are "up-to-date" or "stale" can be marked down.
+ */
+static void kfilnd_peer_down(struct kfilnd_peer *kp)
+{
+	if (atomic_read(&kp->kp_state) == KP_STATE_DOWN) {
+		kfilnd_peer_purge_old_peer(kp);
+	} else if (atomic_cmpxchg(&kp->kp_state,
+				  KP_STATE_UPTODATE,
+				  KP_STATE_DOWN) == KP_STATE_UPTODATE) {
+		CDEBUG(D_NET, "%s(%p):0x%llx uptodate -> down\n",
+		       libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
+	} else if (atomic_cmpxchg(&kp->kp_state,
+				  KP_STATE_STALE,
+				  KP_STATE_DOWN) == KP_STATE_STALE) {
+		CDEBUG(D_NET, "%s(%p):0x%llx stale -> down\n",
+		       libcfs_nid2str(kp->kp_nid), kp, kp->kp_addr);
+	}
+}
+
+/**
+ * kfilnd_peer_tn_failed() - A transaction with this peer has failed. Mark the
+ * peer as either stale or down depending on the provided error value.
+ * @kp: The peer to be marked down or stale
+ * @error: An errno indicating why the transaction failed.
+ * Note: We currently only consider EHOSTUNREACH which corresponds to
+ * C_RC_UNDELIVERABLE, and ENOTCONN which corresponds to C_RC_VNI_NOT_FOUND.
+ */
+void kfilnd_peer_tn_failed(struct kfilnd_peer *kp, int error)
+{
+	if (error == -EHOSTUNREACH || error == -ENOTCONN)
+		kfilnd_peer_down(kp);
+	else
+		kfilnd_peer_stale(kp);
+}
+
+/**
  * kfilnd_peer_del() - Mark a peer for deletion
  * @kp: Peer to be deleted
  */
diff --git a/lnet/klnds/kfilnd/kfilnd_peer.h b/lnet/klnds/kfilnd/kfilnd_peer.h
index 612c883..963d802 100644
--- a/lnet/klnds/kfilnd/kfilnd_peer.h
+++ b/lnet/klnds/kfilnd/kfilnd_peer.h
@@ -34,7 +34,6 @@
 
 #include "kfilnd.h"
 
-void kfilnd_peer_stale(struct kfilnd_peer *kp);
 void kfilnd_peer_del(struct kfilnd_peer *kp);
 void kfilnd_peer_put(struct kfilnd_peer *kp);
 struct kfilnd_peer *kfilnd_peer_get(struct kfilnd_dev *dev, lnet_nid_t nid);
@@ -44,5 +43,6 @@ void kfilnd_peer_init(struct kfilnd_dev *dev);
 kfi_addr_t kfilnd_peer_get_kfi_addr(struct kfilnd_peer *kp);
 u16 kfilnd_peer_target_rx_base(struct kfilnd_peer *kp);
 void kfilnd_peer_process_hello(struct kfilnd_peer *kp, struct kfilnd_msg *msg);
+void kfilnd_peer_tn_failed(struct kfilnd_peer *kp, int error);
 
 #endif /* _KFILND_PEER_ */
diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c
index 9666d85..a442b70 100644
--- a/lnet/klnds/kfilnd/kfilnd_tn.c
+++ b/lnet/klnds/kfilnd/kfilnd_tn.c
@@ -623,7 +623,7 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn,
 	/* For new peers, send a hello request message and queue the true LNet
 	 * message for replay.
 	 */
-	if (kfilnd_peer_is_new_peer(tn->tn_kp) &&
+	if (kfilnd_peer_needs_throttle(tn->tn_kp) &&
 	    (event == TN_EVENT_INIT_IMMEDIATE || event == TN_EVENT_INIT_BULK)) {
 		if (kfilnd_peer_deleted(tn->tn_kp)) {
 			/* We'll assign a NETWORK_TIMEOUT message health status
@@ -631,23 +631,19 @@ static int kfilnd_tn_state_idle(struct kfilnd_transaction *tn,
 			 * for removal
 			 */
 			rc = -ESTALE;
-			KFILND_TN_DEBUG(tn,
-					"Dropping message to stale peer %s\n",
-					libcfs_nid2str(tn->tn_kp->kp_nid));
+			KFILND_TN_DEBUG(tn, "Drop message to deleted peer");
 		} else if (kfilnd_peer_needs_hello(tn->tn_kp, false)) {
-			/* This transaction was setup against a new peer, which
-			 * implies a HELLO was sent. If a HELLO is no longer
-			 * in flight then that means it has failed, and we
-			 * should cancel this TN. Otherwise we are stuck
-			 * waiting for the TN deadline.
+			/* We're throttling transactions to this peer until
+			 * a handshake can be completed, but there is no HELLO
+			 * currently in flight. This implies the HELLO has
+			 * failed, and we should cancel this TN. Otherwise we
+			 * are stuck waiting for the TN deadline.
 			 *
 			 * We assign NETWORK_TIMEOUT health status below because
 			 * we do not know why the HELLO failed.
 			 */
 			rc = -ECANCELED;
-			KFILND_TN_DEBUG(tn,
-					"Peer is new but there is no outstanding hello %s\n",
-					libcfs_nid2str(tn->tn_kp->kp_nid));
+			KFILND_TN_DEBUG(tn, "Cancel throttled TN");
 		} else if (ktime_after(tn->deadline, ktime_get_seconds())) {
 			/* If transaction deadline has not been met, return
 			 * -EAGAIN. This will cause this transaction event to be
 			 * replayed. During this time, an async message from the
 			 * peer should occur at which point the kfilnd version
 			 * should be negotiated.
 			 */
-			KFILND_TN_DEBUG(tn, "%s hello response pending",
-					libcfs_nid2str(tn->tn_kp->kp_nid));
+			KFILND_TN_DEBUG(tn, "hello response pending");
 			return -EAGAIN;
 		} else {
 			rc = -ETIMEDOUT;
@@ -887,7 +882,7 @@ static int kfilnd_tn_state_imm_send(struct kfilnd_transaction *tn,
 			hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
 
 		kfilnd_tn_status_update(tn, status, hstatus);
-		kfilnd_peer_stale(tn->tn_kp);
+		kfilnd_peer_tn_failed(tn->tn_kp, status);
 		if (tn->msg_type == KFILND_MSG_HELLO_REQ)
 			kfilnd_peer_clear_hello_pending(tn->tn_kp);
 		break;
@@ -1069,7 +1064,7 @@ static int kfilnd_tn_state_wait_comp(struct kfilnd_transaction *tn,
 			hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
 
 		kfilnd_tn_status_update(tn, status, hstatus);
-		kfilnd_peer_stale(tn->tn_kp);
+		kfilnd_peer_tn_failed(tn->tn_kp, status);
 
 		/* Need to cancel the tagged receive to prevent resources from
 		 * being leaked.
@@ -1153,7 +1148,7 @@ static int kfilnd_tn_state_wait_tag_rma_comp(struct kfilnd_transaction *tn,
 		hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
 
 		kfilnd_tn_status_update(tn, status, hstatus);
-		kfilnd_peer_stale(tn->tn_kp);
+		kfilnd_peer_tn_failed(tn->tn_kp, status);
 		break;
 
 	default:
@@ -1234,7 +1229,7 @@ static int kfilnd_tn_state_wait_tag_comp(struct kfilnd_transaction *tn,
 			hstatus = LNET_MSG_STATUS_REMOTE_ERROR;
 
 		kfilnd_tn_status_update(tn, status, hstatus);
-		kfilnd_peer_stale(tn->tn_kp);
+		kfilnd_peer_tn_failed(tn->tn_kp, status);
 		break;
 
 	case TN_EVENT_TAG_TX_OK:
@@ -1260,7 +1255,7 @@ static int kfilnd_tn_state_fail(struct kfilnd_transaction *tn,
 
 	switch (event) {
 	case TN_EVENT_TX_FAIL:
-		kfilnd_peer_stale(tn->tn_kp);
+		kfilnd_peer_tn_failed(tn->tn_kp, status);
 		break;
 
 	case TN_EVENT_TX_OK:
@@ -1292,7 +1287,7 @@ static int kfilnd_tn_state_wait_timeout_tag_comp(struct kfilnd_transaction *tn,
 	case TN_EVENT_TAG_RX_CANCEL:
 		kfilnd_tn_status_update(tn, -ETIMEDOUT,
 					LNET_MSG_STATUS_REMOTE_TIMEOUT);
-		kfilnd_peer_stale(tn->tn_kp);
+		kfilnd_peer_tn_failed(tn->tn_kp, -ETIMEDOUT);
 		break;
 
 	case TN_EVENT_TAG_RX_FAIL:
-- 
1.8.3.1
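
For illustration, a minimal standalone sketch of the peer-state policy
this patch introduces. This is a hedged model, not kfilnd code: the
KP_STATE_* values and transition rules mirror the patch, but the
atomics, peer cache, and purge timing are simplified away, and the
function names and main() driver are hypothetical.

/* Standalone userspace model of the kfilnd peer states added above. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

enum kp_state {
	KP_STATE_NEW      = 0x1,
	KP_STATE_UPTODATE = 0x2,
	KP_STATE_STALE    = 0x3,
	KP_STATE_DOWN     = 0x4,
};

/* New and down peers must complete a hello handshake before more
 * messages are injected; transactions to them are queued for replay.
 */
static bool peer_needs_throttle(enum kp_state state)
{
	return state == KP_STATE_NEW || state == KP_STATE_DOWN;
}

/* Model of kfilnd_peer_tn_failed(): -EHOSTUNREACH/-ENOTCONN suggest the
 * peer is down (only up-to-date or stale peers transition there); any
 * other failure marks an up-to-date peer stale. A peer that is already
 * down or stale would instead be checked for purging from the cache.
 */
static enum kp_state peer_tn_failed(enum kp_state state, int error)
{
	if (error == -EHOSTUNREACH || error == -ENOTCONN) {
		if (state == KP_STATE_UPTODATE || state == KP_STATE_STALE)
			return KP_STATE_DOWN;
	} else if (state == KP_STATE_UPTODATE) {
		return KP_STATE_STALE;
	}
	return state;
}

int main(void)
{
	enum kp_state s = KP_STATE_UPTODATE;

	s = peer_tn_failed(s, -EIO);          /* uptodate -> stale */
	s = peer_tn_failed(s, -EHOSTUNREACH); /* stale -> down */
	printf("throttle new sends: %s\n",
	       peer_needs_throttle(s) ? "yes" : "no");
	return 0;
}

Dispatching on the errno in one place keeps the stale-versus-down
policy out of the transaction state machine, which only has to report
the failure status it observed.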