From 38c1fc88fcd1f4206d6e6a739e58692c5254b1c1 Mon Sep 17 00:00:00 2001
From: Chris Horn
Date: Wed, 23 Nov 2022 10:28:45 -0700
Subject: [PATCH] LU-16643 lnet: Health logging improvements

LNet health activity can generate noise in console logs. The NI/Peer NI
recovery pings could be expected to fail, and the related messages from
lnet_handle_recovery_reply() are generally redundant. Improve this
logging by having the lnet_monitor_thread() provide a summary of NIs in
recovery.

Another useful metric for spotting network trouble is whether messages
are exceeding their deadlines. We do not currently log this
information. Keep a count of messages that have exceeded their deadline
and track the total excess time. The lnet_monitor_thread() will then
provide a summary of the number of such messages and their average
excess time at a regular interval. These stats are reset when the
monitor thread prints this information to the console.

Because NIs can be in recovery for extended periods of time, the
interval between console updates will increase from 1 to 5 minutes. The
interval is reset when it is detected that there are no longer any NIs
in recovery and there haven't been any messages past their deadline
since the last console update.

Lustre-change: https://review.whamcloud.com/50305
Lustre-commit: 0cb3d86c4004d75810c54bb897ad7fbb6d5ec05f
Test-Parameters: trivial
HPE-bug-id: LUS-11500
Signed-off-by: Chris Horn
Change-Id: I4ffffd0412806184282178ce0aca3073dd30d7e0
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/55073
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Cyril Bordage
Reviewed-by: Andreas Dilger
---
 lnet/include/lnet/lib-types.h |  11 ++-
 lnet/lnet/api-ni.c            |   2 +
 lnet/lnet/lib-move.c          | 170 +++++++++++++++++++++++++++++++++++++++---
 lnet/lnet/lib-msg.c           |  16 +++-
 4 files changed, 183 insertions(+), 16 deletions(-)
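For reference, the late-message accounting that the lib-msg.c hunk below
introduces boils down to two global counters that lnet_health_check() bumps
whenever a message completes past its deadline (the patch additionally bails
out of further health handling when the completion status is not OK). A
minimal userspace sketch of that accumulation step, using C11 stdatomic
rather than the kernel atomic_t/atomic64_t and ktime helpers, with
illustrative names (late_msg_count, late_msg_nsecs, note_if_late) that are
not part of the change:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Models the new ln_late_msg_count / ln_late_msg_nsecs fields */
static atomic_int late_msg_count;
static _Atomic int64_t late_msg_nsecs;

/* Called at message completion; both arguments are nanosecond timestamps.
 * Records how far past its deadline the message finished.
 */
static void note_if_late(int64_t now_ns, int64_t deadline_ns)
{
    if (now_ns > deadline_ns) {
        atomic_fetch_add(&late_msg_nsecs, now_ns - deadline_ns);
        atomic_fetch_add(&late_msg_count, 1);
    }
}

int main(void)
{
    /* Two messages finish 1.5s and 0.25s past a 1s deadline */
    note_if_late(2500000000LL, 1000000000LL);
    note_if_late(1250000000LL, 1000000000LL);
    printf("%d late, %lld ns excess in total\n",
           atomic_load(&late_msg_count),
           (long long)atomic_load(&late_msg_nsecs));
    return 0;
}
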
diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h
index b0c8517..2d9c781 100644
--- a/lnet/include/lnet/lib-types.h
+++ b/lnet/include/lnet/lib-types.h
@@ -1223,12 +1223,17 @@ struct lnet {
 	 */
 	struct completion		ln_started;
 
+	/* Number of messages that have exceeded their message deadline */
+	atomic_t			ln_late_msg_count;
+	/* Total amount of time past their deadline for all late messages */
+	atomic64_t			ln_late_msg_nsecs;
+
 	/* for LNDs to signal that ping buffer needs updating */
 	atomic_t			ln_update_ping_buf;
-
+
 	/* workqueue for serving lnd ping buffer update requests */
-	struct workqueue_struct	*ln_pb_update_wq;
-	struct work_struct	ln_pb_update_work;
+	struct workqueue_struct		*ln_pb_update_wq;
+	struct work_struct		ln_pb_update_work;
 	atomic_t			ln_pb_update_ready;
 };
 
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
index e76bf43..1346daf 100644
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -1214,6 +1214,8 @@ lnet_prepare(lnet_pid_t requested_pid)
 	init_waitqueue_head(&the_lnet.ln_dc_waitq);
 	the_lnet.ln_mt_handler = NULL;
 	init_completion(&the_lnet.ln_started);
+	atomic_set(&the_lnet.ln_late_msg_count, 0);
+	atomic64_set(&the_lnet.ln_late_msg_nsecs, 0);
 
 	rc = lnet_slab_setup();
 	if (rc != 0)
diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c
index 61d15ef..d0634fb 100644
--- a/lnet/lnet/lib-move.c
+++ b/lnet/lnet/lib-move.c
@@ -3370,8 +3370,11 @@ lnet_unlink_ni_recovery_mdh_locked(struct lnet_ni *ni, int cpt, bool force)
 	lnet_ni_lock(ni);
 }
 
-static void
-lnet_recover_local_nis(void)
+/* Returns the total number of local NIs in recovery.
+ * Records up to @arrsz of the associated NIDs in the @nidarr array
+ */
+static int
+lnet_recover_local_nis(lnet_nid_t *nidarr, unsigned int arrsz)
 {
 	struct lnet_mt_event_info *ev_info;
 	LIST_HEAD(processed_list);
@@ -3383,6 +3386,7 @@ lnet_recover_local_nis(void)
 	int healthv;
 	int rc;
 	time64_t now;
+	unsigned int nnis = 0;
 
 	/*
 	 * splice the recovery queue on a local queue. We will iterate
@@ -3421,6 +3425,10 @@ lnet_recover_local_nis(void)
 			continue;
 		}
 
+		if (nnis < arrsz)
+			nidarr[nnis] = ni->ni_nid;
+		nnis++;
+
 		/*
 		 * if the local NI failed recovery we must unlink the md.
 		 * But we want to keep the local_ni on the recovery queue
@@ -3531,6 +3539,8 @@ lnet_recover_local_nis(void)
 	lnet_net_lock(0);
 	list_splice(&local_queue, &the_lnet.ln_mt_localNIRecovq);
 	lnet_net_unlock(0);
+
+	return nnis;
 }
 
 static int
@@ -3630,8 +3640,11 @@ lnet_clean_resendqs(void)
 	cfs_percpt_free(the_lnet.ln_mt_resendqs);
 }
 
-static void
-lnet_recover_peer_nis(void)
+/* Returns the total number of peer NIs in recovery.
+ * Records up to @arrsz of the associated NIDs in the @nidarr array
+ */
+static unsigned int
+lnet_recover_peer_nis(lnet_nid_t *nidarr, unsigned int arrsz)
 {
 	struct lnet_mt_event_info *ev_info;
 	LIST_HEAD(processed_list);
@@ -3643,6 +3656,7 @@ lnet_recover_peer_nis(void)
 	int healthv;
 	int rc;
 	time64_t now;
+	unsigned int nlpnis = 0;
 
 	/*
 	 * Always use cpt 0 for locking across all interactions with
@@ -3674,6 +3688,10 @@ lnet_recover_peer_nis(void)
 			continue;
 		}
 
+		if (nlpnis < arrsz)
+			nidarr[nlpnis] = lpni->lpni_nid;
+		nlpnis++;
+
 		/*
 		 * If the peer NI has failed recovery we must unlink the
 		 * md. But we want to keep the peer ni on the recovery
@@ -3756,14 +3774,141 @@ lnet_recover_peer_nis(void)
 			/* Drop the ref taken by lnet_find_peer_ni_locked() */
 			lnet_peer_ni_decref_locked(lpni);
 			lnet_net_unlock(0);
-		} else
+		} else {
 			spin_unlock(&lpni->lpni_lock);
+		}
 	}
 
 	list_splice_init(&processed_list, &local_queue);
 	lnet_net_lock(0);
 	list_splice(&local_queue, &the_lnet.ln_mt_peerNIRecovq);
 	lnet_net_unlock(0);
+
+	return nlpnis;
+}
+
+#define LNET_MAX_NNIDS 20
+/* @nids is array of nids that are in recovery. It has max size of
+ * LNET_MAX_NNIDS.
+ * @nnids is the total number of nids that are in recovery. It can be
+ * larger than LNET_MAX_NNIDS.
+ * @local tells us whether these are local or peer NIs in recovery.
+ */
+static void
+lnet_print_recovery_list(lnet_nid_t *nids, unsigned int nnids,
+			 bool local)
+{
+	static bool printed;
+	char *buf = NULL;
+	char *tmp;
+	int i;
+	unsigned int arrsz;
+	unsigned int bufsz;
+
+	if (!nnids)
+		return;
+
+	arrsz = nnids < LNET_MAX_NNIDS ? nnids : LNET_MAX_NNIDS;
+
+	/* Printing arrsz NIDs, each has max size LNET_NIDSTR_SIZE, a comma
+	 * and space for each nid after the first (2 * (arrsz - 1)),
+	 * + 1 for terminating null byte
+	 */
+	bufsz = (arrsz * LNET_NIDSTR_SIZE) + (2 * (arrsz - 1)) + 1;
+	LIBCFS_ALLOC(buf, bufsz);
+	if (!buf) {
+		LCONSOLE(D_INFO, "%u %s NIs in recovery\n",
+			 nnids, local ? "local" : "peer");
+		return;
+	}
+
+	tmp = buf;
+	tmp += sprintf(tmp, "%s", libcfs_nid2str(nids[0]));
+	for (i = 1; i < arrsz; i++)
+		tmp += sprintf(tmp, ", %s", libcfs_nid2str(nids[i]));
+
+	/* LCONSOLE() used to avoid rate limiting when we have both local
+	 * and peer NIs in recovery
+	 */
+	LCONSOLE(D_INFO, "%u %s NIs in recovery (showing %u): %s\n",
+		 nnids, local ? "local" : "peer", arrsz, buf);
"local" : "peer", arrsz, buf); + + LIBCFS_FREE(buf, bufsz); + + if (!printed && nnids > LNET_MAX_NNIDS) { + LCONSOLE(D_INFO, "See full list with 'lnetctl debug recovery -(p|l)'\n"); + printed = true; + } +} + +static void +lnet_health_update_console(lnet_nid_t *lnids, unsigned int nnis, + lnet_nid_t *rnids, unsigned int nlpnis, + time64_t now) +{ + static time64_t next_ni_update; + static time64_t next_lpni_update; + static time64_t next_msg_update; + static unsigned int num_ni_updates; + static unsigned int num_lpni_updates; + static unsigned int num_msg_updates = 1; + int late_count; + + if (now >= next_ni_update) { + if (nnis) { + lnet_print_recovery_list(lnids, nnis, true); + if (num_ni_updates < 5) + num_ni_updates++; + next_ni_update = now + (60 * num_ni_updates); + } else { + next_ni_update = 0; + num_ni_updates = 0; + } + } + + + if (now >= next_lpni_update) { + if (nlpnis) { + lnet_print_recovery_list(rnids, nlpnis, false); + if (num_lpni_updates < 5) + num_lpni_updates++; + next_lpni_update = now + (60 * num_lpni_updates); + } else { + next_lpni_update = 0; + num_lpni_updates = 0; + } + } + + /* Let late_count accumulate for 60 seconds */ + if (unlikely(!next_msg_update)) + next_msg_update = now + 60; + + if (now >= next_msg_update) { + late_count = atomic_read(&the_lnet.ln_late_msg_count); + + if (late_count) { + s64 avg = atomic64_xchg(&the_lnet.ln_late_msg_nsecs, 0) / + atomic_xchg(&the_lnet.ln_late_msg_count, 0); + + if (avg > NSEC_PER_SEC) { + unsigned int avg_msec; + + avg_msec = do_div(avg, NSEC_PER_SEC) / + NSEC_PER_MSEC; + LCONSOLE_INFO("%u messages in past %us over their deadline by avg %lld.%03us\n", + late_count, + (60 * num_msg_updates), avg, + avg_msec); + + if (num_msg_updates < 5) + num_msg_updates++; + next_msg_update = now + (60 * num_msg_updates); + } + } else { + next_msg_update = now + 60; + num_msg_updates = 1; + } + } } static int @@ -3771,8 +3916,13 @@ lnet_monitor_thread(void *arg) { time64_t rsp_timeout = 0; time64_t now; + unsigned int nnis; + unsigned int nlpnis; + lnet_nid_t local_nids[LNET_MAX_NNIDS]; + lnet_nid_t peer_nids[LNET_MAX_NNIDS]; wait_for_completion(&the_lnet.ln_started); + /* * The monitor thread takes care of the following: * 1. Checks the aliveness of routers @@ -3798,8 +3948,10 @@ lnet_monitor_thread(void *arg) rsp_timeout = now + (lnet_transaction_timeout / 2); } - lnet_recover_local_nis(); - lnet_recover_peer_nis(); + nnis = lnet_recover_local_nis(local_nids, LNET_MAX_NNIDS); + nlpnis = lnet_recover_peer_nis(peer_nids, LNET_MAX_NNIDS); + lnet_health_update_console(local_nids, nnis, peer_nids, nlpnis, + now); lnet_queue_ping_buffer_update(); @@ -3919,7 +4071,7 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, lnet_net_unlock(0); if (status != 0) { - CERROR("local NI (%s) recovery failed with %d\n", + CDEBUG(D_NET, "local NI (%s) recovery failed with %d\n", libcfs_nid2str(nid), status); return; } @@ -3953,7 +4105,7 @@ lnet_handle_recovery_reply(struct lnet_mt_event_info *ev_info, lnet_net_unlock(cpt); if (status != 0) - CERROR("peer NI (%s) recovery failed with %d\n", + CDEBUG(D_NET, "peer NI (%s) recovery failed with %d\n", libcfs_nid2str(nid), status); } } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 82d7a2e..8edab9b 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -790,6 +790,7 @@ lnet_health_check(struct lnet_msg *msg) bool attempt_remote_resend; bool handle_local_health; bool handle_remote_health; + ktime_t now; /* if we're shutting down no point in handling health. 
 	if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING)
@@ -807,10 +808,6 @@ lnet_health_check(struct lnet_msg *msg)
 	    msg->msg_txni->ni_nid == LNET_NID_LO_0)
 		lo = true;
-	if (hstatus != LNET_MSG_STATUS_OK &&
-	    ktime_after(ktime_get(), msg->msg_deadline))
-		return -1;
-
 	/*
 	 * always prefer txni/txpeer if they message is committed for both
 	 * directions.
 	 */
@@ -830,6 +827,17 @@ lnet_health_check(struct lnet_msg *msg)
 	else
 		LASSERT(ni);
 
+	now = ktime_get();
+	if (ktime_after(now, msg->msg_deadline)) {
+		s64 time = ktime_to_ns(ktime_sub(now, msg->msg_deadline));
+
+		atomic64_add(time, &the_lnet.ln_late_msg_nsecs);
+		atomic_inc(&the_lnet.ln_late_msg_count);
+
+		if (hstatus != LNET_MSG_STATUS_OK)
+			return -1;
+	}
+
 	CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
 	       libcfs_nid2str(ni->ni_nid),
 	       (lo) ? "self" : libcfs_nid2str(lpni->lpni_nid),
-- 
1.8.3.1
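
For reference, the reporting side added in lnet_health_update_console()
reduces to draining those two counters, printing the average lateness, and
backing the report interval off from one minute up to a five-minute cap while
late messages keep appearing. A simplified userspace sketch of that cadence
(same C11 stdatomic model and illustrative names as the sketch above; unlike
the patch it does not suppress reports whose average is below one second):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC  1000000000LL
#define NSEC_PER_MSEC 1000000LL

static atomic_int late_msg_count;
static _Atomic int64_t late_msg_nsecs;

/* Called once per monitor wakeup with the current time in seconds.
 * Prints at most one summary per interval; the interval grows from 60s to a
 * 300s cap while messages keep missing their deadlines, and resets otherwise.
 */
static void report_late_msgs(int64_t now_sec)
{
    static int64_t next_update;
    static unsigned int num_updates = 1;
    int late;

    if (now_sec < next_update)
        return;

    late = atomic_exchange(&late_msg_count, 0);
    if (late) {
        int64_t avg = atomic_exchange(&late_msg_nsecs, 0) / late;

        printf("%d messages in past %us over their deadline by avg %lld.%03llds\n",
               late, 60 * num_updates,
               (long long)(avg / NSEC_PER_SEC),
               (long long)((avg % NSEC_PER_SEC) / NSEC_PER_MSEC));
        if (num_updates < 5)
            num_updates++;
    } else {
        num_updates = 1;
    }
    next_update = now_sec + 60 * num_updates;
}

int main(void)
{
    /* One message finished 2s late during the first 60s window */
    atomic_fetch_add(&late_msg_nsecs, 2 * NSEC_PER_SEC);
    atomic_fetch_add(&late_msg_count, 1);
    report_late_msgs(60);
    return 0;
}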