From 504b0b0b610b178a9fdc13c16dfb381d17d54827 Mon Sep 17 00:00:00 2001
From: Alexandre Ioffe <aioffe@ddn.com>
Date: Wed, 7 Feb 2024 14:12:26 -0800
Subject: [PATCH] EX-9054 lipe: Add SSH stats per agent

- Add stats counters on SSH per agent: error and
  disconnection counters
- Add summary counter on total SSH inactivity
- Disconnect agent when lfs command option request fails
- Report log on INFO level when agent becomes active again
- Fixed minor bugs:
  o memory leak when system error happens in
    pthread_tryjoin_np()
  o Missed stats on job retries

Test-Parameters: trivial testlist=hot-pools
Signed-off-by: Alexandre Ioffe <aioffe@ddn.com>
Change-Id: I35eebf61d35eb913a167ebd795779188a6217dac
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/53957
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
---
 lipe/src/lamigo.c | 147 ++++++++++++++++++++++++++++++++----------------------
 1 file changed, 88 insertions(+), 59 deletions(-)

diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c
index d7d88fa..d5ebc53 100644
--- a/lipe/src/lamigo.c
+++ b/lipe/src/lamigo.c
@@ -204,6 +204,7 @@ struct resync_agent {
 	int rag_index;
 	int rag_maxjobs;
 	int rag_jobs;
+	unsigned long rag_total_jobs;
 	bool rag_bad;
 	bool rag_is_local;
 	bool rag_opts_checked;
@@ -215,6 +216,8 @@ struct resync_agent {
 	pthread_mutex_t rag_ssh_lock;
 	pthread_cond_t rag_ssh_cond;
 	char rag_extra_options[128];
+	unsigned long rag_s_ssh_err;	/* SSH error counter */
+	unsigned long rag_s_inact_cnt;	/* Agent disconnected counter */
 };
 
 struct resync_job;
@@ -226,8 +229,8 @@ struct resync_job {
 	__u64 rj_index;
 	pthread_t rj_pid;
 	int rj_resync;
-	int rj_setprefer;
-	int rj_check_job;
+	bool rj_setprefer;
+	bool rj_check_job;
 	int rj_stripes;
 	int rj_stripe_size;
 	struct pool_list *rj_pool;
@@ -329,6 +332,7 @@ struct stats {
 	unsigned long s_skip_by_source;
 	unsigned long s_extend_by_target;
 	unsigned long s_job_timedout;
+	unsigned long s_all_agents_inact; /* All agents inactive */
 	struct history s_hist[100];
 	int	       s_hist_cur;
 };
@@ -762,19 +766,21 @@ static void lamigo_dump_stats_file(void)
 		lamigo_last_cleared, timestr);
 
 #define AGENT_FMT \
-	"    agent%d: { host: %s, mnt: %s, jobs: %u, max: %u, state: %s }\n"
+	"    agent%d: { host: %s, mnt: %s, jobs: %u, max: %u, state: %s, ssh_err: %lu/%lu, inact_cnt: %lu }\n"
 
 	fprintf(f, "agents:\n");
 	lipe_list_for_each_entry(a, &lamigo_agent_list, rag_list) {
 		fprintf(f, AGENT_FMT, a->rag_index, a->rag_hostname,
 			a->rag_mountpoint, a->rag_jobs, a->rag_maxjobs,
-			a->rag_bad ? "inactive" : "active");
+			a->rag_bad ? "inactive" : "active",
+			a->rag_s_ssh_err, a->rag_total_jobs, a->rag_s_inact_cnt);
 	}
+	fprintf(f, "all_agents_inact: %lu\n", stats.s_all_agents_inact);
 
 	fprintf(f, "jobs:\n");
 	lamigo_dump_jobs(f, &lamigo_job_list);
 
-	fprintf(f, "failed jobs:\n");
+	fprintf(f, "delayed jobs:\n");
 	lamigo_dump_jobs(f, &lamigo_failed_job_list);
 
 	fprintf(f, "history:\n");
@@ -1014,6 +1020,31 @@ out:
 	return rc;
 }
 
+static void check_bad_agent(struct resync_agent *agent, intptr_t retval)
+{
+	if (retval == SSH_ERROR) { /* couldn't connect */
+		LX_ERROR("SSH error on host '%s'\n", agent->rag_hostname);
+		agent->rag_s_ssh_err++;
+	}
+
+	if (agent->rag_bad)
+		return;
+
+	agent->rag_bad = true;
+	assert(lamigo_max_jobs >= agent->rag_maxjobs);
+	lamigo_max_jobs -= agent->rag_maxjobs;
+	agent->rag_opts_checked = false;
+	if (lamigo_max_jobs != 0) {
+		LX_WARN("agent '%s' is bad. New job limit: %lu\n",
+			agent->rag_hostname, lamigo_max_jobs);
+	} else {
+		LX_ERROR("agent '%s' is bad. All agents disconnected\n",
+			 agent->rag_hostname);
+		stats.s_all_agents_inact++;
+	}
+	agent->rag_s_inact_cnt++;
+}
+
 static void lamigo_agent_check_options(struct resync_agent *agent)
 {
 	struct resync_ssh_session *rss;
@@ -1041,7 +1072,10 @@ static void lamigo_agent_check_options(struct resync_agent *agent)
 	}
 
 	if (rc != 0) {
-		LX_ERROR("getting lfs options failed\n");
+		LX_ERROR("getting lfs options on '%s' failed\n",
+			 agent->rag_hostname);
+		check_bad_agent(agent, SSH_ERROR);
+
 		goto out;
 	}
 
@@ -1166,7 +1200,10 @@ static int lamigo_spawn_replication(struct resync_job *rj)
 		}
 	}
 	if (!a) {
-		LX_DEBUG("no good agent\n");
+		/* This must not happen. We never spawn replication
+		 * when no agents available
+		 */
+		LX_ERROR("no good agent\n");
 		rc = -EBUSY;
 		goto out;
 	}
@@ -1184,6 +1221,7 @@ static int lamigo_spawn_replication(struct resync_job *rj)
 
 	rj->rj_pid = pid;
 	a->rag_jobs++;
+	a->rag_total_jobs++;
 	stats.s_spawned++;
 	pthread_setname_np(pid, "lamigo-job");
 	rc = 0;
@@ -1936,13 +1974,14 @@ static void lamigo_check_bad_agents(void)
 		pthread_setname_np(pid, "lamigo-job-ca");
 
 		rj = xcalloc(1, sizeof(*rj));
-		rj->rj_check_job = 1;
+		rj->rj_check_job = true;
 		rj->rj_pid = pid;
 		rj->rj_agent = a;
 		rj->rj_start = time(NULL);
 		rj->rj_pool = NULL;
 		rj->rj_mirror_opts = NULL;
 		a->rag_check_in_progress = true;
+		/* Add to job list, but do not count */
 		lipe_list_add_tail(&rj->rj_list, &lamigo_job_list);
 	}
 }
@@ -1988,7 +2027,7 @@ static void lamigo_schedule_setprefer(struct resync_job *rj, void *cbdata, int r
 
 	srj = xcalloc(1, sizeof(*srj));
 	srj->rj_fid = rj->rj_fid;
-	srj->rj_setprefer = 1;
+	srj->rj_setprefer = true;
 	/* XXX: few src pools? */
 	srj->rj_pool = fast_pools;
 
@@ -2014,7 +2053,7 @@ static int lamigo_update_one(struct fid_rec *f)
 
 	if (are_agents_busy()) {
 		/* all the agents are busy */
-		LX_DEBUG("no agents avilable (max: %d)\n", lamigo_max_jobs);
+		LX_DEBUG("no agents available (max: %d)\n", lamigo_max_jobs);
 		return 1;
 	}
 
@@ -2280,30 +2319,22 @@ static void lamigo_check_and_clear_changelog(void)
 
 static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 {
+	struct resync_agent *agent = rj->rj_agent;
+
 	rj->rj_done_timestamp = time(NULL);
-	rj->rj_try++;
 
 	if (rj->rj_check_job) {
-		rj->rj_agent->rag_check_in_progress = false;
+		agent->rag_check_in_progress = false;
 		if (retval == 0) {
 			/* the agent is back */
-			if (rj->rj_agent->rag_bad) {
-				LX_DEBUG("agent %s is back\n",
-					 rj->rj_agent->rag_hostname);
-				rj->rj_agent->rag_bad = false;
-				lamigo_max_jobs += rj->rj_agent->rag_maxjobs;
+			if (agent->rag_bad) {
+				LX_INFO("agent '%s' is back\n",
+					agent->rag_hostname);
+				agent->rag_bad = false;
+				lamigo_max_jobs += agent->rag_maxjobs;
 			}
 		} else {
-			/* the agent is still bad */
-			if (rj->rj_agent->rag_bad == false) {
-				LX_DEBUG("agent %s is bad\n",
-					 rj->rj_agent->rag_hostname);
-
-				assert(lamigo_max_jobs >= rj->rj_agent->rag_maxjobs);
-				lamigo_max_jobs -= rj->rj_agent->rag_maxjobs;
-				rj->rj_agent->rag_bad = true;
-				rj->rj_agent->rag_opts_checked = false;
-			}
+			check_bad_agent(agent, retval);
 		}
 		free(rj);
 		return;
@@ -2311,15 +2342,11 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 
 	LX_DEBUG("job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n",
 		 rj->rj_pid, PFID(&rj->rj_fid), rj->rj_done_timestamp - rj->rj_start,
-		 retval, rj->rj_agent->rag_bad);
+		 retval, agent->rag_bad);
 
 	if (rj->rj_callback)
 		rj->rj_callback(rj, rj->rj_callback_data, retval);
 
-	assert(rj->rj_agent->rag_jobs > 0);
-	rj->rj_agent->rag_jobs--;
-	lamigo_jobs_running--;
-
 	if (retval == EBUSY) {	/* Must not happen */
 		/*
 		 * the file was busy, there will be another CLOSE
@@ -2327,29 +2354,22 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 		 */
 		stats.s_busy++;
 	} else if (retval == SSH_ERROR || retval == -ETIMEDOUT) {
-		/* couldn't connect */
-		/* XXX: resubmit the job */
-		if (!rj->rj_agent->rag_bad) {
-			rj->rj_agent->rag_bad = true;
-			rj->rj_agent->rag_opts_checked = false;
-			lamigo_max_jobs -= rj->rj_agent->rag_maxjobs;
+		check_bad_agent(agent, retval);
+		if (retval == SSH_ERROR) {
+			/* Resubmit the job */
+			rj->rj_pid = 0;	/* This is used in stats dump */
+			rj->rj_agent = NULL;
+			rj->rj_try++;
+			lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list);
+			lamigo_jobs_delayed++;
+			return;
 		}
-		rj->rj_pid = 0;
-		rj->rj_agent = NULL;
-
 		/*
 		 * if we interrupted the job with ETIMEDOUT, then
 		 * ssh was doing OK and the changelog got new
 		 * OPEN/CLOSE events which will cause another try
 		 */
-		if (retval == -ETIMEDOUT) {
-			stats.s_job_timedout++;
-		} else if (retval == SSH_ERROR) {
-			lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list);
-			lamigo_jobs_delayed++;
-			rj = NULL;
-		}
-
+		stats.s_job_timedout++;
 	} else if (retval == 127) {
 		/* likely invalid setup on the agent (missing lfs?) */
 		/* XXX: resubmit the job */
@@ -2361,8 +2381,7 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 			stats.s_replicated++;
 	}
 
-	if (rj)
-		free(rj);
+	free(rj);
 }
 
 static void lamigo_check_jobs(void)
@@ -2373,13 +2392,25 @@ static void lamigo_check_jobs(void)
 
 	lipe_list_for_each_entry_safe(rj, tmp, &lamigo_job_list, rj_list) {
 		rc = pthread_tryjoin_np(rj->rj_pid, (void **)&retval);
-		if (rc == EBUSY) {
+		if (rc == EBUSY)
 			continue;
-		} else if (rc != 0) {
+
+		lipe_list_del(&rj->rj_list);
+		/* Check agent job is not counted */
+		if (!rj->rj_check_job) {
+			assert(lamigo_jobs_running > 0);
+			lamigo_jobs_running--;
+
+			assert(rj->rj_agent->rag_jobs > 0);
+			rj->rj_agent->rag_jobs--;
+		}
+
+		if (rc != 0) {
+			/* Should not happen. */
 			LX_ERROR("cannot join thread %lld: %s\n",
 				 (long long)rj->rj_pid, strerror(rc));
+			free(rj);
 		} else {
-			lipe_list_del(&rj->rj_list);
 			lamigo_job_fini(rj, retval);
 		}
 	}
@@ -2395,7 +2426,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 
 	if (strcmp(host, "-") == 0) {
 		a->rag_hostname = xstrdup("localhost");
-		a->rag_is_local = 1;
+		a->rag_is_local = true;
 	} else {
 		a->rag_hostname = xstrdup(host);
 	}
@@ -4111,11 +4142,8 @@ static void lamigo_submit_sync(const struct lu_fid *fid, enum amigo_resync_type
 	rj->rj_fid = *fid;
 	rj->rj_stripes = stripes;
 	rj->rj_stripe_size = stripesz;
-	rj->rj_index = 0;
 	rj->rj_resync = sync;
-	rj->rj_check_job = 0;
-	rj->rj_pid = 0;
-	rj->rj_pool = pl;
+	rj->rj_pool = pl->pl_pool;
 	/* only fast pool mirrors should be marked "prefer" */
 	rj->rj_mirror_opts = pl->pl_is_fast ? "prefer" : NULL;
 	rj->rj_callback = lamigo_alr_mirror_cb;
@@ -4123,6 +4151,7 @@ static void lamigo_submit_sync(const struct lu_fid *fid, enum amigo_resync_type
 	rc = lamigo_submit_job(rj);
 	if (rc == 1) {
 		/* probably a dedicated list would be better */
+		rj->rj_try++;
 		lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list);
 		lamigo_jobs_delayed++;
 	}
-- 
1.8.3.1