From bd54e2b52b955d7f1c5423c2bc9a206976b3e317 Mon Sep 17 00:00:00 2001 From: Alexandre Ioffe Date: Thu, 10 Nov 2022 11:42:33 -0800 Subject: [PATCH] EX-6298 lipe: Decrease wait time to reconnect to ALR 1) Make the delay between reconnection attempts to ALR increase gradually, starting from as little as 5 seconds, when the ssh session to ALR fails. This makes reconnection attempts more frequent initially. 2) Enable hot-pools test 72, which was previously excepted. Signed-off-by: Alexandre Ioffe Test-Parameters: trivial testlist=hot-pools mdtcount=6 env=ONLY=72,ONLY_REPEAT=40 Change-Id: Iafae62d733390f92370f5d224830944f285da934 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49106 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lipe/src/lamigo_alr.c | 10 +++++++--- lustre/tests/hot-pools.sh | 1 - 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index 0ddc7de..c70a774 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -52,7 +52,8 @@ static bool alr_initialized; * ofd_access_log_reader sends "# keepalive" message first time */ static bool restart_log_reader_on_timeout; - +/* Wait seconds to restart ssh session when connection to alr fails */ +static int alr_read_restart_delay = 5; /* * all history is broken into 'period': this is needed to limit * number of files to process: heat calculation and sorting. @@ -334,6 +335,7 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) } offset = lamigo_alr_parse(ala->ala_host, buffer, offset + rc, &received); now = time(NULL); + alr_read_restart_delay = 5; /* Reset restart delay when read successful */ ala->ala_last_msg_received = now; if (now - last_checked > opt.o_progress_interval) { LX_DEBUG("received %d access log records from host '%s'\n", @@ -362,11 +364,13 @@ static void *lamigo_alr_data_collection_thread(void *arg) while (1) { rc = lamigo_alr_agent_run(ala); if (rc == SSH_OK) { - /* Read timeout or eof + /* Read timeout or eof. 
* No access logs were found on the host, * likely because no OSTs were mounted. */ - sleep(60); + sleep(alr_read_restart_delay); + if (alr_read_restart_delay < 60) + alr_read_restart_delay += 10; } else { /* Read error XXX: should be configurable? diff --git a/lustre/tests/hot-pools.sh b/lustre/tests/hot-pools.sh index 9f166bb..b0c10ab 100755 --- a/lustre/tests/hot-pools.sh +++ b/lustre/tests/hot-pools.sh @@ -14,7 +14,6 @@ init_logging ALWAYS_EXCEPT="$HOT_POOLS_EXCEPT " always_except EX-3442 15 -always_except EX-6298 72 (( CLIENT_VERSION == MDS1_VERSION )) || skip_env "skipped for interop testing" -- 1.8.3.1