From 0256f9995bd18e39c378f9582381f7b6c02cf073 Mon Sep 17 00:00:00 2001 From: Alexandre Ioffe Date: Fri, 18 Nov 2022 21:39:08 -0800 Subject: [PATCH] EX-6298 lipe: decrease delay before ALR restart - Decrease delay before restarting access log reader and eliminate this delay when the read from ALR fails due to timeout. Increase SSH poll/read timeout while keep-alive message in ofd_access_log_reader is not implemented This will decrease probability of missing ALR. - Remove excluding hot-pools test_72 Test-Parameters: trivial testlist=hot-pools Signed-off-by: Alexandre Ioffe Change-Id: I36989e9c3fd877aee5ce1cfb8525db8604e666bd Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49196 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger --- lipe/src/lamigo_alr.c | 60 +++++++++++++++++++++-------------------------- lustre/tests/hot-pools.sh | 5 ++-- 2 files changed, 29 insertions(+), 36 deletions(-) diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index c70a774..0bd9525 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -36,6 +36,10 @@ #define LAMIGO_DEBUG_ALR +/* Minimum delay included before each access log reader restart */ +#define PREVENT_SSH_BUSY_LOOP_SEC 5 + +/* periodical message from access_log reader */ #define KEEP_ALIVE_MSG "# keepalive" struct alr_agent { @@ -44,6 +48,7 @@ struct alr_agent { struct lipe_ssh_context ala_ctx; pthread_t ala_pid; time_t ala_last_msg_received; + time_t ala_last_alr_restarted; }; static LIPE_LIST_HEAD(alr_agent_list); @@ -52,8 +57,6 @@ static bool alr_initialized; * ofd_access_log_reader sends "# keepalive" message first time */ static bool restart_log_reader_on_timeout; -/* Wait seconds to restart ssh session when connection to alr fails */ -static int alr_read_restart_delay = 5; /* * all history is broken into 'period': this is needed to limit * number of files to process: heat calculation and sorting. @@ -258,8 +261,9 @@ static int lamigo_alr_parse(const char *host, char *buf, int size, int *received * into the global structure. * * Return: SSH_OK - eof or timeout. Likely OST is down or not mounted. - * Restart slowly - * Otherwise: error. Restart quickly + * Or no ALR update and no keep-alive msg + * from access log reader + * Otherwise: ssh error */ static int lamigo_alr_agent_run(struct alr_agent *ala) { @@ -279,6 +283,7 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) opt.o_alr_cmd, opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); + ala->ala_last_alr_restarted = time(NULL); rc = lipe_ssh_start_cmd_timeout(&ala->ala_ctx, cmd, &channel, 5); if (rc != SSH_OK) { LX_ERROR("cannot start '%s' on host '%s': rc = %d\n", @@ -286,11 +291,12 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) goto out; } - LX_DEBUG("started '%s' on '%s'\n", opt.o_alr_cmd, ala->ala_host); + LX_DEBUG("started '%s' on '%s' at %lu\n", + opt.o_alr_cmd, ala->ala_host, ala->ala_last_alr_restarted); while (ssh_channel_is_open(channel) && !ssh_channel_is_eof(channel)) { /* Wait at max 4*ofd_access_log_reader batch interval */ - const int timeout_sec = opt.o_alr_ofd_interval * 4; + const int timeout_sec = opt.o_alr_ofd_interval * 8; time_t now; rc = ssh_channel_poll_timeout(channel, timeout_sec * 1000, 0); @@ -303,7 +309,6 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) rc = SSH_OK; break; } - if (rc == SSH_AGAIN || /* Bug fixed since libssh 0.9.5 */ rc == SSH_OK) { /* Timeout */ rc = SSH_OK; @@ -316,12 +321,15 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) rc = ssh_channel_read(channel, buffer + offset, sizeof(buffer) - offset, 0); - if (rc == SSH_AGAIN || /* Bug fixed since libssh 0.9.5 */ - rc == SSH_OK) { /* Read timeout or eof. */ - rc = SSH_OK; - if (ssh_channel_is_eof(channel)) + if (rc == SSH_ERROR) { + LX_ERROR("reading ssh '%s' failed\n", ala->ala_host); + break; + } + if (rc == SSH_AGAIN || rc == SSH_OK) { /* Nothing is available or timeout */ + if (ssh_channel_is_eof(channel)) { + rc = SSH_OK; break; - /* Otherwise - timeout */ + } if (restart_log_reader_on_timeout) { /* OST stopped communicating with us - restart slowly */ LX_ERROR("timeout waiting for message from '%s' on '%s'\n", @@ -329,13 +337,8 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) } break; } - if (rc < 0) { - LX_ERROR("reading ssh '%s' failed\n", ala->ala_host); - break; - } offset = lamigo_alr_parse(ala->ala_host, buffer, offset + rc, &received); now = time(NULL); - alr_read_restart_delay = 5; /* Reset restart delay when read successful */ ala->ala_last_msg_received = now; if (now - last_checked > opt.o_progress_interval) { LX_DEBUG("received %d access log records from host '%s'\n", @@ -359,24 +362,15 @@ out: static void *lamigo_alr_data_collection_thread(void *arg) { struct alr_agent *ala = arg; - int rc; while (1) { - rc = lamigo_alr_agent_run(ala); - if (rc == SSH_OK) { - /* Read timeout or eof. - * No access logs were found on the host, - * likely because no OSTs were mounted. - */ - sleep(alr_read_restart_delay); - if (alr_read_restart_delay < 60) - alr_read_restart_delay += 10; - } else { - /* Read error - XXX: should be configurable? - */ - sleep(5); - } + time_t since_last_restart; + + lamigo_alr_agent_run(ala); + since_last_restart = time(NULL) - ala->ala_last_alr_restarted; + + if (since_last_restart < PREVENT_SSH_BUSY_LOOP_SEC) + sleep(PREVENT_SSH_BUSY_LOOP_SEC - since_last_restart); } /* Add a return to make GCC happy. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97793. */ diff --git a/lustre/tests/hot-pools.sh b/lustre/tests/hot-pools.sh index 9f166bb..495f291 100755 --- a/lustre/tests/hot-pools.sh +++ b/lustre/tests/hot-pools.sh @@ -14,7 +14,6 @@ init_logging ALWAYS_EXCEPT="$HOT_POOLS_EXCEPT " always_except EX-3442 15 -always_except EX-6298 72 (( CLIENT_VERSION == MDS1_VERSION )) || skip_env "skipped for interop testing" @@ -2210,7 +2209,7 @@ test_72() { stack_trap stop_lamigo_cmd rm -f $file # if previously test failed to clean up leftover - + echo OSTCOUNT=$OSTCOUNT for ((i = 0; i < $OSTCOUNT; i++)); do $LFS setstripe -c 1 -i $i $file || error "$LFS setstripe $file failed" @@ -2220,7 +2219,7 @@ test_72() { local fid=$($LFS path2fid $file) local mdt_indx=$($LFS getstripe --mdt-index $file) - # echo "File: $file OST=$i Fid:$fid Mdt:$mdt_indx" + echo "File: $file OST=$i Fid:$fid Mdt:$mdt_indx" check_entry $mdt_indx $fid $size -- 1.8.3.1