From 51ec673af118019c69ea63196301ae5bf3126149 Mon Sep 17 00:00:00 2001 From: Alexandre Ioffe Date: Tue, 29 Mar 2022 00:48:35 -0700 Subject: [PATCH] EX-4141 lipe: lamigo should detect dead OST and restart ALR Use '# keepalive' message and ssh read with timeout to detect OST is down and restart ALR. Add stats for ALR last seen message To make lamigo compatible with older ofd_access_log_reader lamigo can work in two modes: 1. lamigo does not expect '# keepalive' message. In this case after timeout it will restart ofd_access_log_reader silently 2. lamigo expects periodical # keepalive message. If lamigo does not receive keepalive message or any other message from ofd_access_log_reader within timeout it reports error message and restarts ofd_access_log_reader. lamigo switches from 1 to 2 once it receives '# keepalive' message Signed-off-by: Alexandre Ioffe Test-Parameters: trivial testlist=hot-pools Change-Id: I55bc92b03ef5b45b72ff59ffd4b450cd1927cdb0 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/48647 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lipe/.gitignore | 1 + lipe/src/lamigo.c | 11 +++++ lipe/src/lamigo.h | 2 + lipe/src/lamigo_alr.c | 126 +++++++++++++++++++++++++++++++++++++++----------- 4 files changed, 113 insertions(+), 27 deletions(-) diff --git a/lipe/.gitignore b/lipe/.gitignore index 0fb68d0..07d48e8 100644 --- a/lipe/.gitignore +++ b/lipe/.gitignore @@ -11,6 +11,7 @@ /src/*.o /src/ext4_inode2path /src/generate_definition +/src/lamigo /src/laudit /src/laudit-report /src/ldumpstripe diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index 227ca7b..ae06e96 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -84,6 +84,7 @@ #define DEF_COOLDOWN_K 15 /* how fast to cool down / 100 */ #define DEF_HOT_FRACTION 20 /* %% of the hottest's heat - hot files */ #define DEF_HOT_AFTER_IDLE 3 +#define DEF_ALR_CMD "ofd_access_log_reader" #define DEF_ALR_EXTRA_ARGS "--exit-on-close" #define DEF_STATFS_REFRESH_INTV 5 /* OST statfs update interval, in seconds */ #define DEF_FAST_POOL_MAX_USED 30 /* open for migration if % space used is less than */ @@ -129,6 +130,7 @@ static void usage(void) "\t-v, --verbose, produce more verbose ouput\n" "\t-w, --dump, stats file (via USR1 signal, default: %s)\n" "\t-W, --heat-dump, heat table file (via USR2 signal, default: %s)\n" + "\t--alr-cmd=CMD, ALR command (default: '%s')\n" "\t--alr-extra-args=ARGS, extra arguments for ALR (default: '%s')\n" "\t--fast-pool=POOL (default '%s')\n" "\t--fast-pool-max-used=MAX stop mirroring to POOL when %% used reaches MAX (default %d)\n" @@ -160,6 +162,7 @@ static void usage(void) LAMIGO_DUMPFILE, LAMIGO_JSONFILE, LAMIGO_HEAT_FILE, + DEF_ALR_CMD, DEF_ALR_EXTRA_ARGS, DEF_FAST_POOL, DEF_FAST_POOL_MAX_USED, @@ -251,6 +254,7 @@ struct options opt = { .o_slow_pool_max_used = DEF_SLOW_POOL_MAX_USED, .o_progress_interval = DEF_PROGRESS_INTV, .o_ssh_exec_to = DEF_SSH_EXEC_TO_SECS, + .o_alr_cmd = DEF_ALR_CMD, .o_alr_extra_args = DEF_ALR_EXTRA_ARGS, .o_alr_periods = DEF_ALR_PERIODS, .o_alr_period_time = DEF_ALR_PERIOD_SECS, @@ -719,6 +723,8 @@ static void lamigo_dump_stats_file(void) fprintf(f, "history:\n"); lamigo_dump_history(f); + lamigo_alr_dump_stat(f); + fflush(f); fclose(f); } @@ -1904,6 +1910,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) } enum { + LAMIGO_OPT_ALR_CMD, LAMIGO_OPT_ALR_EXTRA_ARGS, LAMIGO_OPT_FAST_POOL, LAMIGO_OPT_FAST_POOL_MAX_USED, @@ -1934,6 +1941,7 @@ enum { static struct option options[] = { { "agent", required_argument, NULL, 'g'}, + { "alr-cmd", required_argument, NULL, LAMIGO_OPT_ALR_CMD }, { "alr-extra-args", required_argument, NULL, LAMIGO_OPT_ALR_EXTRA_ARGS}, { "config", required_argument, NULL, 'f' }, { "debug", optional_argument, NULL, 'b' }, @@ -2207,6 +2215,9 @@ static void lamigo_process_opt(int c, char *optarg) if (*endptr != '\0' || opt.o_progress_interval < 1) LX_FATAL("invalid progress interval '%s'\n", optarg); break; + case LAMIGO_OPT_ALR_CMD: + opt.o_alr_cmd = optarg; + break; case LAMIGO_OPT_ALR_EXTRA_ARGS: opt.o_alr_extra_args = optarg; break; diff --git a/lipe/src/lamigo.h b/lipe/src/lamigo.h index 883c7e8..b0f3279 100644 --- a/lipe/src/lamigo.h +++ b/lipe/src/lamigo.h @@ -12,6 +12,7 @@ void lamigo_add_alr_agent(const char *host); void lamigo_alr_init(void); +void lamigo_alr_dump_stat(FILE *f); enum alr_pool { ALR_FAST = 0, @@ -91,6 +92,7 @@ struct options { int o_slow_pool_max_used; int o_progress_interval; /* how often to show progress */ long o_ssh_exec_to; /* ssh connection timeout for exec command */ + char *o_alr_cmd; char *o_alr_extra_args; int o_alr_periods; int o_alr_period_time; diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index 6036bcd..0ddc7de 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -36,14 +36,22 @@ #define LAMIGO_DEBUG_ALR +#define KEEP_ALIVE_MSG "# keepalive" + struct alr_agent { char *ala_host; struct lipe_list_head ala_list; struct lipe_ssh_context ala_ctx; pthread_t ala_pid; + time_t ala_last_msg_received; }; static LIPE_LIST_HEAD(alr_agent_list); +static bool alr_initialized; +/* This option is activated when + * ofd_access_log_reader sends "# keepalive" message first time + */ +static bool restart_log_reader_on_timeout; /* * all history is broken into 'period': this is needed to limit @@ -170,6 +178,16 @@ static void lamigo_alr_parse_one(const char *host, const char *line) char ostname[256]; char rw; + /* Switch to new protocol "# keepalive" */ + if (strncmp(line, KEEP_ALIVE_MSG, sizeof(KEEP_ALIVE_MSG)) == 0) { + /* Found "# keepalive" */ + restart_log_reader_on_timeout = true; + return; + } + + if (line[0] == '#') /* Line started with '#' to be ignored */ + return; + rc = sscanf(line, "o=%s f=["SFID"] t=%lld b=%llu e=%llu s=%llu " "g=%llu n=%llu d=%c\n", ostname, RFID(&fid), ×tamp, &begin, &end, &iosize, &segments, &count, &rw); @@ -236,13 +254,12 @@ static int lamigo_alr_parse(const char *host, char *buf, int size, int *received * * it initiates ssh session with running ofd_access_batch and gets * data every N seconds. then the data are processed and inserted - * into the global structure . + * into the global structure. * - * FIXME Do a read with timeout to detect silently disconnected - * peers. It would be nice to use ssh_keepalive_send() but I cannot - * make this work. Consider having ofd_access_log_reader send a line - * that we ignore at the start of every batch (even if the batch is - * empty). */ + * Return: SSH_OK - eof or timeout. Likely OST is down or not mounted. + * Restart slowly + * Otherwise: error. Restart quickly + */ static int lamigo_alr_agent_run(struct alr_agent *ala) { ssh_channel channel = NULL; @@ -257,41 +274,74 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) * lipe_ssh_exec(). */ snprintf(cmd, sizeof(cmd), - "ofd_access_log_reader -i %d -I %d %s 2> /dev/null", - opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); + "%s -i %d -I %d %s 2> /dev/null", + opt.o_alr_cmd, opt.o_alr_ofd_interval, + mdtidx, opt.o_alr_extra_args); rc = lipe_ssh_start_cmd_timeout(&ala->ala_ctx, cmd, &channel, 5); if (rc != SSH_OK) { - LX_ERROR("cannot start access log reader agent on host '%s': rc = %d\n", - ala->ala_host, rc); + LX_ERROR("cannot start '%s' on host '%s': rc = %d\n", + cmd, ala->ala_host, rc); goto out; } - LX_DEBUG("started access log reader agent on '%s'\n", ala->ala_host); + LX_DEBUG("started '%s' on '%s'\n", opt.o_alr_cmd, ala->ala_host); while (ssh_channel_is_open(channel) && !ssh_channel_is_eof(channel)) { + /* Wait at max 4*ofd_access_log_reader batch interval */ + const int timeout_sec = opt.o_alr_ofd_interval * 4; time_t now; - rc = ssh_channel_read_timeout(channel, buffer + offset, - sizeof(buffer) - offset, 0, - -1); /* SSH_TIMEOUT_INFINITE Internally */ - if (rc <= 0) + rc = ssh_channel_poll_timeout(channel, timeout_sec * 1000, 0); + if (rc == SSH_ERROR) { + LX_ERROR("polling ssh channel from '%s' failed\n", + ala->ala_host); + break; + } + if (rc == SSH_EOF) { + rc = SSH_OK; + break; + } + + if (rc == SSH_AGAIN || /* Bug fixed since libssh 0.9.5 */ + rc == SSH_OK) { /* Timeout */ + rc = SSH_OK; + if (restart_log_reader_on_timeout) { + LX_ERROR("waiting message from '%s' exceeded %dsec\n", + ala->ala_host, timeout_sec); + } + break; + } + + rc = ssh_channel_read(channel, buffer + offset, + sizeof(buffer) - offset, 0); + if (rc == SSH_AGAIN || /* Bug fixed since libssh 0.9.5 */ + rc == SSH_OK) { /* Read timeout or eof. */ + rc = SSH_OK; + if (ssh_channel_is_eof(channel)) + break; + /* Otherwise - timeout */ + if (restart_log_reader_on_timeout) { + /* OST stopped communicating with us - restart slowly */ + LX_ERROR("timeout waiting for message from '%s' on '%s'\n", + opt.o_alr_cmd, ala->ala_host); + } break; + } + if (rc < 0) { + LX_ERROR("reading ssh '%s' failed\n", ala->ala_host); + break; + } offset = lamigo_alr_parse(ala->ala_host, buffer, offset + rc, &received); now = time(NULL); + ala->ala_last_msg_received = now; if (now - last_checked > opt.o_progress_interval) { LX_DEBUG("received %d access log records from host '%s'\n", received, ala->ala_host); received = 0; last_checked = now; } - } - - rc = ssh_channel_get_exit_status(channel); - if (rc == 0) - LX_DEBUG("access log reader on '%s' exited with status 0\n", ala->ala_host); - else - LX_ERROR("access log reader on '%s' terminated with status %d\n", ala->ala_host, rc); + } /* while */ out: if (channel != NULL) { @@ -302,7 +352,7 @@ out: ssh_channel_free(channel); return rc; -} +} /* lamigo_alr_agent_run */ static void *lamigo_alr_data_collection_thread(void *arg) { @@ -311,12 +361,16 @@ static void *lamigo_alr_data_collection_thread(void *arg) while (1) { rc = lamigo_alr_agent_run(ala); - if (rc == 0) { - /* No access logs were found on the host, - * likely because no OSTs were mounted. */ + if (rc == SSH_OK) { + /* Read timeout or eof + * No access logs were found on the host, + * likely because no OSTs were mounted. + */ sleep(60); } else { - /* XXX: should be configurable? */ + /* Read error + XXX: should be configurable? + */ sleep(5); } } @@ -890,6 +944,24 @@ void lamigo_alr_init(void) rc = pthread_create(&pid, NULL, lamigo_alr_heat_thread, NULL); if (rc) LX_FATAL("cannot start heat-maint thread: %s\n", strerror(rc)); + + alr_initialized = true; +} + +void lamigo_alr_dump_stat(FILE *f) +{ + struct alr_agent *ala; + + if (!alr_initialized) + return; + + fprintf(f, "access_log_readers:\n"); + + lipe_list_for_each_entry(ala, &alr_agent_list, ala_list) { + fprintf(f, " - host:'%s'\n", ala->ala_host); + fprintf(f, " last_message_at: %lu\n", + ala->ala_last_msg_received); + } } void lamigo_add_alr_agent(const char *host) -- 1.8.3.1