From c2e56982f2cc19f5f35e0d53327f7f0f93967981 Mon Sep 17 00:00:00 2001 From: Alexandre Ioffe Date: Tue, 12 Apr 2022 22:34:18 -0700 Subject: [PATCH] EX-5099 lipe: Made controllable ssh exec timeout - Introduce new lipe ssh API:lipe_ssh_exec_timeout() and lipe_ssh_start_cmd_timeout(). - Introduce new lamigo command option: --ssh-exec-timeout to configure ssh connection timeout for ssh exec cmd - Use lipe_ssh_start_cmd_timeout() to start remote access log reader with timeout. Use ssh_channel_read_timeout() with infinite timeout when reads access log records - Use lipe_ssh_start_cmd_timeout() to start remote "lfs ..." commands with a long timeout to prevent premature timeout when "lfs mirror extend ..." command for a big file takes too long time. - Use default timeout 600 seconds for ssh exec cmd. Such long timeout should allow to finish long lasting replications This fixes EX-5429. Test-Parameters: trivial clientdistro=el8.5 serverdistro=el8.5 testlist=hot-pools env=FAIL_ON_ERROR=false,ONLY="56 59",ONLY_REPEAT=20 Signed-off-by: Alexandre Ioffe Change-Id: I8de9b1db2014abd1e6f201cda73a0812128f6bb6 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/47057 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger --- lipe/src/lamigo.c | 35 +++++++++++++++++++++++++++++++---- lipe/src/lamigo.h | 1 + lipe/src/lamigo_alr.c | 18 +++++++++--------- lipe/src/lipe_ssh.c | 42 +++++++++++++++++++++++++++++++++++++++--- lipe/src/lipe_ssh.h | 6 ++++++ 5 files changed, 86 insertions(+), 16 deletions(-) diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index 6ac35b9..227ca7b 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -88,6 +88,7 @@ #define DEF_STATFS_REFRESH_INTV 5 /* OST statfs update interval, in seconds */ #define DEF_FAST_POOL_MAX_USED 30 /* open for migration if % space used is less than */ #define DEF_SLOW_POOL_MAX_USED 90 /* open for migration if % space used is less than */ +#define DEF_SSH_EXEC_TO_SECS (10 * 60) /* ssh exec timeout */ #define LAMIGO_USERFILE "/var/lib/lamigo-%s.chlg" #define LAMIGO_DUMPFILE "/var/run/lamigo-%s.stats" @@ -145,6 +146,7 @@ static void usage(void) /* --tgt-free, deprecated */ "\t--slow-pool=POOL (default '%s')\n" "\t--slow-pool-max-used=MAX stop mirroring to POOL when %% used reaches MAX (default %d)\n" + "\t--ssh-exec-timeout=SECS, ssh connection timeout for remote exec command (default: %d)\n" "\t--ssh-log-verbosity=V, set SSH_OPTIONS_LOG_VERBOSITY to V (default: 0)\n" "\t--timestamps, add timestamp to log messages (default: no)\n" "\t--version, print version information and exit\n", @@ -166,7 +168,8 @@ static void usage(void) DEF_HOT_FRACTION, DEF_HOT_AFTER_IDLE, DEF_SLOW_POOL, - DEF_SLOW_POOL_MAX_USED); + DEF_SLOW_POOL_MAX_USED, + DEF_SSH_EXEC_TO_SECS); exit(EXIT_SUCCESS); } @@ -247,6 +250,7 @@ struct options opt = { .o_fast_pool_max_used = DEF_FAST_POOL_MAX_USED, .o_slow_pool_max_used = DEF_SLOW_POOL_MAX_USED, .o_progress_interval = DEF_PROGRESS_INTV, + .o_ssh_exec_to = DEF_SSH_EXEC_TO_SECS, .o_alr_extra_args = DEF_ALR_EXTRA_ARGS, .o_alr_periods = DEF_ALR_PERIODS, .o_alr_period_time = DEF_ALR_PERIOD_SECS, @@ -616,7 +620,8 @@ static void lamigo_dump_stats_file(void) " hot_after_idle: %d\n" " fast_pool_max_used: %d\n" " slow_pool_max_used: %d\n" - " include_dom: %d\n", + " include_dom: %d\n" + " ssh_exec_to: %ld\n", opt.o_slow_pool, opt.o_min_age, opt.o_cache_size, opt.o_rescan, opt.o_num_threads, opt.o_pool_refresh, opt.o_progress_interval, opt.o_alr_periods, @@ -625,7 +630,8 @@ static void lamigo_dump_stats_file(void) opt.o_alr_hot_fraction, opt.o_alr_hot_after_idle, opt.o_fast_pool_max_used, opt.o_slow_pool_max_used, - opt.o_include_dom); + opt.o_include_dom, + opt.o_ssh_exec_to); for (pl = fast_pools; pl != NULL; pl = pl->pl_next, i++) fprintf(f, "pool %s:\n" " osts: %d\n" @@ -772,7 +778,8 @@ static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus rc = a->rag_is_local ? system(cmd) : - lipe_ssh_exec(&rss->rss_ctx, cmd, pstatus); + lipe_ssh_exec_timeout(&rss->rss_ctx, cmd, pstatus, + opt.o_ssh_exec_to); if (rc) llapi_error(LLAPI_MSG_INFO, rc, "error executing ssh command '%s' on '%s'", @@ -1917,6 +1924,7 @@ enum { LAMIGO_OPT_SLOW_POOL_MAX_USED, LAMIGO_OPT_SRC_DOM, /* == LAMIGO_OPT_INCLUDE_DOM + warning */ LAMIGO_OPT_SRC_FREE, /* == LAMIGO_OPT_FAST_POOL_MAX_USED + math + warning */ + LAMIGO_OPT_SSH_EXEC_TO, LAMIGO_OPT_SSH_LOG_VERBOSITY, LAMIGO_OPT_STATFS_REFRESH, LAMIGO_OPT_TGT_FREE, /* == LAMIGO_OPT_SLOW_POOL_MAX_USED + math + warning */ @@ -1961,6 +1969,7 @@ static struct option options[] = { { "src", required_argument, NULL, 's'}, { "src-dom", no_argument, NULL, LAMIGO_OPT_SRC_DOM}, { "src-free", required_argument, NULL, LAMIGO_OPT_SRC_FREE}, + { "ssh-exec-timeout", required_argument, NULL, LAMIGO_OPT_SSH_EXEC_TO }, { "ssh-log-verbosity", required_argument, NULL, LAMIGO_OPT_SSH_LOG_VERBOSITY }, { "statfs-refresh", required_argument, NULL, LAMIGO_OPT_STATFS_REFRESH }, { "tgt", required_argument, NULL, 't'}, @@ -2221,6 +2230,24 @@ static void lamigo_process_opt(int c, char *optarg) case LAMIGO_OPT_INCLUDE_DOM: opt.o_include_dom = 1; break; + case LAMIGO_OPT_SSH_EXEC_TO: + { + char *end; + long result = strtol(optarg, &end, 10); + + if (end == optarg || + ((result == LONG_MIN || result == LONG_MAX) && + errno == ERANGE) || + result < 0 ) { /* 0 for infinite timeout + * Minimum tiemout is 1 second + */ + LX_FATAL("invalid ssh exec connection timeout '%s'\n", + optarg); + } + + opt.o_ssh_exec_to = result; + } + break; case LAMIGO_OPT_SSH_LOG_VERBOSITY: lipe_ssh_log_verbosity = atoi(optarg); break; diff --git a/lipe/src/lamigo.h b/lipe/src/lamigo.h index 6f81554..883c7e8 100644 --- a/lipe/src/lamigo.h +++ b/lipe/src/lamigo.h @@ -90,6 +90,7 @@ struct options { int o_fast_pool_max_used; int o_slow_pool_max_used; int o_progress_interval; /* how often to show progress */ + long o_ssh_exec_to; /* ssh connection timeout for exec command */ char *o_alr_extra_args; int o_alr_periods; int o_alr_period_time; diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index a20fcd1..6036bcd 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -246,8 +246,7 @@ static int lamigo_alr_parse(const char *host, char *buf, int size, int *received static int lamigo_alr_agent_run(struct alr_agent *ala) { ssh_channel channel = NULL; - unsigned long now = time(NULL); - unsigned long last_checked = now; + time_t last_checked = time(NULL); int rc, offset = 0, received = 0; char cmd[PATH_MAX]; char buffer[16 * 1024]; @@ -261,9 +260,9 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) "ofd_access_log_reader -i %d -I %d %s 2> /dev/null", opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); - rc = lipe_ssh_start_cmd(&ala->ala_ctx, cmd, &channel); + rc = lipe_ssh_start_cmd_timeout(&ala->ala_ctx, cmd, &channel, 5); if (rc != SSH_OK) { - LX_ERROR("cannot start ofd_access_log_reader on host '%s': rc = %d\n", + LX_ERROR("cannot start access log reader agent on host '%s': rc = %d\n", ala->ala_host, rc); goto out; } @@ -271,11 +270,12 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) LX_DEBUG("started access log reader agent on '%s'\n", ala->ala_host); while (ssh_channel_is_open(channel) && !ssh_channel_is_eof(channel)) { - rc = ssh_channel_read(channel, buffer + offset, - sizeof(buffer) - offset, 0); - if (rc == 0) /* ssh_channel_read timeout */ - continue; - if (rc < 0) + time_t now; + + rc = ssh_channel_read_timeout(channel, buffer + offset, + sizeof(buffer) - offset, 0, + -1); /* SSH_TIMEOUT_INFINITE Internally */ + if (rc <= 0) break; offset = lamigo_alr_parse(ala->ala_host, buffer, offset + rc, &received); now = time(NULL); diff --git a/lipe/src/lipe_ssh.c b/lipe/src/lipe_ssh.c index a980dfe..62dcbb9 100644 --- a/lipe/src/lipe_ssh.c +++ b/lipe/src/lipe_ssh.c @@ -37,6 +37,7 @@ static int lipe_ssh_session_start_cmd(ssh_session session, const char *cmd, ssh_ goto out; } + /* This is always executed with SSH_TIMEOUT_DEFAULT internally */ rc = ssh_channel_request_exec(channel, cmd); if (rc != SSH_OK) { LX_ERROR("cannot execute SSH command: %d: %s\n", @@ -101,7 +102,7 @@ static void lipe_ssh_session_destroy(ssh_session *psession) static int lipe_ssh_session_create(ssh_session *psession, const char *host) { ssh_session session = NULL; - long timeout = 5; + const long timeout = 5; /* Default timeout used for ssh connect */ int rc; assert(SSH_OK == 0); @@ -143,6 +144,8 @@ static int lipe_ssh_session_create(ssh_session *psession, const char *host) goto out; } + ssh_set_blocking(session, 1); + /* Connect to the ssh server */ rc = ssh_connect(session); if (rc != SSH_OK) { @@ -205,7 +208,8 @@ int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host) return SSH_OK; } -int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel) +int lipe_ssh_start_cmd_timeout(struct lipe_ssh_context *ctx, const char *cmd, + ssh_channel *pchannel, long timeout_sec) { int rc; @@ -213,6 +217,16 @@ int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channe if (rc != SSH_OK) return rc; + if (timeout_sec == LIPE_SSH_TIMEOUT_INFINITE) + timeout_sec = 0; /* Blocking with timeout 0 is infinite */ + + rc = ssh_options_set(ctx->lsc_session, SSH_OPTIONS_TIMEOUT, &timeout_sec); + if (rc != SSH_OK) { + LX_ERROR("cannot set SSH timeout to %ld: %dsec: %s\n", + timeout_sec, rc, ssh_get_error(ctx->lsc_session)); + return SSH_ERROR; + } + rc = lipe_ssh_session_start_cmd(ctx->lsc_session, cmd, pchannel); if (rc != SSH_OK) lipe_ssh_context_fail(ctx); @@ -222,7 +236,13 @@ int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channe return rc; } -int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus) +int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel) +{ + return lipe_ssh_start_cmd_timeout(ctx, cmd, pchannel, LIPE_SSH_TIMEOUT_INFINITE); +} + +int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, + int *pstatus, long timeout_sec) { int rc; @@ -230,6 +250,17 @@ int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus) if (rc != SSH_OK) return rc; + if (timeout_sec == LIPE_SSH_TIMEOUT_INFINITE) + timeout_sec = 0; /* Blocking call with timeout 0 is infinite */ + + /* Set SSH_TIMEOUT_DEFAULT */ + rc = ssh_options_set(ctx->lsc_session, SSH_OPTIONS_TIMEOUT, &timeout_sec); + if (rc != SSH_OK) { + LX_ERROR("cannot set SSH timeout to %ld: %dsec: %s\n", + timeout_sec, rc, ssh_get_error(ctx->lsc_session)); + return SSH_ERROR; + } + /* Execute a remote command */ rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd, pstatus); if (rc < 0) @@ -239,3 +270,8 @@ int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus) return rc; } + +int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus) +{ + return lipe_ssh_exec_timeout(ctx, cmd, pstatus, LIPE_SSH_TIMEOUT_INFINITE); +} diff --git a/lipe/src/lipe_ssh.h b/lipe/src/lipe_ssh.h index fd51a53..25c1019 100644 --- a/lipe/src/lipe_ssh.h +++ b/lipe/src/lipe_ssh.h @@ -13,9 +13,15 @@ struct lipe_ssh_context { ssh_session lsc_session; }; +#define LIPE_SSH_TIMEOUT_INFINITE (0) + int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host); void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx); +int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, + int *pstatus, long timeout_sec); +int lipe_ssh_start_cmd_timeout(struct lipe_ssh_context *ctx, const char *cmd, + ssh_channel *pchannel, long timeout_sec); int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus); int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel); -- 1.8.3.1