From 00797e94f63a8dcad56cd115b205d944a5eb519a Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Wed, 19 Jul 2023 11:35:04 +0300 Subject: [PATCH] EX-3598 lamigo: dump lfs's stdout/stderr in case of error Lustre-change: https://review.whamcloud.com/44505 Lustre-commit: f8d365d8d7eaa24715fe4f68687051a6addc94bd Test-Parameters: trivial testlist=hot-pools Change-Id: Iea0c6fc097255f02df3698c1e3a31b39bfa09ca1 Signed-off-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51711 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lipe/src/lamigo.c | 24 +++++++++++++++++------- lipe/src/lipe_ssh.c | 36 +++++++++++++++++++++++++++++------- lipe/src/lipe_ssh.h | 6 ++++-- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index 098512d..9fe9233 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -770,7 +770,9 @@ static int lamigo_init_cache(void) return 0; } -static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus) +static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, + int *pstatus, char *outbuf, int outbuf_size, + char *errbuf, int errbuf_size) { struct resync_ssh_session *rss; int rc; @@ -795,6 +797,7 @@ static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus rc = a->rag_is_local ? system(cmd1) : lipe_ssh_exec_timeout(&rss->rss_ctx, cmd1, pstatus, + outbuf, outbuf_size, errbuf, errbuf_size, opt.o_ssh_exec_to); if (rc) llapi_error(LLAPI_MSG_INFO, rc, @@ -813,6 +816,7 @@ static void *lamigo_replicate_one(void *args) { struct resync_job *rj = (struct resync_job *)args; struct resync_agent *agent = rj->rj_agent; + char outbuf[4096], errbuf[4096]; int resync = rj->rj_resync; char cmd[PATH_MAX * 2]; int status = INT_MAX; @@ -821,7 +825,7 @@ static void *lamigo_replicate_one(void *args) if (rj->rj_setprefer) { snprintf(cmd, sizeof(cmd), "lfs setstripe --comp-set --comp-flags=prefer --pool='%s' " - "'%s/.lustre/fid/"DFID"' > /dev/null 2>&1", rj->rj_pool, + "'%s/.lustre/fid/"DFID"'", rj->rj_pool, agent->rag_mountpoint, PFID(&rj->rj_fid)); LX_DEBUG("set prefer on "DFID"\n", @@ -842,7 +846,7 @@ static void *lamigo_replicate_one(void *args) i += snprintf(cmd + i, sizeof(cmd) - i, " --flags='%s'", rj->rj_mirror_opts); i += snprintf(cmd + i, sizeof(cmd) - i, - " '%s/.lustre/fid/"DFID"' > /dev/null 2>&1", + " '%s/.lustre/fid/"DFID"'", agent->rag_mountpoint, PFID(&rj->rj_fid)); } else if (resync == AMIGO_RESYNC_RESYNC) { snprintf(cmd, sizeof(cmd), @@ -857,16 +861,22 @@ static void *lamigo_replicate_one(void *args) /* rc < 0 means an ssh error. Otherwise command exit status is * in status. Mask common exit statuses. */ - rc = lamigo_exec_cmd(agent, cmd, &status); + rc = lamigo_exec_cmd(agent, cmd, &status, outbuf, sizeof(outbuf), + errbuf, sizeof(errbuf)); LX_DEBUG("exec command '%s' on '%s': rc = %d, status = %d\n", cmd, agent->rag_hostname, rc, status); if (rc < 0 || /* 1 from setprefer (see EX-3591) */ (rj->rj_setprefer && status != 0 && status != 1) || /* EBUSY from mirror extend/resync */ - (!rj->rj_setprefer && status != 0 && status != EBUSY && status != ENODATA)) + (!rj->rj_setprefer && status != 0 && status != EBUSY && status != ENODATA)) { LX_ERROR("command '%s' on '%s' failed: rc = %d, status = %d\n", cmd, agent->rag_hostname, rc, status); + if (outbuf[0] != '\0') + LX_ERROR("STDOUT: %s\n", outbuf); + if (errbuf[0] != '\0') + LX_ERROR("STDERR: %s\n", errbuf); + } out: /* notify the main thread about completion */ write(lamigo_sigpipe[1], &rc, 1); @@ -1410,10 +1420,10 @@ static void *lamigo_check_agent_func(void *args) int status = INT_MAX; int rc; - snprintf(cmd, sizeof(cmd), "lfs path2fid '%s' > /dev/null 2>&1", + snprintf(cmd, sizeof(cmd), "lfs path2fid '%s'", a->rag_mountpoint); - rc = lamigo_exec_cmd(a, cmd, &status); + rc = lamigo_exec_cmd(a, cmd, &status, NULL, 0, NULL, 0); pthread_exit((void *)(intptr_t)(rc < 0 ? rc : status)); } diff --git a/lipe/src/lipe_ssh.c b/lipe/src/lipe_ssh.c index 62dcbb9..cd171e8 100644 --- a/lipe/src/lipe_ssh.c +++ b/lipe/src/lipe_ssh.c @@ -60,10 +60,12 @@ out: return rc; } -static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int *pstatus) +static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int *pstatus, + char *stdout_buf, int stdout_size, + char *stderr_buf, int stderr_size) { ssh_channel channel = NULL; - int rc; + int rc, nr; rc = lipe_ssh_session_start_cmd(session, cmd, &channel); if (rc != SSH_OK) @@ -80,6 +82,22 @@ static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int * *pstatus = rc; rc = SSH_OK; out: + if (stdout_buf) { + stdout_buf[0] = '\0'; + nr = ssh_channel_read_nonblocking(channel, stdout_buf, + stdout_size - 1, false); + if (nr > 0) + stdout_buf[nr] = '\0'; + } + + if (stderr_buf) { + stderr_buf[0] = '\0'; + nr = ssh_channel_read_nonblocking(channel, stderr_buf, + stderr_size - 1, true); + if (nr > 0) + stderr_buf[nr] = '\0'; + } + ssh_channel_send_eof(channel); ssh_channel_close(channel); ssh_channel_free(channel); @@ -241,8 +259,9 @@ int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channe return lipe_ssh_start_cmd_timeout(ctx, cmd, pchannel, LIPE_SSH_TIMEOUT_INFINITE); } -int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, - int *pstatus, long timeout_sec) +int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus, + char *outbuf, int outbuf_size, char *errbuf, int errbuf_size, + long timeout_sec) { int rc; @@ -262,7 +281,8 @@ int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, } /* Execute a remote command */ - rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd, pstatus); + rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd, pstatus, + outbuf, outbuf_size, errbuf, errbuf_size); if (rc < 0) lipe_ssh_context_fail(ctx); @@ -271,7 +291,9 @@ int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, return rc; } -int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus) +int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus, + char *outbuf, int outbuf_size, char *errbuf, int errbuf_size) { - return lipe_ssh_exec_timeout(ctx, cmd, pstatus, LIPE_SSH_TIMEOUT_INFINITE); + return lipe_ssh_exec_timeout(ctx, cmd, pstatus, outbuf, outbuf_size, + errbuf, errbuf_size, LIPE_SSH_TIMEOUT_INFINITE); } diff --git a/lipe/src/lipe_ssh.h b/lipe/src/lipe_ssh.h index 25c1019..5067663 100644 --- a/lipe/src/lipe_ssh.h +++ b/lipe/src/lipe_ssh.h @@ -19,10 +19,12 @@ int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host); void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx); int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, - int *pstatus, long timeout_sec); + int *pstatus, char *outbuf, int outbuf_size, + char *errbuf, int errbuf_size, long timeout_sec); int lipe_ssh_start_cmd_timeout(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel, long timeout_sec); -int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus); +int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus, + char *outbuf, int outbuf_size, char *errbuf, int errbuf_size); int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel); #endif /* _LIPE_SSH_H_ */ -- 1.8.3.1