Whamcloud - gitweb
EX-3598 lamigo: dump lfs's stdout/stderr
author Alex Zhuravlev <bzzz@whamcloud.com>
Wed, 19 Jul 2023 08:35:04 +0000 (11:35 +0300)
committer Andreas Dilger <adilger@whamcloud.com>
Mon, 21 Aug 2023 08:41:57 +0000 (08:41 +0000)
Capture the stdout/stderr of the lfs command executed by lamigo and
log them in case of error.

Lustre-change: https://review.whamcloud.com/44505
Lustre-commit: f8d365d8d7eaa24715fe4f68687051a6addc94bd

Test-Parameters: trivial testlist=hot-pools
Change-Id: Iea0c6fc097255f02df3698c1e3a31b39bfa09ca1
Signed-off-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/51711
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lipe/src/lamigo.c
lipe/src/lipe_ssh.c
lipe/src/lipe_ssh.h

index 098512d..9fe9233 100644 (file)
@@ -770,7 +770,9 @@ static int lamigo_init_cache(void)
        return 0;
 }
 
-static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus)
+static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd,
+                          int *pstatus, char *outbuf, int outbuf_size,
+                          char *errbuf, int errbuf_size)
 {
        struct resync_ssh_session *rss;
        int rc;
@@ -795,6 +797,7 @@ static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus
        rc = a->rag_is_local ?
                system(cmd1) :
                lipe_ssh_exec_timeout(&rss->rss_ctx, cmd1, pstatus,
+                                     outbuf, outbuf_size, errbuf, errbuf_size,
                                      opt.o_ssh_exec_to);
        if (rc)
                llapi_error(LLAPI_MSG_INFO, rc,
@@ -813,6 +816,7 @@ static void *lamigo_replicate_one(void *args)
 {
        struct resync_job *rj = (struct resync_job *)args;
        struct resync_agent *agent = rj->rj_agent;
+       char outbuf[4096], errbuf[4096];
        int resync = rj->rj_resync;
        char cmd[PATH_MAX * 2];
        int status = INT_MAX;
@@ -821,7 +825,7 @@ static void *lamigo_replicate_one(void *args)
        if (rj->rj_setprefer) {
                snprintf(cmd, sizeof(cmd),
                         "lfs setstripe --comp-set --comp-flags=prefer --pool='%s' "
-                        "'%s/.lustre/fid/"DFID"' > /dev/null 2>&1", rj->rj_pool,
+                        "'%s/.lustre/fid/"DFID"'", rj->rj_pool,
                         agent->rag_mountpoint,
                         PFID(&rj->rj_fid));
                LX_DEBUG("set prefer on "DFID"\n",
@@ -842,7 +846,7 @@ static void *lamigo_replicate_one(void *args)
                        i += snprintf(cmd + i, sizeof(cmd) - i,
                                      " --flags='%s'", rj->rj_mirror_opts);
                i += snprintf(cmd + i, sizeof(cmd) - i,
-                             " '%s/.lustre/fid/"DFID"' > /dev/null 2>&1",
+                             " '%s/.lustre/fid/"DFID"'",
                              agent->rag_mountpoint, PFID(&rj->rj_fid));
        } else if (resync == AMIGO_RESYNC_RESYNC) {
                snprintf(cmd, sizeof(cmd),
@@ -857,16 +861,22 @@ static void *lamigo_replicate_one(void *args)
 
        /* rc < 0 means an ssh error. Otherwise command exit status is
         * in status. Mask common exit statuses. */
-       rc = lamigo_exec_cmd(agent, cmd, &status);
+       rc = lamigo_exec_cmd(agent, cmd, &status, outbuf, sizeof(outbuf),
+                            errbuf, sizeof(errbuf));
        LX_DEBUG("exec command '%s' on '%s': rc = %d, status = %d\n",
                 cmd, agent->rag_hostname, rc, status);
        if (rc < 0 ||
            /* 1 from setprefer (see EX-3591) */
            (rj->rj_setprefer && status != 0 && status != 1) ||
            /* EBUSY from mirror extend/resync */
-           (!rj->rj_setprefer && status != 0 && status != EBUSY && status != ENODATA))
+           (!rj->rj_setprefer && status != 0 && status != EBUSY && status != ENODATA)) {
                LX_ERROR("command '%s' on '%s' failed: rc = %d, status = %d\n",
                         cmd, agent->rag_hostname, rc, status);
+               if (outbuf[0] != '\0')
+                       LX_ERROR("STDOUT: %s\n", outbuf);
+               if (errbuf[0] != '\0')
+                       LX_ERROR("STDERR: %s\n", errbuf);
+       }
 out:
        /* notify the main thread about completion */
        write(lamigo_sigpipe[1], &rc, 1);
@@ -1410,10 +1420,10 @@ static void *lamigo_check_agent_func(void *args)
        int status = INT_MAX;
        int rc;
 
-       snprintf(cmd, sizeof(cmd), "lfs path2fid '%s' > /dev/null 2>&1",
+       snprintf(cmd, sizeof(cmd), "lfs path2fid '%s'",
                 a->rag_mountpoint);
 
-       rc = lamigo_exec_cmd(a, cmd, &status);
+       rc = lamigo_exec_cmd(a, cmd, &status, NULL, 0, NULL, 0);
 
        pthread_exit((void *)(intptr_t)(rc < 0 ? rc : status));
 }
index 62dcbb9..cd171e8 100644 (file)
@@ -60,10 +60,12 @@ out:
        return rc;
 }
 
-static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int *pstatus)
+static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int *pstatus,
+                                       char *stdout_buf, int stdout_size,
+                                       char *stderr_buf, int stderr_size)
 {
        ssh_channel channel = NULL;
-       int rc;
+       int rc, nr;
 
        rc = lipe_ssh_session_start_cmd(session, cmd, &channel);
        if (rc != SSH_OK)
@@ -80,6 +82,22 @@ static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int *
        *pstatus = rc;
        rc = SSH_OK;
 out:
+       if (stdout_buf) {
+               stdout_buf[0] = '\0';
+               nr = ssh_channel_read_nonblocking(channel, stdout_buf,
+                                                 stdout_size - 1, false);
+               if (nr > 0)
+                       stdout_buf[nr] = '\0';
+       }
+
+       if (stderr_buf) {
+               stderr_buf[0] = '\0';
+               nr = ssh_channel_read_nonblocking(channel, stderr_buf,
+                                                 stderr_size - 1, true);
+               if (nr > 0)
+                       stderr_buf[nr] = '\0';
+       }
+
        ssh_channel_send_eof(channel);
        ssh_channel_close(channel);
        ssh_channel_free(channel);
@@ -241,8 +259,9 @@ int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channe
        return lipe_ssh_start_cmd_timeout(ctx, cmd, pchannel, LIPE_SSH_TIMEOUT_INFINITE);
 }
 
-int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd,
-                         int *pstatus, long timeout_sec)
+int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus,
+                         char *outbuf, int outbuf_size, char *errbuf, int errbuf_size,
+                         long timeout_sec)
 {
        int rc;
 
@@ -262,7 +281,8 @@ int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd,
        }
 
        /* Execute a remote command */
-       rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd, pstatus);
+       rc = lipe_ssh_session_exec_cmd(ctx->lsc_session, cmd, pstatus,
+                                      outbuf, outbuf_size, errbuf, errbuf_size);
        if (rc < 0)
                lipe_ssh_context_fail(ctx);
 
@@ -271,7 +291,9 @@ int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd,
        return rc;
 }
 
-int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus)
+int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus,
+                 char *outbuf, int outbuf_size, char *errbuf, int errbuf_size)
 {
-       return lipe_ssh_exec_timeout(ctx, cmd, pstatus, LIPE_SSH_TIMEOUT_INFINITE);
+       return lipe_ssh_exec_timeout(ctx, cmd, pstatus, outbuf, outbuf_size,
+                                       errbuf, errbuf_size, LIPE_SSH_TIMEOUT_INFINITE);
 }
index 25c1019..5067663 100644 (file)
@@ -19,10 +19,12 @@ int lipe_ssh_context_init(struct lipe_ssh_context *ctx, const char *host);
 void lipe_ssh_context_destroy(struct lipe_ssh_context *ctx);
 
 int lipe_ssh_exec_timeout(struct lipe_ssh_context *ctx, const char *cmd,
-                         int *pstatus, long timeout_sec);
+                         int *pstatus, char *outbuf, int outbuf_size,
+                         char *errbuf, int errbuf_size, long timeout_sec);
 int lipe_ssh_start_cmd_timeout(struct lipe_ssh_context *ctx, const char *cmd,
                               ssh_channel *pchannel, long timeout_sec);
-int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus);
+int lipe_ssh_exec(struct lipe_ssh_context *ctx, const char *cmd, int *pstatus,
+                 char *outbuf, int outbuf_size, char *errbuf, int errbuf_size);
 int lipe_ssh_start_cmd(struct lipe_ssh_context *ctx, const char *cmd, ssh_channel *pchannel);
 
 #endif /* _LIPE_SSH_H_ */