From aae033a5ed7e69a2aa352b4c3f29bf4d43adb397 Mon Sep 17 00:00:00 2001 From: "John L. Hammond" Date: Thu, 14 Oct 2021 11:24:47 -0500 Subject: [PATCH] EX-4061 lipe: reinitialize channel after alr disconnection Move most of lamigo_alr_data_collection_thread() into lamigo_alr_agent_run() which manages ssh channel resources properly. Keep a while-try-sleep loop in lamigo_alr_data_collection_thread(). Test-Parameters: trivial testlist=hot-pools Signed-off-by: John L. Hammond Change-Id: Ibe866f5b4a6ddea560dcb57fb0ed3fbbf3d52710 Reviewed-on: https://review.whamcloud.com/45243 Tested-by: jenkins Reviewed-by: Alexandre Ioffe Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/46097 --- lipe/src/lamigo_alr.c | 57 ++++++++++++++++++++++++++++++++++----------------- lipe/src/lipe_ssh.c | 2 +- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index 47e78eb..0cf3df4 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -232,15 +232,20 @@ int lamigo_alr_parse(char *buf, int size, int *received) return 0; } - -/* this function works with specific OSS */ -/* it initiates ssh session with running ofd_access_batch */ -/* and gets data every N seconds */ -/* then the data are processed and inserted into the global structure */ -void *lamigo_alr_data_collection_thread(void *arg) +/* Run ofd_access_log_reader once on the specified OSS. + * + * It initiates ssh session with running ofd_access_batch and gets + * data every N seconds. Then the data are processed and inserted + * into the global structure. + * + * FIXME Do a read with timeout to detect silently disconnected + * peers. It would be nice to use ssh_keepalive_send() but I cannot + * make this work. Consider having ofd_access_log_reader send a line + * that we ignore at the start of every batch (even if the batch is + * empty). 
*/ +static int lamigo_alr_agent_run(struct alr_agent *ala) { - ssh_channel channel; - struct alr_agent *ala = arg; + ssh_channel channel = NULL; unsigned long now = time(NULL); unsigned long last_checked = now; int rc, offset = 0, received = 0; @@ -253,16 +258,15 @@ void *lamigo_alr_data_collection_thread(void *arg) * lipe_ssh_exec(). */ snprintf(cmd, sizeof(cmd), - "ofd_access_log_reader -i %d -I %d %s 2> /dev/null", - opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); + "ofd_access_log_reader -i %d -I %d %s 2> /dev/null", + opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); -repeat: rc = lipe_ssh_start_cmd(&ala->ala_ctx, cmd, &channel); if (rc != SSH_OK) { llapi_error(LLAPI_MSG_ERROR|LLAPI_MSG_NO_ERRNO, 0, "cannot start ofd_access_log_reader on host '%s': rc = %d\n", ala->ala_host, rc); - goto err; + goto out; } llapi_err_noerrno(LLAPI_MSG_DEBUG, "alr agent on %s is running\n", @@ -287,16 +291,31 @@ repeat: llapi_err_noerrno(rc == 0 ? LLAPI_MSG_DEBUG : LLAPI_MSG_ERROR, "alr agent on %s exited with status %d\n", ala->ala_host, rc); -err: - ssh_channel_send_eof(channel); - ssh_channel_close(channel); +out: + if (channel != NULL) { + ssh_channel_send_eof(channel); + ssh_channel_close(channel); + } + ssh_channel_free(channel); - /* wait for a while */ - /* XXX: should be configurable? */ - sleep(5); + return rc; +} - goto repeat; +static void *lamigo_alr_data_collection_thread(void *arg) +{ + struct alr_agent *ala = arg; + + while (1) { + lamigo_alr_agent_run(ala); + /* wait for a while */ + /* XXX: should be configurable? */ + sleep(5); + } + + /* Add a return to make GCC happy. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97793. 
*/ + + return NULL; } static void lamigo_clean_oldest_list(void) diff --git a/lipe/src/lipe_ssh.c b/lipe/src/lipe_ssh.c index 8df19fa..1bf8ab2 100644 --- a/lipe/src/lipe_ssh.c +++ b/lipe/src/lipe_ssh.c @@ -66,7 +66,7 @@ out: static int lipe_ssh_session_exec_cmd(ssh_session session, const char *cmd, int *pstatus) { - ssh_channel channel; + ssh_channel channel = NULL; int rc; rc = lipe_ssh_session_start_cmd(session, cmd, &channel); -- 1.8.3.1