From 720ea630a8b5a98deaf0ab4ea3730a343d0f2db9 Mon Sep 17 00:00:00 2001 From: Vitaliy Kuznetsov Date: Mon, 2 Dec 2024 16:13:20 +0100 Subject: [PATCH] LU-18503 utils: keepalive message from ofd_access_log_reader Checks ofd_access_log_reader for keepalive message feature availability. Start ofd_access_log_reader with keepalive message option if it is available. Added tests for ofd_access_log_reader to test keepalive message feature. Fixed a bug when trace file is opened. Moved FATAL, ERROR, DEBUG macros to include and used FATAL macro where possible Signed-off-by: Alexandre Ioffe Signed-off-by: Vitaliy Kuznetsov Change-Id: I0f218e2394c9d0ab6cd425860ba79956a10cbd58 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57691 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin --- lustre/tests/sanity.sh | 32 ++++++++++++++++ lustre/tests/yaml.sh | 2 +- lustre/utils/ofd_access_batch.c | 71 ++++++++++++++++++++++++------------ lustre/utils/ofd_access_batch.h | 32 ++++++++++++++++ lustre/utils/ofd_access_log_reader.c | 43 +++++++--------------- 5 files changed, 127 insertions(+), 53 deletions(-) diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index cb1442e..266a27d 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -20893,6 +20893,38 @@ test_165f() { } run_test 165f "ofd_access_log_reader --exit-on-close works" +test_165g() { + local count + local trace=$TMP/${tfile}.trace + local ofd_keepalive=$(do_facet ost1 ofd_access_log_reader --help | + grep keepalive) + + [[ -n "$ofd_keepalive" ]] || skip "ALR keepalive message unsupported" + + setup_165 + do_facet ost1 timeout 60 ofd_access_log_reader \ + --keepalive=3 --batch-interval=4 --debug=- > "${trace}" & + sleep 40 + stop ost1 + + do_facet ost1 killall -TERM ofd_access_log_reader + wait + rc=$? + + if ((rc != 0)); then + error "ofd_access_log_reader exited with rc = '${rc}'" + fi + + count=$(grep -c "send keepalive" ${trace}) + + # Normal duration 40 seconds / 4 secs interval = 10 times or so. + # But sometimes the duration may be unpredictably long + (( count >= 7 )) || error "keepalive msg received $count. Expected >= 7" + + echo "keepalive msg received $count" +} +run_test 165g "ofd_access_log_reader --keepalive works" + test_169() { # do directio so as not to populate the page cache log "creating a 10 Mb file" diff --git a/lustre/tests/yaml.sh b/lustre/tests/yaml.sh index 9a9a98f..49a9888 100644 --- a/lustre/tests/yaml.sh +++ b/lustre/tests/yaml.sh @@ -9,7 +9,7 @@ split_output() { while read line; do host=${line%%:*}; echo "$line" | sed "s/^${host}: //" | sed "s/^${host}://" \ - >> $logdir/node.$host.yml; + >> "$logdir/node.$host.yml"; done } diff --git a/lustre/utils/ofd_access_batch.c b/lustre/utils/ofd_access_batch.c index 7bfee69..378beb4 100644 --- a/lustre/utils/ofd_access_batch.c +++ b/lustre/utils/ofd_access_batch.c @@ -45,6 +45,7 @@ #include "lstddef.h" #include "ofd_access_batch.h" +static time_t when_last_printed; struct fid_hash_node { struct list_head fhn_node; struct lu_fid fhn_fid; @@ -74,7 +75,7 @@ static void fhn_del_init(struct fid_hash_node *fhn) } static inline void fhn_replace_init(struct fid_hash_node *old_fhn, - struct fid_hash_node *new_fhn) + struct fid_hash_node *new_fhn) { list_add(&new_fhn->fhn_node, &old_fhn->fhn_node); list_del_init(&old_fhn->fhn_node); @@ -267,6 +268,7 @@ static void alre_printf(FILE *f, struct alr_entry *alre, enum alr_rw d) (unsigned long long)alre->alre_segment_count[d], (unsigned long long)alre->alre_count[d], (d == ALR_READ) ? 'r' : 'w'); + when_last_printed = time(NULL); } struct alr_thread_arg { @@ -276,6 +278,35 @@ struct alr_thread_arg { pthread_mutex_t *file_mutex; }; +static void alre_print_keepalive(struct alr_thread_arg *aa) +{ + /* Do not print keepalive if disabled */ + if (keepalive_interval == 0) + return; + + /* If nothing printed during keepalive_interval - send keepalive */ + if (time(NULL) < (when_last_printed + keepalive_interval)) + return; + + fprintf(aa->file, "# keepalive\n"); + when_last_printed = time(NULL); + DEBUG("send keepalive\n"); +} + +static void alre_print_keepalive_locked(struct alr_thread_arg *aa) +{ + int rc = pthread_mutex_lock(aa->file_mutex); + + if (rc != 0) + FATAL("cannot lock batch file: %s\n", strerror(rc)); + + alre_print_keepalive(aa); + + rc = pthread_mutex_unlock(aa->file_mutex); + if (rc != 0) + FATAL("cannot unlock batch file: %s\n", strerror(rc)); +} + /* Fraction < 100 */ static void *alr_sort_and_print_thread(void *arg) { @@ -298,10 +329,8 @@ static void *alr_sort_and_print_thread(void *arg) goto out; sa = calloc(nr, sizeof(*sa)); - if (!sa) { - fprintf(stderr, "cannot allocate memory for sorting\n"); - exit(1); - } + if (!sa) + FATAL("cannot allocate memory for sorting\n"); i = 0; list_for_each_entry(alre, tmp, alre_fid_hash_node.fhn_node) { @@ -322,11 +351,8 @@ static void *alr_sort_and_print_thread(void *arg) /* Prevent jumbled output from multiple concurrent sort and * print threads. */ rc = pthread_mutex_lock(aa->file_mutex); - if (rc != 0) { - fprintf(stderr, "cannot lock batch file: %s\n", - strerror(rc)); - exit(1); - } + if (rc != 0) + FATAL("cannot lock batch file: %s\n", strerror(rc)); /* there might be lots of items at @cut, but we want to limit total * output. so the first loop dumps all items > @cut and the second @@ -352,13 +378,13 @@ static void *alr_sort_and_print_thread(void *arg) } rc = pthread_mutex_unlock(aa->file_mutex); - if (rc != 0) { - fprintf(stderr, "cannot unlock batch file: %s\n", - strerror(rc)); - exit(1); - } + if (rc != 0) + FATAL("cannot unlock batch file: %s\n", strerror(rc)); out: + /* send keepalive */ + alre_print_keepalive_locked(aa); + fflush(aa->file); list_for_each_entry_safe(alre, next, tmp, alre_fid_hash_node.fhn_node) { @@ -381,10 +407,8 @@ static void *alr_print_thread_fraction_100(void *arg) /* Prevent jumbled output from multiple concurrent sort and * print threads. */ rc = pthread_mutex_lock(aa->file_mutex); - if (rc != 0) { - fprintf(stderr, "cannot lock batch file: %s\n", strerror(rc)); - exit(1); - } + if (rc != 0) + FATAL("cannot lock batch file: %s\n", strerror(rc)); list_for_each_entry(alre, &aa->list, alre_fid_hash_node.fhn_node) { enum alr_rw d; @@ -395,11 +419,12 @@ static void *alr_print_thread_fraction_100(void *arg) } } + /* send keepalive */ + alre_print_keepalive(aa); + rc = pthread_mutex_unlock(aa->file_mutex); - if (rc != 0) { - fprintf(stderr, "cannot unlock batch file: %s\n", strerror(rc)); - exit(1); - } + if (rc != 0) + FATAL("cannot unlock batch file: %s\n", strerror(rc)); fflush(aa->file); diff --git a/lustre/utils/ofd_access_batch.h b/lustre/utils/ofd_access_batch.h index 7a4771c..a8d5c39 100644 --- a/lustre/utils/ofd_access_batch.h +++ b/lustre/utils/ofd_access_batch.h @@ -4,9 +4,41 @@ #include #include #include +#include + +extern FILE *debug_file; +extern FILE *trace_file; + +#define DEBUG(fmt, args...) \ + do { \ + if (debug_file != NULL) \ + fprintf(debug_file, "DEBUG %s:%d: "fmt, \ + __func__, __LINE__, ##args); \ + } while (0) + +#define TRACE(fmt, args...) \ + do { \ + if (trace_file != NULL) \ + fprintf(trace_file, "TRACE "fmt, ##args); \ + } while (0) + +#define DEBUG_D(x) DEBUG("%s = %"PRIdMAX"\n", #x, (intmax_t)x) +#define DEBUG_P(x) DEBUG("%s = %p\n", #x, x) +#define DEBUG_S(x) DEBUG("%s = '%s'\n", #x, x) +#define DEBUG_U(x) DEBUG("%s = %"PRIuMAX"\n", #x, (uintmax_t)x) + +#define ERROR(fmt, args...) \ + fprintf(stderr, "%s: "fmt, program_invocation_short_name, ##args) + +#define FATAL(fmt, args...) \ + do { \ + ERROR("FATAL: "fmt, ##args); \ + exit(EXIT_FAILURE); \ + } while (0) struct lu_fid; struct alr_batch; +extern unsigned long keepalive_interval; struct alr_batch *alr_batch_create(unsigned int shift); void alr_batch_destroy(struct alr_batch *alrb); diff --git a/lustre/utils/ofd_access_log_reader.c b/lustre/utils/ofd_access_log_reader.c index 277cbcb..b91051e 100644 --- a/lustre/utils/ofd_access_log_reader.c +++ b/lustre/utils/ofd_access_log_reader.c @@ -70,34 +70,8 @@ /* TODO fsname filter */ -static FILE *debug_file; -static FILE *trace_file; - -#define DEBUG(fmt, args...) \ - do { \ - if (debug_file != NULL) \ - fprintf(debug_file, "DEBUG %s:%d: "fmt, __func__, __LINE__, ##args); \ - } while (0) - -#define TRACE(fmt, args...) \ - do { \ - if (trace_file != NULL) \ - fprintf(trace_file, "TRACE "fmt, ##args); \ - } while (0) - -#define DEBUG_D(x) DEBUG("%s = %"PRIdMAX"\n", #x, (intmax_t)x) -#define DEBUG_P(x) DEBUG("%s = %p\n", #x, x) -#define DEBUG_S(x) DEBUG("%s = '%s'\n", #x, x) -#define DEBUG_U(x) DEBUG("%s = %"PRIuMAX"\n", #x, (uintmax_t)x) - -#define ERROR(fmt, args...) \ - fprintf(stderr, "%s: "fmt, program_invocation_short_name, ##args) - -#define FATAL(fmt, args...) \ - do { \ - ERROR("FATAL: "fmt, ##args); \ - exit(EXIT_FAILURE); \ - } while (0) +FILE *debug_file; +FILE *trace_file; enum { ALR_EXIT_SUCCESS = INT_MIN + EXIT_SUCCESS, @@ -144,6 +118,8 @@ static int alr_print_fraction = 100; #define P_ALR_LOG(al) \ P_ALR_DEV(&(al)->alr_dev), major((al)->alr_rdev), minor((al)->alr_rdev) +unsigned long keepalive_interval; + static void alr_dev_free(int epoll_fd, struct alr_dev *ad) { if (ad == NULL) @@ -694,9 +670,10 @@ static void usage(void) " -e, --exit-on-close exit on close of all log devices\n" " -I, --mdt-index-filter=INDEX set log MDT index filter to INDEX\n" " -h, --help display this help and exit\n" +" --keepalive=INTERVAL print keepalive message every INTERVAL seconds\n" " -l, --list print YAML list of available access logs\n" " -d, --debug[=FILE] print debug messages to FILE (stderr)\n" -" -s, --stats=FILE print stats messages to FILE (stderr)\n" +" -s, --stats=FILE print stats messages to FILE (stderr)\n" " -t, --trace[=FILE] print trace messages to FILE (stderr)\n", program_invocation_short_name); } @@ -724,6 +701,7 @@ int main(int argc, char *argv[]) { .name = "batch-interval", .has_arg = required_argument, .val = 'i', }, { .name = "batch-offset", .has_arg = required_argument, .val = 'o', }, { .name = "exit-on-close", .has_arg = no_argument, .val = 'e', }, + { .name = "keepalive", .has_arg = required_argument, .val = 'k' }, { .name = "mdt-index-filter", .has_arg = required_argument, .val = 'I' }, { .name = "debug", .has_arg = optional_argument, .val = 'd', }, { .name = "help", .has_arg = no_argument, .val = 'h', }, @@ -748,6 +726,13 @@ int main(int argc, char *argv[]) errno != 0) FATAL("invalid batch interval '%s'\n", optarg); break; + case 'k': + errno = 0; + keepalive_interval = strtoll(optarg, NULL, 0); + if (keepalive_interval > 1048576 || errno != 0) + FATAL("invalid keepalive message interval '%s'\n", + optarg); + break; case 'o': errno = 0; batch_offset = strtoll(optarg, NULL, 0); -- 1.8.3.1