Whamcloud - gitweb
LU-18503 utils: keepalive message from ofd_access_log_reader 91/57691/5
authorVitaliy Kuznetsov <vkuznetsov@ddn.com>
Mon, 2 Dec 2024 15:13:20 +0000 (16:13 +0100)
committerOleg Drokin <green@whamcloud.com>
Sun, 2 Feb 2025 06:29:43 +0000 (06:29 +0000)
Checks ofd_access_log_reader for keepalive message feature
availability. Start ofd_access_log_reader with keepalive
message option if it is available.
Added tests for ofd_access_log_reader to test keepalive
message feature. Fixed a bug when trace file is opened.
Moved FATAL, ERROR, DEBUG macros to include and used FATAL
macro where possible

Signed-off-by: Alexandre Ioffe <aioffe@ddn.com>
Signed-off-by: Vitaliy Kuznetsov <vkuznetsov@ddn.com>
Change-Id: I0f218e2394c9d0ab6cd425860ba79956a10cbd58
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/57691
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/sanity.sh
lustre/tests/yaml.sh
lustre/utils/ofd_access_batch.c
lustre/utils/ofd_access_batch.h
lustre/utils/ofd_access_log_reader.c

index cb1442e..266a27d 100755 (executable)
@@ -20893,6 +20893,38 @@ test_165f() {
 }
 run_test 165f "ofd_access_log_reader --exit-on-close works"
 
+test_165g() {
+       local count
+       local trace=$TMP/${tfile}.trace
+       local ofd_keepalive=$(do_facet ost1 ofd_access_log_reader --help |
+                               grep keepalive)
+
+       [[ -n "$ofd_keepalive" ]] || skip "ALR keepalive message unsupported"
+
+       setup_165
+       do_facet ost1 timeout 60 ofd_access_log_reader \
+               --keepalive=3 --batch-interval=4 --debug=- > "${trace}" &
+       sleep 40
+       stop ost1
+
+       do_facet ost1 killall -TERM ofd_access_log_reader
+       wait
+       rc=$?
+
+       if ((rc != 0)); then
+               error "ofd_access_log_reader exited with rc = '${rc}'"
+       fi
+
+       count=$(grep -c "send keepalive" ${trace})
+
+       # Normal duration 40 seconds / 4 secs interval = 10 times or so.
+       # But sometimes the duration may be unpredictably long
+       (( count >= 7 )) || error "keepalive msg received $count. Expected >= 7"
+
+       echo "keepalive msg received $count"
+}
+run_test 165g "ofd_access_log_reader --keepalive works"
+
 test_169() {
        # do directio so as not to populate the page cache
        log "creating a 10 Mb file"
index 9a9a98f..49a9888 100644 (file)
@@ -9,7 +9,7 @@ split_output() {
     while read line; do
         host=${line%%:*};
         echo "$line" | sed "s/^${host}: //" | sed "s/^${host}://" \
-            >> $logdir/node.$host.yml;
+            >> "$logdir/node.$host.yml";
     done
 }
 
index 7bfee69..378beb4 100644 (file)
@@ -45,6 +45,7 @@
 #include "lstddef.h"
 #include "ofd_access_batch.h"
 
+static time_t when_last_printed;
 struct fid_hash_node {
        struct list_head fhn_node;
        struct lu_fid fhn_fid;
@@ -74,7 +75,7 @@ static void fhn_del_init(struct fid_hash_node *fhn)
 }
 
 static inline void fhn_replace_init(struct fid_hash_node *old_fhn,
-                               struct fid_hash_node *new_fhn)
+                                   struct fid_hash_node *new_fhn)
 {
        list_add(&new_fhn->fhn_node, &old_fhn->fhn_node);
        list_del_init(&old_fhn->fhn_node);
@@ -267,6 +268,7 @@ static void alre_printf(FILE *f, struct alr_entry *alre, enum alr_rw d)
                (unsigned long long)alre->alre_segment_count[d],
                (unsigned long long)alre->alre_count[d],
                (d == ALR_READ) ? 'r' : 'w');
+       when_last_printed = time(NULL);
 }
 
 struct alr_thread_arg {
@@ -276,6 +278,35 @@ struct alr_thread_arg {
        pthread_mutex_t *file_mutex;
 };
 
+static void alre_print_keepalive(struct alr_thread_arg *aa)
+{
+       /* Do not print keepalive if disabled */
+       if (keepalive_interval == 0)
+               return;
+
+       /* If nothing printed during keepalive_interval - send keepalive */
+       if (time(NULL) < (when_last_printed + keepalive_interval))
+               return;
+
+       fprintf(aa->file, "# keepalive\n");
+       when_last_printed = time(NULL);
+       DEBUG("send keepalive\n");
+}
+
+static void alre_print_keepalive_locked(struct alr_thread_arg *aa)
+{
+       int rc = pthread_mutex_lock(aa->file_mutex);
+
+       if (rc != 0)
+               FATAL("cannot lock batch file: %s\n", strerror(rc));
+
+       alre_print_keepalive(aa);
+
+       rc = pthread_mutex_unlock(aa->file_mutex);
+       if (rc != 0)
+               FATAL("cannot unlock batch file: %s\n", strerror(rc));
+}
+
 /* Fraction < 100 */
 static void *alr_sort_and_print_thread(void *arg)
 {
@@ -298,10 +329,8 @@ static void *alr_sort_and_print_thread(void *arg)
                goto out;
 
        sa = calloc(nr, sizeof(*sa));
-       if (!sa) {
-               fprintf(stderr, "cannot allocate memory for sorting\n");
-               exit(1);
-       }
+       if (!sa)
+               FATAL("cannot allocate memory for sorting\n");
 
        i = 0;
        list_for_each_entry(alre, tmp, alre_fid_hash_node.fhn_node) {
@@ -322,11 +351,8 @@ static void *alr_sort_and_print_thread(void *arg)
        /* Prevent jumbled output from multiple concurrent sort and
         * print threads. */
        rc = pthread_mutex_lock(aa->file_mutex);
-       if (rc != 0) {
-               fprintf(stderr, "cannot lock batch file: %s\n",
-                       strerror(rc));
-               exit(1);
-       }
+       if (rc != 0)
+               FATAL("cannot lock batch file: %s\n", strerror(rc));
 
        /* there might be lots of items at @cut, but we want to limit total
         * output. so the first loop dumps all items > @cut and the second
@@ -352,13 +378,13 @@ static void *alr_sort_and_print_thread(void *arg)
        }
 
        rc = pthread_mutex_unlock(aa->file_mutex);
-       if (rc != 0) {
-               fprintf(stderr, "cannot unlock batch file: %s\n",
-                       strerror(rc));
-               exit(1);
-       }
+       if (rc != 0)
+               FATAL("cannot unlock batch file: %s\n", strerror(rc));
 
 out:
+       /* send keepalive */
+       alre_print_keepalive_locked(aa);
+
        fflush(aa->file);
 
        list_for_each_entry_safe(alre, next, tmp, alre_fid_hash_node.fhn_node) {
@@ -381,10 +407,8 @@ static void *alr_print_thread_fraction_100(void *arg)
        /* Prevent jumbled output from multiple concurrent sort and
         * print threads. */
        rc = pthread_mutex_lock(aa->file_mutex);
-       if (rc != 0) {
-               fprintf(stderr, "cannot lock batch file: %s\n", strerror(rc));
-               exit(1);
-       }
+       if (rc != 0)
+               FATAL("cannot lock batch file: %s\n", strerror(rc));
 
        list_for_each_entry(alre, &aa->list, alre_fid_hash_node.fhn_node) {
                enum alr_rw d;
@@ -395,11 +419,12 @@ static void *alr_print_thread_fraction_100(void *arg)
                }
        }
 
+       /* send keepalive */
+       alre_print_keepalive(aa);
+
        rc = pthread_mutex_unlock(aa->file_mutex);
-       if (rc != 0) {
-               fprintf(stderr, "cannot unlock batch file: %s\n", strerror(rc));
-               exit(1);
-       }
+       if (rc != 0)
+               FATAL("cannot unlock batch file: %s\n", strerror(rc));
 
        fflush(aa->file);
 
index 7a4771c..a8d5c39 100644 (file)
@@ -4,9 +4,41 @@
 #include <sys/types.h>
 #include <linux/types.h>
 #include <libcfs/util/list.h>
+#include <errno.h>
+
+extern FILE *debug_file;
+extern FILE *trace_file;
+
+#define DEBUG(fmt, args...)                                            \
+       do {                                                            \
+               if (debug_file != NULL)                                 \
+                       fprintf(debug_file, "DEBUG %s:%d: "fmt,         \
+                               __func__, __LINE__, ##args);            \
+       } while (0)
+
+#define TRACE(fmt, args...)                                            \
+       do {                                                            \
+               if (trace_file != NULL)                                 \
+                       fprintf(trace_file, "TRACE "fmt, ##args);       \
+       } while (0)
+
+#define DEBUG_D(x) DEBUG("%s = %"PRIdMAX"\n", #x, (intmax_t)x)
+#define DEBUG_P(x) DEBUG("%s = %p\n", #x, x)
+#define DEBUG_S(x) DEBUG("%s = '%s'\n", #x, x)
+#define DEBUG_U(x) DEBUG("%s = %"PRIuMAX"\n", #x, (uintmax_t)x)
+
+#define ERROR(fmt, args...) \
+       fprintf(stderr, "%s: "fmt, program_invocation_short_name, ##args)
+
+#define FATAL(fmt, args...)                    \
+       do {                                    \
+               ERROR("FATAL: "fmt, ##args);    \
+               exit(EXIT_FAILURE);             \
+       } while (0)
 
 struct lu_fid;
 struct alr_batch;
+extern unsigned long keepalive_interval;
 
 struct alr_batch *alr_batch_create(unsigned int shift);
 void alr_batch_destroy(struct alr_batch *alrb);
index 277cbcb..b91051e 100644 (file)
 
 /* TODO fsname filter */
 
-static FILE *debug_file;
-static FILE *trace_file;
-
-#define DEBUG(fmt, args...)                                            \
-       do {                                                            \
-               if (debug_file != NULL)                                 \
-                       fprintf(debug_file, "DEBUG %s:%d: "fmt, __func__, __LINE__, ##args); \
-       } while (0)
-
-#define TRACE(fmt, args...)                                            \
-       do {                                                            \
-               if (trace_file != NULL)                                 \
-                       fprintf(trace_file, "TRACE "fmt, ##args);       \
-       } while (0)
-
-#define DEBUG_D(x) DEBUG("%s = %"PRIdMAX"\n", #x, (intmax_t)x)
-#define DEBUG_P(x) DEBUG("%s = %p\n", #x, x)
-#define DEBUG_S(x) DEBUG("%s = '%s'\n", #x, x)
-#define DEBUG_U(x) DEBUG("%s = %"PRIuMAX"\n", #x, (uintmax_t)x)
-
-#define ERROR(fmt, args...) \
-       fprintf(stderr, "%s: "fmt, program_invocation_short_name, ##args)
-
-#define FATAL(fmt, args...)                    \
-       do {                                    \
-               ERROR("FATAL: "fmt, ##args);    \
-               exit(EXIT_FAILURE);             \
-       } while (0)
+FILE *debug_file;
+FILE *trace_file;
 
 enum {
        ALR_EXIT_SUCCESS = INT_MIN + EXIT_SUCCESS,
@@ -144,6 +118,8 @@ static int alr_print_fraction = 100;
 #define P_ALR_LOG(al) \
        P_ALR_DEV(&(al)->alr_dev), major((al)->alr_rdev), minor((al)->alr_rdev)
 
+unsigned long keepalive_interval;
+
 static void alr_dev_free(int epoll_fd, struct alr_dev *ad)
 {
        if (ad == NULL)
@@ -694,9 +670,10 @@ static void usage(void)
 "  -e, --exit-on-close            exit on close of all log devices\n"
 "  -I, --mdt-index-filter=INDEX   set log MDT index filter to INDEX\n"
 "  -h, --help                     display this help and exit\n"
+"  --keepalive=INTERVAL           print keepalive message every INTERVAL seconds\n"
 "  -l, --list                     print YAML list of available access logs\n"
 "  -d, --debug[=FILE]             print debug messages to FILE (stderr)\n"
-"  -s, --stats=FILE              print stats messages to FILE (stderr)\n"
+"  -s, --stats=FILE               print stats messages to FILE (stderr)\n"
 "  -t, --trace[=FILE]             print trace messages to FILE (stderr)\n",
                program_invocation_short_name);
 }
@@ -724,6 +701,7 @@ int main(int argc, char *argv[])
                { .name = "batch-interval", .has_arg = required_argument, .val = 'i', },
                { .name = "batch-offset", .has_arg = required_argument, .val = 'o', },
                { .name = "exit-on-close", .has_arg = no_argument, .val = 'e', },
+               { .name = "keepalive", .has_arg = required_argument, .val = 'k' },
                { .name = "mdt-index-filter", .has_arg = required_argument, .val = 'I' },
                { .name = "debug", .has_arg = optional_argument, .val = 'd', },
                { .name = "help", .has_arg = no_argument, .val = 'h', },
@@ -748,6 +726,13 @@ int main(int argc, char *argv[])
                            errno != 0)
                                FATAL("invalid batch interval '%s'\n", optarg);
                        break;
+               case 'k':
+                       errno = 0;
+                       keepalive_interval = strtoll(optarg, NULL, 0);
+                       if (keepalive_interval > 1048576 || errno != 0)
+                               FATAL("invalid keepalive message interval '%s'\n",
+                                     optarg);
+                       break;
                case 'o':
                        errno = 0;
                        batch_offset = strtoll(optarg, NULL, 0);