From 028bee14d2c6d8feb5eb418302f8751643e731c6 Mon Sep 17 00:00:00 2001 From: Alexandre Ioffe Date: Tue, 29 Mar 2022 00:48:35 -0700 Subject: [PATCH] EX-4141 lipe: lamigo should detect dead OST and restart ALR Use #keepalive message and ssh read with timeout to detect OST is down and restart ALR. Add stats for ALR last seen message Duplicate ofd_access_log_reader from lustre/utils into lipe/src/es_ofd_access_log_reader Use common lamigo_hash.h for lamigo and es_ofd_access_log_reader Signed-off-by: Alexandre Ioffe Test-Parameters: trivial testlist=hot-pools Change-Id: I26dc631a8663046821e049fc6e091108b2a62f87 Reviewed-on: https://review.whamcloud.com/46944 Tested-by: jenkins Tested-by: Maloo Reviewed-by: John Hammond Reviewed-by: Alex Zhuravlev --- lipe/.gitignore | 2 + lipe/lipe.spec.in | 2 + lipe/src/Makefile.am | 11 +- lipe/src/es_ofd_access_batch.c | 552 ++++++++++++++++++++ lipe/src/es_ofd_access_batch.h | 18 + lipe/src/es_ofd_access_log_reader.c | 998 ++++++++++++++++++++++++++++++++++++ lipe/src/general_policy.c | 2 +- lipe/src/lamigo.c | 11 + lipe/src/lamigo.h | 2 + lipe/src/lamigo_alr.c | 93 +++- lipe/src/lamigo_hash.c | 93 +--- lipe/src/lamigo_hash.h | 91 +++- lipe/src/list.h | 11 +- lipe/src/lstddef.h | 293 +++++++++++ 14 files changed, 2055 insertions(+), 124 deletions(-) create mode 100644 lipe/src/es_ofd_access_batch.c create mode 100644 lipe/src/es_ofd_access_batch.h create mode 100644 lipe/src/es_ofd_access_log_reader.c create mode 100644 lipe/src/lstddef.h diff --git a/lipe/.gitignore b/lipe/.gitignore index 0fb68d0..5ef4000 100644 --- a/lipe/.gitignore +++ b/lipe/.gitignore @@ -9,8 +9,10 @@ /lipe-*.tar.gz /pylipe/lipe_constant.py /src/*.o +/src/es_ofd_access_log_reader /src/ext4_inode2path /src/generate_definition +/src/lamigo /src/laudit /src/laudit-report /src/ldumpstripe diff --git a/lipe/lipe.spec.in b/lipe/lipe.spec.in index b27b566..517c09d 100644 --- a/lipe/lipe.spec.in +++ b/lipe/lipe.spec.in @@ -243,6 +243,7 @@ cp src/laudit \ %if %{with hotpool} cp src/lamigo \ src/lpurge \ + src/es_ofd_access_log_reader \ $RPM_BUILD_ROOT%{_sbindir} mkdir -p $RPM_BUILD_ROOT%{ddntoolsdir}/ install -m 0755 scripts/*.sh $RPM_BUILD_ROOT%{ddntoolsdir}/ @@ -341,6 +342,7 @@ rm -rf $RPM_BUILD_ROOT %{_bindir}/ldumpstripe %config(noreplace) %{_sysconfdir}/lipe.conf %if %{with hotpool} +%{_sbindir}/es_ofd_access_log_reader %{_sbindir}/lamigo %{_sbindir}/lpurge %config(noreplace) %{_sysconfdir}/lamigo.conf.example diff --git a/lipe/src/Makefile.am b/lipe/src/Makefile.am index 2be68ed..7257800 100644 --- a/lipe/src/Makefile.am +++ b/lipe/src/Makefile.am @@ -32,7 +32,7 @@ bin_PROGRAMS += laudit laudit-report endif if BUILD_HOTPOOL_UTILS -bin_PROGRAMS += lamigo lpurge +bin_PROGRAMS += es_ofd_access_log_reader lamigo lpurge endif endif @@ -111,6 +111,15 @@ lipe_expression_test_SOURCES = lipe_expression_test.c $(LIPE_SOURCES) lipe_expression_test_CFLAGS = $(LIPE_CFLAGS) lipe_expression_test_LDADD = $(LIPE_LDADD) +es_ofd_access_log_reader_SOURCES = \ + lamigo_hash.h \ + es_ofd_access_batch.c \ + es_ofd_access_batch.h \ + es_ofd_access_log_reader.c \ + lstddef.h + +es_ofd_access_log_reader_LDADD := -lpthread + endif generate_definition_SOURCES = generate_definition.c debug.c debug.h \ diff --git a/lipe/src/es_ofd_access_batch.c b/lipe/src/es_ofd_access_batch.c new file mode 100644 index 0000000..7f1dcb9 --- /dev/null +++ b/lipe/src/es_ofd_access_batch.c @@ -0,0 +1,552 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + * + * Copyright 2020, DataDirect Networks Storage. + * + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: John L. Hammond + * + * lustre/utils/ofd_access_batch.c + * + * Access log entry batching for ofd_access_log_reader. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "es_ofd_access_batch.h" +#include "lamigo_hash.h" +#include "lstddef.h" +#include "list.h" + +struct fid_hash_node { + struct lipe_list_head fhn_node; + struct lu_fid fhn_fid; +}; + +static inline bool fid_eq(const struct lu_fid *f1, const struct lu_fid *f2) +{ + return f1->f_seq == f2->f_seq && f1->f_oid == f2->f_oid && + f1->f_ver == f2->f_ver; +} + +static unsigned long fid_hash(const struct lu_fid *f, unsigned int shift) +{ +#if __BITS_PER_LONG == 32 + return hash_long(fid_flatten32(f), shift); +#elif __BITS_PER_LONG == 64 + return hash_long(fid_flatten(f), shift); +#else +# error "Wordsize not 32 or 64" +#endif +} + +static void fhn_init(struct fid_hash_node *fhn, const struct lu_fid *fid) +{ + LIPE_INIT_LIST_HEAD(&fhn->fhn_node); + fhn->fhn_fid = *fid; +} + +static bool fhn_is_hashed(struct fid_hash_node *fhn) +{ + return !lipe_list_empty(&fhn->fhn_node); +} + +static void fhn_del_init(struct fid_hash_node *fhn) +{ + if (fhn_is_hashed(fhn)) + lipe_list_del_init(&fhn->fhn_node); +} + +static inline void fhn_replace_init(struct fid_hash_node *old_fhn, + struct fid_hash_node *new_fhn) +{ + lipe_list_add(&new_fhn->fhn_node, &old_fhn->fhn_node); + lipe_list_del_init(&old_fhn->fhn_node); +} + +static void +es_ofd_fid_hash_add(struct lipe_list_head *head, unsigned int shift, + struct fid_hash_node *fhn) +{ + assert(!fhn_is_hashed(fhn)); + + lipe_list_add(&fhn->fhn_node, &head[fid_hash(&fhn->fhn_fid, shift)]); +} + +#if 0 /* Not used */ +static struct fid_hash_node * +es_ofd_fid_hash_find(struct lipe_list_head *head, unsigned int shift, const struct lu_fid *fid) +{ + struct lipe_list_head *hash_list; + struct fid_hash_node *fhn, *next; + + hash_list = &head[fid_hash(fid, shift)]; + lipe_list_for_each_entry_safe(fhn, next, hash_list, fhn_node) { + assert(fhn_is_hashed(fhn)); + + if (fid_eq(fid, &fhn->fhn_fid)) + return fhn; + } + + return NULL; +} +#endif + +static struct fid_hash_node * +es_ofd_fid_hash_insert(struct lipe_list_head *head, unsigned int shift, + struct fid_hash_node *new_fhn) +{ + struct lipe_list_head *list; + struct fid_hash_node *old_fhn, *next; + + list = &head[fid_hash(&new_fhn->fhn_fid, shift)]; + lipe_list_for_each_entry_safe(old_fhn, next, list, fhn_node) { + assert(fhn_is_hashed(old_fhn)); + + if (fid_eq(&old_fhn->fhn_fid, &new_fhn->fhn_fid)) + return old_fhn; + } + + lipe_list_add(&new_fhn->fhn_node, list); + + return new_fhn; +} + +static int +es_ofd_fid_hash_init(struct lipe_list_head **phead, 
unsigned int *pshift, + unsigned int shift) +{ + struct lipe_list_head *new_head; + unsigned int i; + + new_head = malloc(sizeof(*new_head) << shift); + if (new_head == NULL) + return -1; + + for (i = 0; i < (1 << shift); i++) + LIPE_INIT_LIST_HEAD(&new_head[i]); + + *phead = new_head; + *pshift = shift; + + return 0; +} + +static int +es_ofd_fid_hash_resize(struct lipe_list_head **phead, unsigned int *pshift, + unsigned int new_shift) +{ + struct lipe_list_head *new_head; + unsigned int i; + int rc; + + if (*pshift == new_shift) + return 0; + + rc = es_ofd_fid_hash_init(&new_head, &new_shift, new_shift); + if (rc < 0) + return rc; + + for (i = 0; i < (1 << *pshift); i++) { + struct lipe_list_head *list = &(*phead)[i]; + struct fid_hash_node *fhn, *next; + + lipe_list_for_each_entry_safe(fhn, next, list, fhn_node) { + fhn_del_init(fhn); + es_ofd_fid_hash_add(new_head, new_shift, fhn); + } + } + + free(*phead); + *phead = new_head; + *pshift = new_shift; + + return 0; +} + +enum { + ALR_READ = 0, + ALR_WRITE = 1, +}; + +/* Entry in the batching hash. */ +struct alr_entry { + struct fid_hash_node alre_fid_hash_node; + time_t alre_time[2]; /* Not strictly needed. */ + __u64 alre_begin[2]; + __u64 alre_end[2]; + __u64 alre_size[2]; + __u64 alre_segment_count[2]; + __u64 alre_count[2]; + char alre_obd_name[]; +}; + +enum { + ALR_BATCH_HASH_SHIFT_DEFAULT = 10, + ALR_BATCH_HASH_SHIFT_MAX = 30, +}; + +struct alr_batch { + struct lipe_list_head *alrb_hash; + unsigned int alrb_hash_shift; + unsigned int alrb_count; +}; + +static void alre_del_init(struct alr_entry *alre) +{ + fhn_del_init(&alre->alre_fid_hash_node); +} + +static void alre_update(struct alr_entry *alre, time_t time, __u64 begin, + __u64 end, __u32 size, __u32 segment_count, __u32 flags) +{ + unsigned int d = (flags & OFD_ACCESS_READ) ? ALR_READ : ALR_WRITE; + + alre->alre_time[d] = max_t(time_t, alre->alre_time[d], time); + alre->alre_begin[d] = min_t(__u64, alre->alre_begin[d], begin); + alre->alre_end[d] = max_t(__u64, alre->alre_end[d], end); + alre->alre_size[d] += size; + alre->alre_segment_count[d] += segment_count; + alre->alre_count[d] += 1; +} + +int alr_batch_add(struct alr_batch *alrb, const char *obd_name, + const struct lu_fid *pfid, time_t time, __u64 begin, __u64 end, + __u32 size, __u32 segment_count, __u32 flags) +{ + struct fid_hash_node fhn, *p; + struct alr_entry *alre; + int rc; + + if (alrb == NULL) + return 0; + + assert(sizeof(time_t) == sizeof(__u64)); + + fhn_init(&fhn, pfid); + + /* Find old or insert sentinel (fhn). Replace sentinel if returned. 
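+	 * The sentinel lives on the stack, so nothing is allocated when the
+	 * FID is already batched; only when the insert returns the sentinel
+	 * itself do we allocate a real alr_entry (with obd_name appended)
+	 * and swap it in via fhn_replace_init().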
*/ + p = es_ofd_fid_hash_insert(alrb->alrb_hash, alrb->alrb_hash_shift, &fhn); + if (p == &fhn) { + size_t alre_size = sizeof(*alre) + strlen(obd_name) + 1; + + alre = calloc(1, alre_size); + if (alre == NULL) { + rc = -1; + goto out; + } + + fhn_init(&alre->alre_fid_hash_node, pfid); + strcpy(alre->alre_obd_name, obd_name); + fhn_replace_init(&fhn, &alre->alre_fid_hash_node); + alrb->alrb_count++; + } else { + alre = container_of(p, struct alr_entry, alre_fid_hash_node); + } + + alre_update(alre, time, begin, end, size, segment_count, flags); + rc = 0; +out: + fhn_del_init(&fhn); + + return rc; +} + +int sort_compare(const void *a1, const void *a2) +{ + int l = *(const int *)a1; + int r = *(const int *)a2; + if (l > r) + return -1; + if (l < r) + return 1; + return 0; +} + +static void alre_printf(FILE *f, struct alr_entry *alre, int d) +{ + fprintf(f, "o=%s f="DFID" t=%lld b=%llu e=%llu s=%llu g=%llu n=%llu d=%c\n", + alre->alre_obd_name, + PFID(&alre->alre_fid_hash_node.fhn_fid), + (long long)alre->alre_time[d], + (unsigned long long)alre->alre_begin[d], + (unsigned long long)alre->alre_end[d], + (unsigned long long)alre->alre_size[d], + (unsigned long long)alre->alre_segment_count[d], + (unsigned long long)alre->alre_count[d], + (d == ALR_READ) ? 'r' : 'w'); +} + +struct alr_thread_arg { + struct lipe_list_head list; + int fraction; + FILE *file; + pthread_mutex_t *file_mutex; +}; + +static void alre_print_keepalive(struct alr_thread_arg *aa) +{ + int rc = pthread_mutex_lock(aa->file_mutex); + + if (rc != 0) { + fprintf(stderr, "cannot lock batch file: %s\n", + strerror(rc)); + exit(1); + } + + fprintf(aa->file, "#!keepalive\n"); + + rc = pthread_mutex_unlock(aa->file_mutex); + if (rc != 0) { + fprintf(stderr, "cannot unlock batch file: %s\n", + strerror(rc)); + exit(1); + } +} + +void *alr_sort_and_print_thread(void *arg) +{ + struct alr_entry *alre, *next; + struct alr_thread_arg *aa = arg; + struct lipe_list_head *tmp = &aa->list; + int *sa = NULL; + int rc, d, i, nr = 0; + unsigned long cut; + bool printed = false; + + lipe_list_for_each_entry(alre, tmp, alre_fid_hash_node.fhn_node) { + if (alre->alre_count[0] > 0) + nr++; + if (alre->alre_count[1] > 0) + nr++; + } + + if (nr == 0) + goto out; + + sa = calloc(nr, sizeof(*sa)); + if (!sa) { + fprintf(stderr, "cannot allocate memory for sorting\n"); + exit(1); + } + + i = 0; + lipe_list_for_each_entry(alre, tmp, alre_fid_hash_node.fhn_node) { + if (alre->alre_count[0] > 0) + sa[i++] = alre->alre_count[0]; + if (alre->alre_count[1] > 0) + sa[i++] = alre->alre_count[1]; + } + + qsort(sa, nr, sizeof(*sa), sort_compare); + i = nr * aa->fraction / 100; + cut = sa[i]; + if (cut < 1) + cut = 1; + free(sa); + + /* Prevent jumbled output from multiple concurrent sort and + * print threads. */ + rc = pthread_mutex_lock(aa->file_mutex); + if (rc != 0) { + fprintf(stderr, "cannot lock batch file: %s\n", + strerror(rc)); + exit(1); + } + + /* there might be lots of items at @cut, but we want to limit total + * output. so the first loop dumps all items > @cut and the second + * loop dumps items=@cut so that total number (@i) is not exceeeded. 
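+	 * An illustrative example (made-up numbers): with counts
+	 * {9, 7, 7, 7, 3} and fraction=40, i = 5 * 40 / 100 = 2 and
+	 * cut = sa[2] = 7 (sa[] is sorted in descending order), so the first
+	 * pass prints only the count-9 entry and the second pass prints at
+	 * most one count-7 entry before @i drops to zero.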
+ * XXX: possible optimization - move items=@cut to another list, so + * that 2nd pass takes < O(n) */ + lipe_list_for_each_entry(alre, tmp, alre_fid_hash_node.fhn_node) { + for (d = 0; d < 2; d++) { + if (alre->alre_count[d] <= cut) + continue; + alre_printf(aa->file, alre, d); + printed = true; + i--; + } + } + + lipe_list_for_each_entry(alre, tmp, alre_fid_hash_node.fhn_node) { + for (d = 0; d < 2 && i > 0; d++) { + if (alre->alre_count[d] != cut) + continue; + alre_printf(aa->file, alre, d); + printed = true; + i--; + } + } + + rc = pthread_mutex_unlock(aa->file_mutex); + if (rc != 0) { + fprintf(stderr, "cannot unlock batch file: %s\n", + strerror(rc)); + exit(1); + } + +out: + /* Nothing else printed - send keepalive */ + if (!printed) + alre_print_keepalive(aa); + + fflush(aa->file); + + lipe_list_for_each_entry_safe(alre, next, tmp, alre_fid_hash_node.fhn_node) { + alre_del_init(alre); + free(alre); + } + + free(aa); + + return NULL; +} + +/* Print, clear, and resize the batch. */ +int alr_batch_print(struct alr_batch *alrb, FILE *file, + pthread_mutex_t *file_mutex, int fraction) +{ + unsigned int new_hash_shift; + pthread_attr_t attr, *pattr = NULL; + struct alr_thread_arg *aa = NULL; + pthread_t pid; + int i, rc; + + if (alrb == NULL) + return 0; + + aa = calloc(1, sizeof(*aa)); + if (aa == NULL) + return -ENOMEM; + + /* move all collected items to the temp list */ + LIPE_INIT_LIST_HEAD(&aa->list); + for (i = 0; i < (1 << alrb->alrb_hash_shift); i++) { + if (lipe_list_empty(&alrb->alrb_hash[i])) + continue; + lipe_list_splice(&alrb->alrb_hash[i], &aa->list); + LIPE_INIT_LIST_HEAD(&alrb->alrb_hash[i]); + } + aa->file = file; + aa->file_mutex = file_mutex; + aa->fraction = fraction; + + rc = pthread_attr_init(&attr); + if (rc != 0) + goto out; + + pattr = &attr; + + rc = pthread_attr_setdetachstate(pattr, PTHREAD_CREATE_DETACHED); + if (rc != 0) + goto out; + + /* as sorting may take time and we don't want to lose access + * records we better do sorting and printing in a different thread */ + rc = pthread_create(&pid, pattr, &alr_sort_and_print_thread, aa); + if (rc != 0) + goto out; + + aa = NULL; /* Sort and print thread owns it now. */ +out: + /* Resize hash based on previous count. */ + new_hash_shift = alrb->alrb_hash_shift; + + while (new_hash_shift < ALR_BATCH_HASH_SHIFT_MAX && + (1 << new_hash_shift) < alrb->alrb_count) + new_hash_shift++; + + es_ofd_fid_hash_resize(&alrb->alrb_hash, &alrb->alrb_hash_shift, + new_hash_shift); + + alrb->alrb_count = 0; + + if (pattr != NULL) + pthread_attr_destroy(pattr); + + if (aa != NULL) { + struct alr_entry *alre, *next; + + lipe_list_for_each_entry_safe(alre, next, &aa->list, + alre_fid_hash_node.fhn_node) { + alre_del_init(alre); + free(alre); + } + } + + free(aa); + + if (rc > 0) + rc = -rc; /* Fixup pthread return conventions. 
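+	 * (pthread_*() calls return positive errno values on failure,
+	 * while callers of alr_batch_print() expect 0 or -errno.)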
*/ + + return rc; +} + +struct alr_batch *alr_batch_create(unsigned int shift) +{ + struct alr_batch *alrb; + int rc; + + if (shift == -1U) + shift = ALR_BATCH_HASH_SHIFT_DEFAULT; + + alrb = calloc(1, sizeof(*alrb)); + if (alrb == NULL) + return NULL; + + rc = es_ofd_fid_hash_init(&alrb->alrb_hash, &alrb->alrb_hash_shift, shift); + if (rc < 0) { + free(alrb); + return NULL; + } + + return alrb; +} + +void alr_batch_destroy(struct alr_batch *alrb) +{ + unsigned int i; + + if (alrb == NULL) + return; + + for (i = 0; i < (1 << alrb->alrb_hash_shift); i++) { + struct lipe_list_head *list = &alrb->alrb_hash[i]; + struct alr_entry *alre, *next; + + lipe_list_for_each_entry_safe(alre, next, list, alre_fid_hash_node.fhn_node) { + alre_del_init(alre); + free(alre); + } + } + + free(alrb->alrb_hash); + free(alrb); +} diff --git a/lipe/src/es_ofd_access_batch.h b/lipe/src/es_ofd_access_batch.h new file mode 100644 index 0000000..446719a --- /dev/null +++ b/lipe/src/es_ofd_access_batch.h @@ -0,0 +1,18 @@ +#ifndef _ES_OFD_ACCESS_BATCH_H_ +#define _OFD_ACCESS_BATCH_H_ +#include +#include +#include + +struct lu_fid; +struct alr_batch; + +struct alr_batch *alr_batch_create(unsigned int shift); +void alr_batch_destroy(struct alr_batch *alrb); +int alr_batch_add(struct alr_batch *alrb, const char *obd_name, + const struct lu_fid *pfid, time_t time, __u64 begin, __u64 end, + __u32 size, __u32 segment_count, __u32 flags); +int alr_batch_print(struct alr_batch *alrb, FILE *file, + pthread_mutex_t *file_mutex, int fraction); + +#endif /* _ES_OFD_ACCESS_BATCH_H_ */ diff --git a/lipe/src/es_ofd_access_log_reader.c b/lipe/src/es_ofd_access_log_reader.c new file mode 100644 index 0000000..6173cb7 --- /dev/null +++ b/lipe/src/es_ofd_access_log_reader.c @@ -0,0 +1,998 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.html + * + * GPL HEADER END + * + * Copyright 2020, DataDirect Networks Storage. + * + * This file is part of Lustre, http://www.lustre.org/ + * + * Author: John L. Hammond + * + * lustre/utils/ofd_access_log_reader.c + * + * Sample utility to discover and read Lustre (ofd) access logs. + * + * This demonstrates the discovery and reading of Lustre access logs + * (see linux/lustre/lustre_access_log.h and + * lustre/ofd/ofd_access_log.c.). By default it opens the control + * device, discovers and opens all access log devices, and consumes + * all access log entries. If invoked with the --list option then it + * prints information about all available devices to stdout and exits. + * + * Structured trace points (when --trace is used) are added to permit + * testing of the access log functionality (see test_165* in + * lustre/tests/sanity.sh). 
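+ *
+ * This copy (es_ofd_access_log_reader) is largely a duplicate of the
+ * lustre/utils reader; the main functional difference is that
+ * alr_batch_print() emits a "#!keepalive" line whenever a batch
+ * interval produces no records, so that lamigo can tell an idle OST
+ * apart from a dead connection and restart the reader.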
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "es_ofd_access_batch.h" +#include "lstddef.h" + +/* TODO fsname filter */ + +static FILE *debug_file; +static FILE *trace_file; + +#define DEBUG(fmt, args...) \ + do { \ + if (debug_file != NULL) \ + fprintf(debug_file, "DEBUG %s:%d: "fmt, __func__, __LINE__, ##args); \ + } while (0) + +#define TRACE(fmt, args...) \ + do { \ + if (trace_file != NULL) \ + fprintf(trace_file, "TRACE "fmt, ##args); \ + } while (0) + +#define DEBUG_D(x) DEBUG("%s = %"PRIdMAX"\n", #x, (intmax_t)x) +#define DEBUG_P(x) DEBUG("%s = %p\n", #x, x) +#define DEBUG_S(x) DEBUG("%s = '%s'\n", #x, x) +#define DEBUG_U(x) DEBUG("%s = %"PRIuMAX"\n", #x, (uintmax_t)x) + +#define ERROR(fmt, args...) \ + fprintf(stderr, "%s: "fmt, program_invocation_short_name, ##args) + +#define FATAL(fmt, args...) \ + do { \ + ERROR("FATAL: "fmt, ##args); \ + exit(EXIT_FAILURE); \ + } while (0) + +#define container_of(ptr, type, member) ({ \ + const typeof(((type *) 0)->member) * __mptr = (ptr); \ + (type *) ((char *) __mptr - offsetof(type, member)); }) + +enum { + ALR_EXIT_SUCCESS = INT_MIN + EXIT_SUCCESS, + ALR_EXIT_FAILURE = INT_MIN + EXIT_FAILURE, + ALR_ERROR = -1, + ALR_EOF = 0, + ALR_OK = 1, +}; + +struct alr_dev { + char *alr_name; + int (*alr_io)(int /* epoll_fd */, struct alr_dev * /* this */, unsigned int /* mask */); + void (*alr_destroy)(struct alr_dev *); + int alr_fd; +}; + +struct alr_log { + struct alr_dev alr_dev; + char *alr_buf; + size_t alr_buf_size; + size_t alr_entry_size; + size_t alr_read_count; + dev_t alr_rdev; +}; + +static unsigned int alr_log_count; +static struct alr_log *alr_log[1 << 20]; /* 20 == MINORBITS */ +static int oal_version; /* FIXME ... major version, minor version */ +static __u32 alr_filter = 0xffffffff; /* no filter by default */ +static unsigned int oal_log_major; +static unsigned int oal_log_minor_max; +static struct alr_batch *alr_batch; +static FILE *alr_batch_file; +static pthread_mutex_t alr_batch_file_mutex = PTHREAD_MUTEX_INITIALIZER; +static const char *alr_batch_file_path; +static const char *alr_stats_file_path; +static int alr_print_fraction = 100; + +#define D_ALR_DEV "%s %d" +#define P_ALR_DEV(ad) \ + (ad)->alr_name, (ad)->alr_fd + +#define D_ALR_LOG D_ALR_DEV" %u:%u" +#define P_ALR_LOG(al) \ + P_ALR_DEV(&(al)->alr_dev), major((al)->alr_rdev), minor((al)->alr_rdev) + +static void alr_dev_free(int epoll_fd, struct alr_dev *ad) +{ + TRACE("alr_dev_free %s\n", ad->alr_name); + + if (!(ad->alr_fd < 0)) + epoll_ctl(epoll_fd, EPOLL_CTL_DEL, ad->alr_fd, NULL); + + if (ad->alr_destroy != NULL) + (*ad->alr_destroy)(ad); + + if (!(ad->alr_fd < 0)) + close(ad->alr_fd); + + free(ad->alr_name); + free(ad); +} + +static struct alr_log **alr_log_lookup(dev_t rdev) +{ + assert(major(rdev) == oal_log_major); + + if (!(minor(rdev) < ARRAY_SIZE(alr_log))) + return NULL; + + return &alr_log[minor(rdev)]; +} + +static const char *alr_flags_to_str(unsigned int flags) +{ + switch (flags & (OFD_ACCESS_READ | OFD_ACCESS_WRITE)) { + default: + return "0"; + case OFD_ACCESS_READ: + return "r"; + case OFD_ACCESS_WRITE: + return "w"; + case OFD_ACCESS_READ | OFD_ACCESS_WRITE: + return "rw"; + } +} + +/* /dev/lustre-access-log/scratch-OST0000 device poll callback: read entries + * from log and print. 
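+ * Each entry is traced (when --trace is set) and handed to
+ * alr_batch_add(); the batched output is written out later by the
+ * batch timer via alr_batch_print().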
+ */ +static int alr_log_io(int epoll_fd, struct alr_dev *ad, unsigned int mask) +{ + struct alr_log *al = container_of(ad, struct alr_log, alr_dev); + ssize_t i, count; + + TRACE("alr_log_io %s\n", ad->alr_name); + DEBUG_U(mask); + + assert(al->alr_entry_size != 0); + assert(al->alr_buf_size != 0); + assert(al->alr_buf != NULL); + + count = read(ad->alr_fd, al->alr_buf, al->alr_buf_size); + if (count < 0) { + ERROR("cannot read events from '%s': %s\n", ad->alr_name, strerror(errno)); + return ALR_ERROR; + } + + if (count == 0) { + TRACE("alr_log_eof %s\n", ad->alr_name); + return ALR_EOF; + } + + if (count % al->alr_entry_size != 0) { + ERROR("invalid read from "D_ALR_LOG": entry_size = %zu, count = %zd\n", + P_ALR_LOG(al), al->alr_entry_size, count); + return ALR_ERROR; + } + + DEBUG("read "D_ALR_LOG", count = %zd\n", P_ALR_LOG(al), count); + + al->alr_read_count += count / al->alr_entry_size; + + for (i = 0; i < count; i += al->alr_entry_size) { + struct ofd_access_entry_v1 *oae = + (struct ofd_access_entry_v1 *)&al->alr_buf[i]; + + TRACE("alr_log_entry %s "DFID" %lu %lu %lu %u %u %s\n", + ad->alr_name, + PFID(&oae->oae_parent_fid), + (unsigned long)oae->oae_begin, + (unsigned long)oae->oae_end, + (unsigned long)oae->oae_time, + (unsigned int)oae->oae_size, + (unsigned int)oae->oae_segment_count, + alr_flags_to_str(oae->oae_flags)); + + alr_batch_add(alr_batch, ad->alr_name, &oae->oae_parent_fid, + oae->oae_time, oae->oae_begin, oae->oae_end, + oae->oae_size, oae->oae_segment_count, oae->oae_flags); + } + + return ALR_OK; +} + +static void alr_log_destroy(struct alr_dev *ad) +{ + struct alr_log *al = container_of(ad, struct alr_log, alr_dev); + struct alr_log **pal; + + TRACE("alr_log_free %s\n", ad->alr_name); + assert(major(al->alr_rdev) == oal_log_major); + + pal = alr_log_lookup(al->alr_rdev); + if (pal != NULL && *pal == al) + *pal = NULL; + + free(al->alr_buf); + al->alr_buf = NULL; + al->alr_buf_size = 0; + alr_log_count--; +} + +/* Add an access log (identified by path) to the epoll set. */ +static int alr_log_add(int epoll_fd, const char *path) +{ + struct alr_log **pal, *al = NULL; + struct stat st; + int fd = -1; + int rc; + + DEBUG_S(path); + + fd = open(path, O_RDONLY|O_NONBLOCK|O_CLOEXEC); + if (fd < 0) { + ERROR("cannot open device '%s': %s\n", path, strerror(errno)); + rc = (errno == ENOENT ? 0 : -1); /* Possible race. */ + goto out; + } + + /* Revalidate rdev in case of race. */ + rc = fstat(fd, &st); + if (rc < 0) { + ERROR("cannot stat '%s': %s\n", path, strerror(errno)); + goto out; + } + + if (major(st.st_rdev) != oal_log_major) + goto out; + + pal = alr_log_lookup(st.st_rdev); + if (pal == NULL) { + ERROR("no device slot available for '%s' with minor %u\n", + path, minor(st.st_rdev)); + goto out; + } + + if (*pal != NULL) + goto out; /* We already have this device. 
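+			 * (an earlier scan already registered this minor;
+			 * nothing to do)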
*/ + + struct lustre_access_log_info_v1 lali; + + memset(&lali, 0, sizeof(lali)); + + rc = ioctl(fd, LUSTRE_ACCESS_LOG_IOCTL_INFO, &lali); + if (rc < 0) { + ERROR("cannot get info for device '%s': %s\n", + path, strerror(errno)); + goto out; + } + + if (lali.lali_type != LUSTRE_ACCESS_LOG_TYPE_OFD) { + rc = 0; + goto out; + } + rc = ioctl(fd, LUSTRE_ACCESS_LOG_IOCTL_FILTER, alr_filter); + if (rc < 0) { + ERROR("cannot set filter '%s': %s\n", + path, strerror(errno)); + goto out; + } + + al = calloc(1, sizeof(*al)); + if (al == NULL) + FATAL("cannot allocate struct alr_dev of size %zu: %s\n", + sizeof(*al), strerror(errno)); + + alr_log_count++; + al->alr_dev.alr_io = &alr_log_io; + al->alr_dev.alr_destroy = &alr_log_destroy; + al->alr_dev.alr_fd = fd; + fd = -1; + + al->alr_rdev = st.st_rdev; + + al->alr_dev.alr_name = strdup(lali.lali_name); + if (al->alr_dev.alr_name == NULL) + FATAL("cannot copy name of size %zu: %s\n", + strlen(lali.lali_name), strerror(errno)); + + al->alr_buf_size = lali.lali_log_size; + al->alr_entry_size = lali.lali_entry_size; + + if (al->alr_entry_size == 0) { + ERROR("device '%s' has zero entry size\n", path); + rc = -1; + goto out; + } + + if (al->alr_buf_size == 0) + al->alr_buf_size = 1048576; + + al->alr_buf_size = roundup(al->alr_buf_size, al->alr_entry_size); + + al->alr_buf = malloc(al->alr_buf_size); + if (al->alr_buf == NULL) + FATAL("cannot allocate log buffer for '%s' of size %zu: %s\n", + path, al->alr_buf_size, strerror(errno)); + + struct epoll_event ev = { + .events = EPOLLIN | EPOLLHUP, + .data.ptr = &al->alr_dev, + }; + + rc = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, al->alr_dev.alr_fd, &ev); + if (rc < 0) { + ERROR("cannot add device '%s' to epoll set: %s\n", + path, strerror(errno)); + goto out; + } + + TRACE("alr_log_add %s\n", al->alr_dev.alr_name); + + if (oal_log_minor_max < minor(al->alr_rdev)) + oal_log_minor_max = minor(al->alr_rdev); + + assert(*pal == NULL); + *pal = al; + al = NULL; + rc = 0; +out: + if (al != NULL) + alr_dev_free(epoll_fd, &al->alr_dev); + + if (!(fd < 0)) + close(fd); + + return rc; +} + +/* Call LUSTRE_ACCESS_LOG_IOCTL_INFO to get access log info and print + * YAML formatted info to stdout. 
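+ * Example output (field values below are only illustrative):
+ *   - name: scratch-OST0000
+ *     version: 0x10001
+ *     type: 0x1
+ *     log_size: 1048576
+ *     entry_size: 64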
*/ +static int alr_log_info(struct alr_log *al) +{ + struct lustre_access_log_info_v1 lali; + int rc; + + rc = ioctl(al->alr_dev.alr_fd, LUSTRE_ACCESS_LOG_IOCTL_INFO, &lali); + if (rc < 0) { + ERROR("cannot get info for device '%s': %s\n", + al->alr_dev.alr_name, strerror(errno)); + return -1; + } + + printf("- name: %s\n" + " version: %#x\n" + " type: %#x\n" + " log_size: %u\n" + " entry_size: %u\n", + lali.lali_name, + lali.lali_version, + lali.lali_type, + lali.lali_log_size, + lali.lali_entry_size); + + return 0; +} + +static int alr_log_stats(FILE *file, struct alr_log *al) +{ + struct lustre_access_log_info_v1 lali; + int rc; + + rc = ioctl(al->alr_dev.alr_fd, LUSTRE_ACCESS_LOG_IOCTL_INFO, &lali); + if (rc < 0) { + ERROR("cannot get info for device '%s': %s\n", + al->alr_dev.alr_name, strerror(errno)); + return -1; + } + +#define X(m) \ + fprintf(file, "STATS %s %s %u\n", lali.lali_name, #m, lali.m) + + X(_lali_head); + X(_lali_tail); + X(_lali_entry_space); + X(_lali_entry_count); + X(_lali_drop_count); + X(_lali_is_closed); +#undef X + + fprintf(file, "STATS %s %s %zu\n", + lali.lali_name, "alr_read_count", al->alr_read_count); + + return 0; +} + +static void alr_log_stats_all(void) +{ + FILE *stats_file; + int m; + + if (alr_stats_file_path == NULL) { + stats_file = stderr; + } else if (strcmp(alr_stats_file_path, "-") == 0) { + stats_file = stdout; + } else { + stats_file = fopen(alr_stats_file_path, "a"); + if (stats_file == NULL) { + ERROR("cannot open '%s': %s\n", + alr_stats_file_path, strerror(errno)); + return; + } + } + + for (m = 0; m <= oal_log_minor_max; m++) { + if (alr_log[m] == NULL) + continue; + + alr_log_stats(stats_file, alr_log[m]); + } + + if (stats_file == stdout || stats_file == stderr) + fflush(stats_file); + else + fclose(stats_file); +} + +/* Scan /dev/lustre-access-log/ for new access log devices and add to + * epoll set. */ +static int alr_scan(int epoll_fd) +{ + const char dir_path[] = "/dev/"LUSTRE_ACCESS_LOG_DIR_NAME; + DIR *dir; + int dir_fd; + struct dirent *d; + int rc; + + dir = opendir(dir_path); + if (dir == NULL) { + ERROR("cannot open '%s' for scanning: %s\n", dir_path, strerror(errno)); + return ALR_EXIT_FAILURE; + } + + dir_fd = dirfd(dir); + + /* Scan /dev for devices with major equal to oal_log_major and add + * any new devices. */ + while ((d = readdir(dir)) != NULL) { + char path[6 + PATH_MAX]; + struct alr_log **pal; + struct stat st; + + if (d->d_type != DT_CHR) + continue; + + rc = fstatat(dir_fd, d->d_name, &st, 0); + if (rc < 0) { + ERROR("cannot stat '%s/%s' while scanning: %s\n", + dir_path, d->d_name, strerror(errno)); + continue; + } + + if (!S_ISCHR(st.st_mode)) + continue; + + if (major(st.st_rdev) != oal_log_major) + continue; + + pal = alr_log_lookup(st.st_rdev); + if (pal == NULL) { + ERROR("no device slot available for '%s/%s' with minor %u\n", + dir_path, d->d_name, minor(st.st_rdev)); + continue; + } + + if (*pal != NULL) + continue; /* We already have this device. */ + + snprintf(path, sizeof(path), "%s/%s", dir_path, d->d_name); + + alr_log_add(epoll_fd, path); + } + + closedir(dir); + + return ALR_OK; +} + +/* /dev/lustre-access-log/control device poll callback: call prescan + * ioctl and scan /dev/lustre-access-log/ for new access log + * devices. 
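+ * The control device is expected to poll readable when new access log
+ * devices are registered (e.g. when an OST is mounted), which is what
+ * triggers the rescan below.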
*/ +static int alr_ctl_io(int epoll_fd, struct alr_dev *cd, unsigned int mask) +{ + int rc; + + TRACE("%s\n", __func__); + DEBUG_U(mask); + + if (mask & EPOLLERR) + return ALR_EXIT_FAILURE; + + if (mask & EPOLLHUP) + return ALR_EXIT_SUCCESS; + + rc = ioctl(cd->alr_fd, LUSTRE_ACCESS_LOG_IOCTL_PRESCAN); + if (rc < 0) { + ERROR("cannot start scanning: %s\n", strerror(errno)); + return ALR_EXIT_FAILURE; + } + + return alr_scan(epoll_fd); +} + +/* signalfd epoll callback. Handle SIGINT and SIGTERM by breaking from + * the epoll loop and exiting normally.*/ +static int alr_signal_io(int epoll_fd, struct alr_dev *sd, unsigned int mask) +{ + struct signalfd_siginfo ssi; + ssize_t rc; + + TRACE("%s\n", __func__); + DEBUG_U(mask); + + rc = read(sd->alr_fd, &ssi, sizeof(ssi)); + if (rc <= 0) + return ALR_OK; + + DEBUG_U(ssi.ssi_signo); + switch (ssi.ssi_signo) { + case SIGINT: + case SIGTERM: + return ALR_EXIT_SUCCESS; + case SIGUSR1: + alr_log_stats_all(); + + return ALR_OK; + case SIGUSR2: + if (debug_file == NULL) + debug_file = stderr; + + if (trace_file == NULL) + trace_file = stderr; + + return ALR_OK; + default: + return ALR_OK; + } +} + +/* batching timerfd epoll callback. Print batched access entries to + * alr_batch_file. */ +static int alr_batch_timer_io(int epoll_fd, struct alr_dev *td, unsigned int mask) +{ + time_t now = time(NULL); + uint64_t expire_count; + ssize_t rc; + + TRACE("%s\n", __func__); + DEBUG_D(now); + DEBUG_U(mask); + + rc = read(td->alr_fd, &expire_count, sizeof(expire_count)); + if (rc <= 0) + return ALR_OK; + + DEBUG_U(expire_count); + + rc = alr_batch_print(alr_batch, alr_batch_file, &alr_batch_file_mutex, + alr_print_fraction); + if (rc < 0) { + ERROR("cannot write to '%s': %s\n", + alr_batch_file_path, strerror(errno)); + goto out; + } +out: + /* Failed writes will leave alr_batch_file (pipe) in a + * weird state so make that fatal. */ + return (rc < 0) ? ALR_EXIT_FAILURE : ALR_OK; +} + +/* batch file (stdout) poll callback: detect remote pipe close and exit. 
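+ * When lamigo runs this reader over ssh, stdout is the pipe back to
+ * lamigo, so EPOLLHUP here means the consumer is gone and there is no
+ * point in collecting further records.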
*/ +static int alr_batch_file_io(int epoll_fd, struct alr_dev *ad, unsigned int mask) +{ + TRACE("%s\n", __func__); + DEBUG_U(mask); + + if (mask & EPOLLHUP) + return ALR_EXIT_SUCCESS; + + if (mask & EPOLLERR) + return ALR_EXIT_FAILURE; + + return ALR_OK; +} + +static struct alr_dev *alr_dev_create(int epoll_fd, int fd, const char *name, + uint32_t events, + int (*io)(int, struct alr_dev *, unsigned int), + void (*destroy)(struct alr_dev *)) +{ + struct alr_dev *alr; + int rc; + + alr = calloc(1, sizeof(*alr)); + if (alr == NULL) + return NULL; + + alr->alr_name = strdup(name); + if (alr->alr_name == NULL) { + free(alr); + return NULL; + } + alr->alr_io = io; + alr->alr_destroy = destroy; + alr->alr_fd = fd; + + struct epoll_event event = { + .events = events, + .data.ptr = alr, + }; + + rc = epoll_ctl(epoll_fd, EPOLL_CTL_ADD, alr->alr_fd, &event); + if (rc < 0) { + free(alr); + return NULL; + } + + return alr; +} + +void usage(void) +{ + printf("Usage: %s: [OPTION]...\n" +"Discover, read, batch, and write Lustre access logs\n" +"\n" +"Mandatory arguments to long options are mandatory for short options too.\n" +" -f, --batch-file=FILE print batch to file (default stdout)\n" +" -F, --batch-fraction=P set batch printing fraction to P/100\n" +" -i, --batch-interval=INTERVAL print batch every INTERVAL seconds\n" +" -o, --batch-offset=OFFSET print batch at OFFSET seconds\n" +" -e, --exit-on-close exit on close of all log devices\n" +" -I, --mdt-index-filter=INDEX set log MDT index filter to INDEX\n" +" -h, --help display this help and exit\n" +" -l, --list print YAML list of available access logs\n" +" -d, --debug[=FILE] print debug messages to FILE (stderr)\n" +" -s, --stats=FILE print stats messages to FILE (stderr)\n" +" -t, --trace[=FILE] print trace messages to FILE (stderr)\n", + program_invocation_short_name); +} + +int main(int argc, char *argv[]) +{ + const char ctl_path[] = "/dev/"LUSTRE_ACCESS_LOG_DIR_NAME"/control"; + struct alr_dev *alr_signal = NULL; + struct alr_dev *alr_batch_timer = NULL; + struct alr_dev *alr_batch_file_hup = NULL; + struct alr_dev *alr_ctl = NULL; + int exit_on_close = 0; + time_t batch_interval = 0; + time_t batch_offset = 0; + unsigned int m; + int list_info = 0; + int epoll_fd = -1; + int exit_status; + int rc; + int c; + + static struct option options[] = { + { .name = "batch-file", .has_arg = required_argument, .val = 'f', }, + { .name = "batch-fraction", .has_arg = required_argument, .val = 'F', }, + { .name = "batch-interval", .has_arg = required_argument, .val = 'i', }, + { .name = "batch-offset", .has_arg = required_argument, .val = 'o', }, + { .name = "exit-on-close", .has_arg = no_argument, .val = 'e', }, + { .name = "mdt-index-filter", .has_arg = required_argument, .val = 'I' }, + { .name = "debug", .has_arg = optional_argument, .val = 'd', }, + { .name = "help", .has_arg = no_argument, .val = 'h', }, + { .name = "list", .has_arg = no_argument, .val = 'l', }, + { .name = "stats", .has_arg = required_argument, .val = 's', }, + { .name = "trace", .has_arg = optional_argument, .val = 't', }, + { .name = NULL, }, + }; + + while ((c = getopt_long(argc, argv, "d::ef:F:hi:I:ls:t::", options, NULL)) != -1) { + switch (c) { + case 'e': + exit_on_close = 1; + break; + case 'f': + alr_batch_file_path = optarg; + break; + case 'i': + errno = 0; + batch_interval = strtoll(optarg, NULL, 0); + if (batch_interval < 0 || batch_interval >= 1048576 || + errno != 0) + FATAL("invalid batch interval '%s'\n", optarg); + break; + case 'o': + errno = 0; + batch_offset = 
strtoll(optarg, NULL, 0); + if (batch_offset < 0 || batch_offset >= 1048576 || + errno != 0) + FATAL("invalid batch offset '%s'\n", optarg); + break; + case 'd': + if (optarg == NULL) { + debug_file = stderr; + } else if (strcmp(optarg, "-") == 0) { + debug_file = stdout; + } else { + debug_file = fopen(optarg, "a"); + if (debug_file == NULL) + FATAL("cannot open debug file '%s': %s\n", + optarg, strerror(errno)); + } + + break; + case 'h': + usage(); + exit(EXIT_SUCCESS); + case 'F': + alr_print_fraction = strtoll(optarg, NULL, 0); + if (alr_print_fraction < 1 || alr_print_fraction > 100) + FATAL("invalid batch offset '%s'\n", optarg); + break; + case 'I': + alr_filter = strtoll(optarg, NULL, 0); + break; + case 'l': + list_info = 1; + break; + case 's': + alr_stats_file_path = optarg; + break; + case 't': + if (optarg == NULL) { + trace_file = stderr; + } else if (strcmp(optarg, "-") == 0) { + trace_file = stdout; + } else { + trace_file = fopen(optarg, "a"); + if (debug_file == NULL) + FATAL("cannot open debug file '%s': %s\n", + optarg, strerror(errno)); + } + + break; + case '?': + fprintf(stderr, "Try '%s --help' for more information.\n", + program_invocation_short_name); + exit(EXIT_FAILURE); + } + } + + if (batch_interval > 0) { + alr_batch = alr_batch_create(-1); + if (alr_batch == NULL) + FATAL("cannot create batch struct: %s\n", + strerror(errno)); + } + + if (alr_batch_file_path != NULL) { + alr_batch_file = fopen(alr_batch_file_path, "w"); + if (alr_batch_file == NULL) + FATAL("cannot open batch file '%s': %s\n", + alr_batch_file_path, strerror(errno)); + } else { + alr_batch_file_path = "stdout"; + alr_batch_file = stdout; + } + + epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (epoll_fd < 0) + FATAL("cannot create epoll set: %s\n", strerror(errno)); + + /* Setup signal FD and add to epoll set. */ + sigset_t signal_mask; + sigemptyset(&signal_mask); + sigaddset(&signal_mask, SIGINT); + sigaddset(&signal_mask, SIGTERM); + sigaddset(&signal_mask, SIGUSR1); + sigaddset(&signal_mask, SIGUSR2); + rc = sigprocmask(SIG_BLOCK, &signal_mask, NULL); + if (rc < 0) + FATAL("cannot set process signal mask: %s\n", strerror(errno)); + + int signal_fd = signalfd(-1, &signal_mask, SFD_NONBLOCK|SFD_CLOEXEC); + if (signal_fd < 0) + FATAL("cannot create signalfd: %s\n", strerror(errno)); + + alr_signal = alr_dev_create(epoll_fd, signal_fd, "signal", EPOLLIN, + &alr_signal_io, NULL); + if (alr_signal == NULL) + FATAL("cannot register signalfd: %s\n", strerror(errno)); + + signal_fd = -1; + + /* Setup batch timer FD and add to epoll set. */ + struct timespec now; + rc = clock_gettime(CLOCK_REALTIME, &now); + if (rc < 0) + FATAL("cannot read realtime clock: %s\n", strerror(errno)); + + int timer_fd = timerfd_create(CLOCK_REALTIME, TFD_NONBLOCK|TFD_CLOEXEC); + if (timer_fd < 0) + FATAL("cannot create batch timerfd: %s\n", strerror(errno)); + + struct itimerspec it = { + .it_value.tv_sec = (batch_interval > 0) ? 
+ roundup(now.tv_sec, batch_interval) + + (batch_offset % batch_interval) : + 0, + .it_interval.tv_sec = batch_interval, + }; + + DEBUG_D(it.it_value.tv_sec); + + rc = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &it, NULL); + if (rc < 0) + FATAL("cannot arm timerfd: %s\n", strerror(errno)); + + alr_batch_timer = alr_dev_create(epoll_fd, timer_fd, "batch_timer", + EPOLLIN, &alr_batch_timer_io, NULL); + if (alr_batch_timer == NULL) + FATAL("cannot register batch timerfd: %s\n", strerror(errno)); + + timer_fd = -1; + + int batch_fd = dup(fileno(alr_batch_file)); + if (batch_fd < 0) + FATAL("cannot duplicate batch file descriptor: %s\n", + strerror(errno)); + + /* We pass events = 0 since we only care about EPOLLHUP. */ + alr_batch_file_hup = alr_dev_create(epoll_fd, batch_fd, "batch_file", 0, + &alr_batch_file_io, NULL); + if (alr_batch_file_hup == NULL) + FATAL("cannot register batch file HUP: %s\n", strerror(errno)); + + batch_fd = -1; + + /* Open control device. */ + int ctl_fd = open(ctl_path, O_RDONLY|O_NONBLOCK|O_CLOEXEC); + if (ctl_fd < 0) + FATAL("cannot open '%s': %s\n", ctl_path, strerror(errno)); + + /* Get and print interface version. */ + oal_version = ioctl(ctl_fd, LUSTRE_ACCESS_LOG_IOCTL_VERSION); + if (oal_version < 0) + FATAL("cannot get ofd access log interface version: %s\n", strerror(errno)); + + DEBUG_D(oal_version); + + /* Get and print device major used for access log devices. */ + oal_log_major = ioctl(ctl_fd, LUSTRE_ACCESS_LOG_IOCTL_MAJOR); + if (oal_log_major < 0) + FATAL("cannot get ofd access log major: %s\n", strerror(errno)); + + DEBUG_D(oal_log_major); + + /* Add control device to epoll set. */ + alr_ctl = alr_dev_create(epoll_fd, ctl_fd, "control", EPOLLIN, + &alr_ctl_io, NULL); + if (alr_ctl == NULL) + FATAL("cannot register control device: %s\n", strerror(errno)); + + ctl_fd = -1; + + do { + struct epoll_event ev[32]; + int timeout = (list_info ? 0 : -1); + int i, ev_count; + + ev_count = epoll_wait(epoll_fd, ev, ARRAY_SIZE(ev), timeout); + if (ev_count < 0) { + if (errno == EINTR) /* Signal or timeout. 
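+			 * (handled signals are blocked and delivered through
+			 * the signalfd, so simply retrying the wait is safe)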
*/ + continue; + + ERROR("cannot wait on epoll set: %s\n", strerror(errno)); + exit_status = EXIT_FAILURE; + goto out; + } + + DEBUG_D(ev_count); + + for (i = 0; i < ev_count; i++) { + struct alr_dev *ad = ev[i].data.ptr; + unsigned int mask = ev[i].events; + + rc = (*ad->alr_io)(epoll_fd, ad, mask); + switch (rc) { + case ALR_EXIT_FAILURE: + exit_status = EXIT_FAILURE; + goto out; + case ALR_EXIT_SUCCESS: + exit_status = EXIT_SUCCESS; + goto out; + case ALR_ERROR: + case ALR_EOF: + alr_dev_free(epoll_fd, ad); + break; + case ALR_OK: + default: + break; + } + } + + if (exit_on_close && alr_log_count == 0) { + DEBUG("no open logs devices, exiting\n"); + exit_status = EXIT_SUCCESS; + goto out; + } + } while (!list_info); + + exit_status = EXIT_SUCCESS; +out: + assert(oal_log_minor_max < ARRAY_SIZE(alr_log)); + + for (m = 0; m <= oal_log_minor_max; m++) { + if (alr_log[m] == NULL) + continue; + + if (list_info) { + rc = alr_log_info(alr_log[m]); + if (rc < 0) + exit_status = EXIT_FAILURE; + } + + alr_dev_free(epoll_fd, &alr_log[m]->alr_dev); + } + + alr_dev_free(epoll_fd, alr_ctl); + alr_dev_free(epoll_fd, alr_signal); + alr_dev_free(epoll_fd, alr_batch_timer); + alr_dev_free(epoll_fd, alr_batch_file_hup); + close(epoll_fd); + + alr_batch_destroy(alr_batch); + + DEBUG_D(exit_status); + + return exit_status; +} diff --git a/lipe/src/general_policy.c b/lipe/src/general_policy.c index 2628a81..c9aead6 100644 --- a/lipe/src/general_policy.c +++ b/lipe/src/general_policy.c @@ -1883,7 +1883,7 @@ int lipe_policy_value_prefix_init(struct lipe_list_head *list, } else if (parent->u.lpv_expression.ope_right == NULL) { parent->u.lpv_expression.ope_right = child; - list_move(&parent->lpv_linkage, list); + lipe_list_move(&parent->lpv_linkage, list); } else { LIPE_FREE_PTR(child); LERROR("expression [%s] is invalid: values left with no operator\n", diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index bc3b1be..302e060 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -84,6 +84,7 @@ #define DEF_COOLDOWN_K 15 /* how fast to cool down / 100 */ #define DEF_HOT_FRACTION 20 /* %% of the hottest's heat - hot files */ #define DEF_HOT_AFTER_IDLE 3 +#define DEF_ALR_CMD "es_ofd_access_log_reader" #define DEF_ALR_EXTRA_ARGS "--exit-on-close" #define DEF_STATFS_REFRESH_INTV 5 /* OST statfs update interval, in seconds */ #define DEF_FAST_POOL_MAX_USED 30 /* open for migration if % space used is less than */ @@ -128,6 +129,7 @@ static void usage(void) "\t-v, --verbose, produce more verbose ouput\n" "\t-w, --dump, stats file (via USR1 signal, default: %s)\n" "\t-W, --heat-dump, heat table file (via USR2 signal, default: %s)\n" + "\t--alr-cmd=CMD, ALR command (default: '%s')\n" "\t--alr-extra-args=ARGS, extra arguments for ALR (default: '%s')\n" "\t--fast-pool=POOL (default '%s')\n" "\t--fast-pool-max-used=MAX stop mirroring to POOL when %% used reaches MAX (default %d)\n" @@ -158,6 +160,7 @@ static void usage(void) LAMIGO_DUMPFILE, LAMIGO_JSONFILE, LAMIGO_HEAT_FILE, + DEF_ALR_CMD, DEF_ALR_EXTRA_ARGS, DEF_FAST_POOL, DEF_FAST_POOL_MAX_USED, @@ -247,6 +250,7 @@ struct options opt = { .o_fast_pool_max_used = DEF_FAST_POOL_MAX_USED, .o_slow_pool_max_used = DEF_SLOW_POOL_MAX_USED, .o_progress_interval = DEF_PROGRESS_INTV, + .o_alr_cmd = DEF_ALR_CMD, .o_alr_extra_args = DEF_ALR_EXTRA_ARGS, .o_alr_periods = DEF_ALR_PERIODS, .o_alr_period_time = DEF_ALR_PERIOD_SECS, @@ -708,6 +712,8 @@ static void lamigo_dump_stats_file(void) fprintf(f, "history:\n"); lamigo_dump_history(f); + lamigo_alr_dump_stat(f); + fflush(f); 
fclose(f); } @@ -1886,6 +1892,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) } enum { + LAMIGO_OPT_ALR_CMD, LAMIGO_OPT_ALR_EXTRA_ARGS, LAMIGO_OPT_FAST_POOL, LAMIGO_OPT_FAST_POOL_MAX_USED, @@ -1915,6 +1922,7 @@ enum { static struct option options[] = { { "agent", required_argument, NULL, 'g'}, + { "alr-cmd", required_argument, NULL, LAMIGO_OPT_ALR_CMD }, { "alr-extra-args", required_argument, NULL, LAMIGO_OPT_ALR_EXTRA_ARGS}, { "config", required_argument, NULL, 'f' }, { "debug", optional_argument, NULL, 'b' }, @@ -2187,6 +2195,9 @@ static void lamigo_process_opt(int c, char *optarg) if (*endptr != '\0' || opt.o_progress_interval < 1) LX_FATAL("invalid progress interval '%s'\n", optarg); break; + case LAMIGO_OPT_ALR_CMD: + opt.o_alr_cmd = optarg; + break; case LAMIGO_OPT_ALR_EXTRA_ARGS: opt.o_alr_extra_args = optarg; break; diff --git a/lipe/src/lamigo.h b/lipe/src/lamigo.h index 6f81554..fdcc049 100644 --- a/lipe/src/lamigo.h +++ b/lipe/src/lamigo.h @@ -12,6 +12,7 @@ void lamigo_add_alr_agent(const char *host); void lamigo_alr_init(void); +void lamigo_alr_dump_stat(FILE *f); enum alr_pool { ALR_FAST = 0, @@ -90,6 +91,7 @@ struct options { int o_fast_pool_max_used; int o_slow_pool_max_used; int o_progress_interval; /* how often to show progress */ + char *o_alr_cmd; char *o_alr_extra_args; int o_alr_periods; int o_alr_period_time; diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index a20fcd1..7bdd004 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -33,6 +33,7 @@ #include "lx_log.h" #include "lamigo.h" #include "lamigo_hash.h" +#include "lstddef.h" #define LAMIGO_DEBUG_ALR @@ -41,9 +42,11 @@ struct alr_agent { struct lipe_list_head ala_list; struct lipe_ssh_context ala_ctx; pthread_t ala_pid; + time_t ala_last_msg_received; }; static LIPE_LIST_HEAD(alr_agent_list); +static bool alr_initialized; /* * all history is broken into 'period': this is needed to limit @@ -170,6 +173,10 @@ static void lamigo_alr_parse_one(const char *host, const char *line) char ostname[256]; char rw; + + if (line[0] == '#') /* Line started with '#' to be ignored */ + return; + rc = sscanf(line, "o=%s f=["SFID"] t=%lld b=%llu e=%llu s=%llu " "g=%llu n=%llu d=%c\n", ostname, RFID(&fid), ×tamp, &begin, &end, &iosize, &segments, &count, &rw); @@ -238,16 +245,11 @@ static int lamigo_alr_parse(const char *host, char *buf, int size, int *received * data every N seconds. then the data are processed and inserted * into the global structure . * - * FIXME Do a read with timeout to detect silently disconnected - * peers. It would be nice to use ssh_keepalive_send() but I cannot - * make this work. Consider having ofd_access_log_reader send a line - * that we ignore at the start of every batch (even if the batch is - * empty). */ -static int lamigo_alr_agent_run(struct alr_agent *ala) + */ +static void lamigo_alr_agent_run(struct alr_agent *ala) { ssh_channel channel = NULL; - unsigned long now = time(NULL); - unsigned long last_checked = now; + time_t last_checked = time(NULL); int rc, offset = 0, received = 0; char cmd[PATH_MAX]; char buffer[16 * 1024]; @@ -258,27 +260,58 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) * lipe_ssh_exec(). 
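+	 * The reader is expected to write at least a "#!keepalive" line per
+	 * batch interval (lamigo_alr_parse_one() skips '#' lines), so the
+	 * read loop below polls with a timeout of a few intervals and treats
+	 * prolonged silence as a dead peer, tearing the channel down so the
+	 * reader can be restarted.  With the defaults the remote command is
+	 * roughly:
+	 *   es_ofd_access_log_reader -i <interval> -I <mdt-index> --exit-on-close 2> /dev/null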
*/ snprintf(cmd, sizeof(cmd), - "ofd_access_log_reader -i %d -I %d %s 2> /dev/null", - opt.o_alr_ofd_interval, mdtidx, opt.o_alr_extra_args); + "%s -i %d -I %d %s 2> /dev/null", + opt.o_alr_cmd, opt.o_alr_ofd_interval, + mdtidx, opt.o_alr_extra_args); rc = lipe_ssh_start_cmd(&ala->ala_ctx, cmd, &channel); if (rc != SSH_OK) { - LX_ERROR("cannot start ofd_access_log_reader on host '%s': rc = %d\n", - ala->ala_host, rc); + LX_ERROR("cannot start '%s' on host '%s': rc = %d\n", + cmd, ala->ala_host, rc); goto out; } - LX_DEBUG("started access log reader agent on '%s'\n", ala->ala_host); + LX_DEBUG("started '%s' on '%s'\n", opt.o_alr_cmd, ala->ala_host); while (ssh_channel_is_open(channel) && !ssh_channel_is_eof(channel)) { + /* Wait at max 4*ofd_access_log_reader batch interval */ + const int timeout_sec = opt.o_alr_ofd_interval * 4; + time_t now; + + rc = ssh_channel_poll_timeout(channel, timeout_sec * 1000, 0); + if (rc == SSH_ERROR) { + LX_ERROR("polling ssh channel from '%s' failed\n", + ala->ala_host); + goto out; + } + if (rc == SSH_EOF) + break; + if (rc == SSH_AGAIN) /* Bug fixed since libssh 0.9.5 */ + rc = SSH_OK; + if (rc == SSH_OK) { /* Timeout */ + LX_ERROR("waiting message from '%s' exceeded %dsec\n", + ala->ala_host, timeout_sec); + goto out; + } + rc = ssh_channel_read(channel, buffer + offset, sizeof(buffer) - offset, 0); - if (rc == 0) /* ssh_channel_read timeout */ - continue; - if (rc < 0) - break; + if (rc == SSH_AGAIN) /* Bug fixed since libssh 0.9.5 */ + rc = SSH_OK; + if (rc == SSH_OK) { /* Timeout or eof */ + if (ssh_channel_is_eof(channel)) + break; + LX_ERROR("timeout waiting for message from '%s' on '%s'\n", + opt.o_alr_cmd, ala->ala_host); + goto out; + } + if (rc < 0) { + LX_ERROR("reading ssh '%s' failed\n", ala->ala_host); + goto out; + } offset = lamigo_alr_parse(ala->ala_host, buffer, offset + rc, &received); now = time(NULL); + ala->ala_last_msg_received = now; if (now - last_checked > opt.o_progress_interval) { LX_DEBUG("received %d access log records from host '%s'\n", received, ala->ala_host); @@ -287,11 +320,7 @@ static int lamigo_alr_agent_run(struct alr_agent *ala) } } - rc = ssh_channel_get_exit_status(channel); - if (rc == 0) - LX_DEBUG("access log reader on '%s' exited with status 0\n", ala->ala_host); - else - LX_ERROR("access log reader on '%s' terminated with status %d\n", ala->ala_host, rc); + LX_DEBUG("'%s' on '%s' stopped\n", opt.o_alr_cmd, ala->ala_host); out: if (channel != NULL) { @@ -300,8 +329,6 @@ out: } ssh_channel_free(channel); - - return rc; } static void *lamigo_alr_data_collection_thread(void *arg) @@ -890,6 +917,24 @@ void lamigo_alr_init(void) rc = pthread_create(&pid, NULL, lamigo_alr_heat_thread, NULL); if (rc) LX_FATAL("cannot start heat-maint thread: %s\n", strerror(rc)); + + alr_initialized = true; +} + +void lamigo_alr_dump_stat(FILE *f) +{ + struct alr_agent *ala; + + if (!alr_initialized) + return; + + fprintf(f, "access_log_readers:\n"); + + lipe_list_for_each_entry(ala, &alr_agent_list, ala_list) { + fprintf(f, " - host:'%s'\n", ala->ala_host); + fprintf(f, " last_message_at: %lu\n", + ala->ala_last_msg_received); + } } void lamigo_add_alr_agent(const char *host) diff --git a/lipe/src/lamigo_hash.c b/lipe/src/lamigo_hash.c index a2c77f4..1c9badb 100644 --- a/lipe/src/lamigo_hash.c +++ b/lipe/src/lamigo_hash.c @@ -2,101 +2,10 @@ #include "lx_log.h" #include "lamigo.h" #include "lamigo_hash.h" +#include "lstddef.h" int fid_hash_shift = 14; -/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ -#define GOLDEN_RATIO_PRIME_32 
0x9e370001UL -/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ -#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL - -#if __BITS_PER_LONG == 32 -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32 -#define hash_long(val, bits) hash_32(val, bits) -#elif __BITS_PER_LONG == 64 -#define hash_long(val, bits) hash_64(val, bits) -#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64 -#else -#error Wordsize not 32 or 64 -#endif - -static __always_inline __u64 hash_64(__u64 val, unsigned int bits) -{ - __u64 hash = val; - /* Sigh, gcc can't optimise this alone like it does for 32 bits. */ - __u64 n = hash; - - n <<= 18; - hash -= n; - n <<= 33; - hash -= n; - n <<= 3; - hash += n; - n <<= 3; - hash -= n; - n <<= 4; - hash += n; - n <<= 2; - hash += n; - - /* High bits are more random, so use them. */ - return hash >> (64 - bits); -} - -static inline __u32 hash_32(__u32 val, unsigned int bits) -{ - /* On some cpus multiply is faster, on others gcc will do shifts */ - __u32 hash = val * GOLDEN_RATIO_PRIME_32; - - /* High bits are more random, so use them. */ - return hash >> (32 - bits); -} - -static inline __u64 fid_flatten(const struct lu_fid *fid) -{ - __u64 ino; - __u64 seq; - - if (fid_is_igif(fid)) { - ino = lu_igif_ino(fid); - return ino; - } - - seq = fid_seq(fid); - - ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); - - return ino ?: fid_oid(fid); -} - -/** - * map fid to 32 bit value for ino on 32bit systems. - */ -static inline __u32 fid_flatten32(const struct lu_fid *fid) -{ - __u32 ino; - __u64 seq; - - if (fid_is_igif(fid)) { - ino = lu_igif_ino(fid); - return ino; - } - - seq = fid_seq(fid) - FID_SEQ_START; - - /* Map the high bits of the OID into higher bits of the inode number so - * that inodes generated at about the same time have a reduced chance - * of collisions. This will give a period of 2^12 = 1024 unique clients - * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects - * (from OID), or up to 128M inodes without collisions for new files. 
- */
-	ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
-	      (seq >> (64 - (40-8)) & 0xffffff00) +
-	      (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
-
-	return ino ?: fid_oid(fid);
-}
-
 void fid_hash_del(struct fid_hash_head *hash, struct fid_hash *f)
 {
 	int idx;
diff --git a/lipe/src/lamigo_hash.h b/lipe/src/lamigo_hash.h
index e7bd4ae..9035900 100644
--- a/lipe/src/lamigo_hash.h
+++ b/lipe/src/lamigo_hash.h
@@ -7,10 +7,6 @@
 #include
 #include "list.h"
 
-#define container_of(ptr, type, member) ({ \
-	const typeof(((type *) 0)->member) * __mptr = (ptr); \
-	(type *) ((char *) __mptr - offsetof(type, member)); })
-
 struct fid_hash {
 	struct hlist_node fh_node;
 	struct lu_fid fh_fid;
@@ -26,14 +22,101 @@ extern int fid_hash_shift;
 #define FID_HASH_ENTRIES (1 << fid_hash_shift)
 #define FID_ON_HASH(f) (!hlist_unhashed(&(f)->fh_node))
 
+/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
+#define GOLDEN_RATIO_PRIME_32 0x9e370001UL
+/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
+#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL
+
 #if __BITS_PER_LONG == 32
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_32
+#define hash_long(val, bits) hash_32(val, bits)
 #define FID_HASH_FN(f) (hash_long(fid_flatten32(f), fid_hash_shift))
 #elif __BITS_PER_LONG == 64
+#define hash_long(val, bits) hash_64(val, bits)
+#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_PRIME_64
 #define FID_HASH_FN(f) (hash_long(fid_flatten(f), fid_hash_shift))
 #else
 #error Wordsize not 32 or 64
 #endif
 
+static __always_inline __u64 hash_64(__u64 val, unsigned int bits)
+{
+	__u64 hash = val;
+
+	/* Sigh, gcc can't optimise this alone like it does for 32 bits. */
+	__u64 n = hash;
+
+	n <<= 18;
+	hash -= n;
+	n <<= 33;
+	hash -= n;
+	n <<= 3;
+	hash += n;
+	n <<= 3;
+	hash -= n;
+	n <<= 4;
+	hash += n;
+	n <<= 2;
+	hash += n;
+
+	/* High bits are more random, so use them. */
+	return hash >> (64 - bits);
+}
+
+static inline __u32 hash_32(__u32 val, unsigned int bits)
+{
+	/* On some cpus multiply is faster, on others gcc will do shifts */
+	__u32 hash = val * GOLDEN_RATIO_PRIME_32;
+
+	/* High bits are more random, so use them. */
+	return hash >> (32 - bits);
+}
+
+static inline __u64 fid_flatten(const struct lu_fid *fid)
+{
+	__u64 ino;
+	__u64 seq;
+
+	if (fid_is_igif(fid)) {
+		ino = lu_igif_ino(fid);
+		return ino;
+	}
+
+	seq = fid_seq(fid);
+
+	ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid);
+
+	return ino != 0 ? ino : fid_oid(fid);
+}
+
+/**
+ * map fid to 32 bit value for ino on 32bit systems.
+ */
+static inline __u32 fid_flatten32(const struct lu_fid *fid)
+{
+	__u32 ino;
+	__u64 seq;
+
+	if (fid_is_igif(fid)) {
+		ino = lu_igif_ino(fid);
+		return ino;
+	}
+
+	seq = fid_seq(fid) - FID_SEQ_START;
+
+	/* Map the high bits of the OID into higher bits of the inode number so
+	 * that inodes generated at about the same time have a reduced chance
+	 * of collisions. This will give a period of 2^12 = 1024 unique clients
+	 * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects
+	 * (from OID), or up to 128M inodes without collisions for new files.
+	 */
+	ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) +
+	      (seq >> (64 - (40-8)) & 0xffffff00) +
+	      (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8);
+
+	return ino != 0 ? ino : fid_oid(fid);
+}
+
 int fid_hash_init(struct fid_hash_head *hash);
 void fid_hash_free(struct fid_hash_head *hash);
 struct fid_hash *fid_hash_find(struct fid_hash_head *hash, const struct lu_fid *fid);
diff --git a/lipe/src/list.h b/lipe/src/list.h
index 7cfb0a0..b82035d 100644
--- a/lipe/src/list.h
+++ b/lipe/src/list.h
@@ -115,14 +115,21 @@ static inline void lipe_list_del_init(struct lipe_list_head *entry)
 	LIPE_INIT_LIST_HEAD(entry);
 }
 
+static inline void lipe_list_replace_init(struct lipe_list_head *old_node,
+					  struct lipe_list_head *new_node)
+{
+	lipe_list_add(new_node, old_node);
+	lipe_list_del_init(old_node);
+}
+
 /**
  * Remove an entry from the list it is currently in and insert it at the start
  * of another list.
  * \param list the entry to move
  * \param head the list to move it to
  */
-static inline void list_move(struct lipe_list_head *list,
-			     struct lipe_list_head *head)
+static inline void lipe_list_move(struct lipe_list_head *list,
+				  struct lipe_list_head *head)
 {
 	__lipe_list_del(list->prev, list->next);
 	lipe_list_add(list, head);
diff --git a/lipe/src/lstddef.h b/lipe/src/lstddef.h
new file mode 100644
index 0000000..7fd7c8a
--- /dev/null
+++ b/lipe/src/lstddef.h
@@ -0,0 +1,293 @@
+#ifndef _LSTDDEF_H
+#define _LSTDDEF_H
+
+#include
+#include
+#include
+#include
+#include
+
+#define __ALIGN_LSTDDEF_MASK(x, mask) (((x) + (mask)) & ~(mask))
+#define __ALIGN_LSTDDEF(x, a) __ALIGN_LSTDDEF_MASK(x, (typeof(x))(a) - 1)
+#define __LSTDDEF_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+
+#define ALIGN(x, a) __ALIGN_LSTDDEF((x), (a))
+#define ALIGN_DOWN(x, a) __ALIGN_LSTDDEF((x) - ((a) - 1), (a))
+#define __ALIGN_MASK(x, mask) __ALIGN_LSTDDEF_MASK((x), (mask))
+#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a)))
+#define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0)
+
+#ifndef __must_be_array
+# define __must_be_array(arr) 0
+#endif
+
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
+#endif
+
+/*
+ * This looks more complex than it should be. But we need to
+ * get the type for the ~ right in round_down (it needs to be
+ * as wide as the result!), and we want to evaluate the macro
+ * arguments just once each.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+
+#define DIV_ROUND_UP __USER_DIV_ROUND_UP
+
+#define DIV_ROUND_DOWN_ULL(ll, d) \
+	({ unsigned long long _tmp = (ll); do_div(_tmp, d); _tmp; })
+
+#define DIV_ROUND_UP_ULL(ll, d) DIV_ROUND_DOWN_ULL((ll) + (d) - 1, (d))
+
+#if BITS_PER_LONG == 32
+# define DIV_ROUND_UP_SECTOR_T(ll, d) DIV_ROUND_UP_ULL(ll, d)
+#else
+# define DIV_ROUND_UP_SECTOR_T(ll, d) DIV_ROUND_UP(ll, d)
+#endif
+
+#define rounddown(x, y) ({ \
+	typeof(x) __x = (x); \
+	__x - (__x % (y)); \
+})
+
+/*
+ * Divide positive or negative dividend by positive divisor and round
+ * to closest integer. Result is undefined for negative divisors and
+ * for negative dividends if the divisor variable type is unsigned.
+ */
+#define DIV_ROUND_CLOSEST(x, divisor) ({ \
+	typeof(x) __x = x; \
+	typeof(divisor) __d = divisor; \
+	(((typeof(x))-1) > 0 || \
+	 ((typeof(divisor))-1) > 0 || (__x) > 0) ? \
+		(((__x) + ((__d) / 2)) / (__d)) : \
+		(((__x) - ((__d) / 2)) / (__d)); \
+})
+
+/*
+ * Same as above but for u64 dividends. divisor must be a 32-bit
+ * number.
+ */
+#define DIV_ROUND_CLOSEST_ULL(x, divisor) ({ \
+	typeof(divisor) __d = divisor; \
+	unsigned long long _tmp = (x) + (__d) / 2; \
+	do_div(_tmp, __d); \
+	_tmp; \
+})
+
+/*
+ * Multiplies an integer by a fraction, while avoiding unnecessary
+ * overflow or loss of precision.
+ */
+#define mult_frac(x, numer, denom) ({ \
+	typeof(x) quot = (x) / (denom); \
+	typeof(x) rem = (x) % (denom); \
+	(quot * (numer)) + ((rem * (numer)) / (denom)); \
+})
+
+/**
+ * upper_32_bits - return bits 32-63 of a number
+ * @n: the number we're accessing
+ *
+ * A basic shift-right of a 64- or 32-bit quantity. Use this to suppress
+ * the "right shift count >= width of type" warning when that quantity is
+ * 32-bits.
+ */
+#define upper_32_bits(n) ((__u32)(((n) >> 16) >> 16))
+
+/**
+ * lower_32_bits - return bits 0-31 of a number
+ * @n: the number we're accessing
+ */
+#define lower_32_bits(n) ((__u32)(n))
+
+/**
+ * abs - return absolute value of an argument
+ * @x: the value. If it is unsigned type, it is converted to signed type first
+ * (s64, long or int depending on its size).
+ *
+ * Return: an absolute value of x. If x is 64-bit, macro's return type is s64,
+ * otherwise it is signed long.
+ */
+#define abs(x) __builtin_choose_expr(sizeof(x) == sizeof(__s64), ({ \
+	__s64 __x = (x); \
+	(__x < 0) ? -__x : __x; \
+	}), ({ \
+	long ret; \
+	if (sizeof(x) == sizeof(long)) { \
+		long __x = (x); \
+		ret = (__x < 0) ? -__x : __x; \
+	} else { \
+		int __x = (x); \
+		ret = (__x < 0) ? -__x : __x; \
+	} \
+	ret; \
+	}))
+
+/**
+ * reciprocal_scale - "scale" a value into range [0, ep_ro)
+ * @val: value
+ * @ep_ro: right open interval endpoint
+ *
+ * Perform a "reciprocal multiplication" in order to "scale" a value into
+ * range [0, ep_ro), where the upper interval endpoint is right-open.
+ * This is useful, e.g. for accessing a index of an array containing
+ * ep_ro elements, for example. Think of it as sort of modulus, only that
+ * the result isn't that of modulo. ;) Note that if initial input is a
+ * small value, then result will return 0.
+ *
+ * Return: a result based on val in interval [0, ep_ro).
+ */
+static inline __u32 reciprocal_scale(__u32 val, __u32 ep_ro)
+{
+	return (__u32)(((__u64) val * ep_ro) >> 32);
+}
+
+/*
+ * min()/max()/clamp() macros that also do
+ * strict type-checking.. See the
+ * "unnecessary" pointer comparison.
+ */
+#define min(x, y) ({ \
+	typeof(x) _min1 = (x); \
+	typeof(y) _min2 = (y); \
+	(void) (&_min1 == &_min2); \
+	_min1 < _min2 ? _min1 : _min2; \
+})
+
+#define max(x, y) ({ \
+	typeof(x) _max1 = (x); \
+	typeof(y) _max2 = (y); \
+	(void) (&_max1 == &_max2); \
+	_max1 > _max2 ? _max1 : _max2; \
+})
+
+#define min3(x, y, z) ({ \
+	typeof(x) _min1 = (x); \
+	typeof(y) _min2 = (y); \
+	typeof(z) _min3 = (z); \
+	(void) (&_min1 == &_min2); \
+	(void) (&_min1 == &_min3); \
+	_min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \
+		(_min2 < _min3 ? _min2 : _min3); \
+})
+
+#define max3(x, y, z) ({ \
+	typeof(x) _max1 = (x); \
+	typeof(y) _max2 = (y); \
+	typeof(z) _max3 = (z); \
+	(void) (&_max1 == &_max2); \
+	(void) (&_max1 == &_max3); \
+	_max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \
+		(_max2 > _max3 ? _max2 : _max3); \
+})
+
+/**
+ * min_not_zero - return the minimum that is _not_ zero, unless both are zero
+ * @x: value1
+ * @y: value2
+ */
+#define min_not_zero(x, y) ({ \
+	typeof(x) __x = (x); \
+	typeof(y) __y = (y); \
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); \
+})
+
+/**
+ * clamp - return a value clamped to a given range with strict typechecking
+ * @val: current value
+ * @min: minimum allowable value
+ * @max: maximum allowable value
+ *
+ * This macro does strict typechecking of min/max to make sure they are of the
+ * same type as val. See the unnecessary pointer comparisons.
+ */
+#define clamp(val, min, max) ({ \
+	typeof(val) __val = (val); \
+	typeof(min) __min = (min); \
+	typeof(max) __max = (max); \
+	(void) (&__val == &__min); \
+	(void) (&__val == &__max); \
+	__val = __val < __min ? __min : __val; \
+	__val > __max ? __max : __val; \
+})
+
+/*
+ * ..and if you can't take the strict
+ * types, you can specify one yourself.
+ *
+ * Or not use min/max/clamp at all, of course.
+ */
+#define min_t(type, x, y) ({ \
+	type __min1 = (x); \
+	type __min2 = (y); \
+	__min1 < __min2 ? __min1 : __min2; \
+})
+
+#define max_t(type, x, y) ({ \
+	type __max1 = (x); \
+	type __max2 = (y); \
+	__max1 > __max2 ? __max1 : __max2; \
+})
+
+/**
+ * clamp_t - return a value clamped to a given range using a given type
+ * @type: the type of variable to use
+ * @val: current value
+ * @min: minimum allowable value
+ * @max: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of type
+ * 'type' to make all the comparisons.
+ */
+#define clamp_t(type, val, min, max) ({ \
+	type __val = (val); \
+	type __min = (min); \
+	type __max = (max); \
+	__val = __val < __min ? __min : __val; \
+	__val > __max ? __max : __val; \
+})
+
+/**
+ * clamp_val - return a value clamped to a given range using val's type
+ * @val: current value
+ * @min: minimum allowable value
+ * @max: maximum allowable value
+ *
+ * This macro does no typechecking and uses temporary variables of whatever
+ * type the input argument 'val' is. This is useful when val is an unsigned
+ * type and min and max are literals that will otherwise be assigned a signed
+ * integer type.
+ */
+#define clamp_val(val, min, max) ({ \
+	typeof(val) __val = (val); \
+	typeof(val) __min = (min); \
+	typeof(val) __max = (max); \
+	__val = __val < __min ? __min : __val; \
+	__val > __max ? __max : __val; \
+})
+
+/*
+ * swap - swap value of @a and @b
+ */
+#define swap(a, b) do { \
+	typeof(a) __tmp = (a); \
+	(a) = (b); \
+	(b) = __tmp; \
+} while (0)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr: the pointer to the member.
+ * @type: the type of the container struct this is embedded in.
+ * @member: the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({ \
+	const typeof(((type *) 0)->member) * __mptr = (ptr); \
+	(type *) ((char *) __mptr - offsetof(type, member)); })
+
+#endif /* !_LSTDDEF_H */
-- 
1.8.3.1
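
The shift-and-add body of hash_64() in lamigo_hash.h is an unrolled multiplication by GOLDEN_RATIO_PRIME_64 (2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1) that keeps only the top "bits" bits. The standalone check below is not part of the patch; it copies the constant and the unrolled sequence so the equivalence can be verified in isolation.

/* Hypothetical standalone check: the unrolled hash_64() sequence equals a
 * plain multiply by GOLDEN_RATIO_PRIME_64 followed by keeping the top bits. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001UL

static uint64_t hash_64_ref(uint64_t val, unsigned int bits)
{
	/* Plain multiply form of the same hash. */
	return (val * GOLDEN_RATIO_PRIME_64) >> (64 - bits);
}

static uint64_t hash_64_unrolled(uint64_t val, unsigned int bits)
{
	uint64_t hash = val;
	uint64_t n = hash;

	n <<= 18; hash -= n;	/* val * (1 - 2^18) */
	n <<= 33; hash -= n;	/* ... - 2^51 */
	n <<= 3;  hash += n;	/* ... + 2^54 */
	n <<= 3;  hash -= n;	/* ... - 2^57 */
	n <<= 4;  hash += n;	/* ... + 2^61 */
	n <<= 2;  hash += n;	/* ... + 2^63 */

	/* High bits are more random, so use them. */
	return hash >> (64 - bits);
}

int main(void)
{
	uint64_t v;

	for (v = 0; v < 1000000; v++)
		assert(hash_64_unrolled(v, 10) == hash_64_ref(v, 10));

	printf("hash_64 matches multiply by GOLDEN_RATIO_PRIME_64\n");
	return 0;
}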
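
reciprocal_scale() in lstddef.h maps a 32-bit value into [0, ep_ro) with one multiply and a shift instead of a modulo, which is why it suits bucket selection from a hash. The sketch below is not part of the patch; the function body is copied from lstddef.h and "nr_buckets" is an illustrative name.

/* Hypothetical sketch: every scaled value stays inside [0, nr_buckets),
 * even when nr_buckets is not a power of two. */
#include <assert.h>
#include <linux/types.h>

static inline __u32 reciprocal_scale(__u32 val, __u32 ep_ro)
{
	return (__u32)(((__u64) val * ep_ro) >> 32);
}

int main(void)
{
	__u32 nr_buckets = 1000;
	__u32 hash;

	for (hash = 0xdeadbeef; hash < 0xdeadbeef + 100000; hash++)
		assert(reciprocal_scale(hash, nr_buckets) < nr_buckets);

	return 0;
}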
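
round_up()/round_down() in lstddef.h mask with y - 1, so they are only meaningful for power-of-two alignments. A minimal standalone check, with the macro bodies copied from lstddef.h; the values in the asserts follow directly from the definitions.

/* Hypothetical standalone check of the power-of-two rounding helpers. */
#include <assert.h>

#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))

int main(void)
{
	/* y must be a power of two for these two helpers. */
	assert(round_up(100, 64) == 128);
	assert(round_down(100, 64) == 64);
	assert(round_up(128, 64) == 128);
	return 0;
}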
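
mult_frac() splits x into quotient and remainder before multiplying, so x * numer never has to fit in the intermediate type. A hypothetical standalone check (macro body copied from lstddef.h):

/* Hypothetical check: the naive (x * 3) / 4 would overflow 64 bits here,
 * while mult_frac() gives the exact result. */
#include <assert.h>

#define mult_frac(x, numer, denom) ({ \
	typeof(x) quot = (x) / (denom); \
	typeof(x) rem = (x) % (denom); \
	(quot * (numer)) + ((rem * (numer)) / (denom)); \
})

int main(void)
{
	unsigned long long x = 0x8000000000000000ULL;	/* 2^63 */

	assert(mult_frac(x, 3ULL, 4ULL) == 0x6000000000000000ULL);
	return 0;
}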
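
The "(void) (&_min1 == &_min2)" comparison in min()/max()/clamp() exists only so the compiler warns when the two arguments have different types; min_t()/max_t()/clamp_t() opt out by casting both sides to one type. A small sketch, with the macro bodies copied from lstddef.h and the names in main() purely illustrative.

/* Hypothetical sketch of the type-checking difference between min() and min_t(). */
#include <stdio.h>

#define min(x, y) ({ \
	typeof(x) _min1 = (x); \
	typeof(y) _min2 = (y); \
	(void) (&_min1 == &_min2); \
	_min1 < _min2 ? _min1 : _min2; \
})

#define min_t(type, x, y) ({ \
	type __min1 = (x); \
	type __min2 = (y); \
	__min1 < __min2 ? __min1 : __min2; \
})

int main(void)
{
	unsigned int a = 3;
	unsigned long b = 5;

	/* min(a, b) would warn: comparison of distinct pointer types. */
	printf("%lu\n", min_t(unsigned long, a, b));	/* prints 3 */
	printf("%u\n", min(a, 4U));			/* same types: no warning */
	return 0;
}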
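
container_of() recovers the enclosing structure from a pointer to one of its members, which is how the list iterators in list.h and es_ofd_access_batch.c get from an embedded list head back to the containing node. A self-contained usage sketch; "struct sample" and its fields are hypothetical, and the macro body is copied from lstddef.h.

/* Hypothetical usage sketch: walk back from a member pointer to the struct. */
#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) ({ \
	const typeof(((type *) 0)->member) * __mptr = (ptr); \
	(type *) ((char *) __mptr - offsetof(type, member)); })

struct sample {
	int s_id;
	long s_payload;
};

int main(void)
{
	struct sample s = { .s_id = 7, .s_payload = 42 };
	long *payload = &s.s_payload;

	/* Recover the containing struct sample from the member pointer. */
	struct sample *sp = container_of(payload, struct sample, s_payload);

	assert(sp == &s && sp->s_id == 7);
	return 0;
}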