From d4bfc9d4e0e959659feeb24b8ee5b05446043527 Mon Sep 17 00:00:00 2001 From: "John L. Hammond" Date: Fri, 8 Oct 2021 16:17:26 -0500 Subject: [PATCH] EX-3548 lipe: lamigo changelog handling In lamigo, register a named changelog user if no suitable user already exists. Print error messages in cases where lamigo appears to have registered multiple changelog users. Add a test that lamigo registers a named user. Add stratagem-hp-deregister-changelogs.sh to find and remove lamigo registered changelogs on the local node and adjust files accordingly. Call stratagem-hp-deregister-changelogs.sh from stratagem-hp-teardown.sh. Test-Parameters: trivial testlist=hot-pools Signed-off-by: John L. Hammond Change-Id: I95330b4d6b7877f162d941af95c7332c927b5af2 Reviewed-on: https://review.whamcloud.com/45170 Reviewed-by: Jian Yu Tested-by: jenkins Reviewed-by: Alex Zhuravlev Reviewed-by: Alexandre Ioffe Reviewed-on: https://review.whamcloud.com/46099 Tested-by: Maloo --- lipe/scripts/stratagem-hp-deregister-changelogs.sh | 52 +++++ lipe/scripts/stratagem-hp-teardown.sh | 3 + lipe/src/lamigo.c | 228 ++++++++++++++------- lipe/src/lamigo.h | 3 +- lipe/src/lx_log.h | 7 + lustre/tests/hot-pools.sh | 33 ++- 6 files changed, 245 insertions(+), 81 deletions(-) create mode 100755 lipe/scripts/stratagem-hp-deregister-changelogs.sh diff --git a/lipe/scripts/stratagem-hp-deregister-changelogs.sh b/lipe/scripts/stratagem-hp-deregister-changelogs.sh new file mode 100755 index 0000000..bf8641c --- /dev/null +++ b/lipe/scripts/stratagem-hp-deregister-changelogs.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +function mdt_hp_deregister_changelogs() { + local mdt_device="$1" + local conf=/etc/lamigo/${mdt_device}.conf + local user_list + local user + + # Remove clX-lamigo named changelog users. + lctl --device "${mdt_device}" changelog_deregister --user=lamigo + + # Remove changelog users from /etc/lamigo/${mdt_device}.conf.
+ user=$(awk -F = '/^user=/ { print $2 }' "${conf}") + if [[ -n "${user}" ]]; then + lctl --device "${mdt_device}" changelog_deregister "${user}" + + sed -i 's/^\(user=.*\)/## \1 ## removed by stratagem-hp-deregister-changelogs.sh/' "${conf}" + clush -ac "${conf}" + fi + + # Remove changelog users from /var/lib/lamigo-${mdt_device}.chlg + user_list=$(clush -aN cat /var/lib/lamigo-${mdt_device}.chlg) + for user in $user_list; do + lctl --device "${mdt_device}" changelog_deregister "${user}" + done + + clush -a rm -f /var/lib/lamigo-${mdt_device}.chlg +} + +function main() { + local fs="$1" + local mdt_pattern + local mdt_device_list + local mdt_device + + if [[ -z "${fs}" ]]; then + echo "Usage: $(basename $0) FILESYSTEM" >&2 + echo "Find hot pools (lamigo) changelogs on the local node and deregister" >&2 + exit 2 + fi + + mdt_pattern="^${fs}-MDT[[:xdigit:]]{4}\$" + + mdt_device_list=$(lctl device_list | + awk -v mdt_pattern="${mdt_pattern}" '$3 == "mdt" && $4 ~ mdt_pattern { print $4; }') + + for mdt_device in ${mdt_device_list}; do + mdt_hp_deregister_changelogs "${mdt_device}" + done +} + +main "$@" diff --git a/lipe/scripts/stratagem-hp-teardown.sh b/lipe/scripts/stratagem-hp-teardown.sh index fdff7b7..c2bda77 100755 --- a/lipe/scripts/stratagem-hp-teardown.sh +++ b/lipe/scripts/stratagem-hp-teardown.sh @@ -110,5 +110,8 @@ clush -qS --group=ha_heads crm config del $FS-hotpool clush -qS --group=ha_heads crm_ticket --ticket $FS-hotpool-allocated --cleanup +echo "Removing Hotpools changelogs for $FS" +clush -aqS stratagem-hp-deregister-changelogs.sh "$FS" + echo "Local client is still configured" echo " to remove run: stratagem-hp-teardown-client.sh" diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index 40f2b8a..ae6a41e 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -64,6 +64,11 @@ #include "lipe_config.h" #include "lipe_version.h" +#ifndef swap +# define swap(a, b) \ + do { typeof(a) __swap_tmp = (a); (a) = (b); (b) = __swap_tmp; } while (0) +#endif 
+ #define DEF_POOL_REFRESH_INTV (10 * 60) #define DEF_PROGRESS_INTV (10 * 60) #define DEF_MIN_AGE 600 @@ -93,6 +98,7 @@ enum llapi_message_level lx_log_level = LLAPI_MSG_INFO; char *lx_log_prefix = "NONE"; +static char *lamigo_changelog_user; static LIPE_LIST_HEAD(lamigo_rule_list); __u64 lamigo_rule_attrs; /* attributes needed to evalute the rules */ @@ -362,8 +368,8 @@ static int strsize2int(long *sizep, char *str) } } -static void systemf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); -static void systemf(const char *fmt, ...) +static int systemf(const char *fmt, ...) __attribute__ ((format (printf, 1, 2))); +static int systemf(const char *fmt, ...) { char *cmd = NULL; va_list args; @@ -374,10 +380,12 @@ static void systemf(const char *fmt, ...) va_end(args); if (rc < 0) - return; + return rc; - system(cmd); + rc = system(cmd); free(cmd); + + return rc; } #define JOB_FMT \ @@ -455,7 +463,7 @@ static void lamigo_dump_stats_file(void) " mountpoint: %s\n" " source_pool: ", PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION, - opt.o_chlg_user, opt.o_mdtname, opt.o_mntpt); + lamigo_changelog_user, opt.o_mdtname, opt.o_mntpt); i = 0; for (pl = src_pools; pl != NULL; pl = pl->pl_next, i++) fprintf(f, "%s%s", i ? 
"," : "", pl->pl_pool); @@ -1582,13 +1590,13 @@ static void lamigo_check_and_clear_changelog(void) return; LX_DEBUG("CLEAR upto %llu in %s (%llu last)\n", - index, opt.o_chlg_user, lamigo_last_processed_idx); + index, lamigo_changelog_user, lamigo_last_processed_idx); lamigo_last_cleared_index = index; - rc = llapi_changelog_clear(opt.o_mdtname, opt.o_chlg_user, index); + rc = llapi_changelog_clear(opt.o_mdtname, lamigo_changelog_user, index); if (rc < 0) { llapi_error(LLAPI_MSG_ERROR, rc, "failed to clear changelog record %s:%llu (%llu last)", - opt.o_chlg_user, index, lamigo_last_cleared_index); + lamigo_changelog_user, index, lamigo_last_cleared_index); systemf("lctl get_param 'mdd.%s.changelog_users'", opt.o_mdtname); } } @@ -2120,7 +2128,7 @@ void lamigo_process_opt(int c, char *optarg) opt.o_tgt_pool = strdup(optarg); break; case 'u': - opt.o_chlg_user = strdup(optarg); + opt.o_changelog_user = strdup(optarg); break; case 'v': lx_log_level++; @@ -2651,7 +2659,7 @@ int lamigo_rescan(void) return rc; } -static void lamigo_changelog_check_and_set_mask(void) +static void lamigo_changelog_mask_set(void) { char buf[256]; glob_t paths; @@ -2688,14 +2696,15 @@ static void lamigo_changelog_check_and_set_mask(void) } /* - * check whether @user is registered in changelogs - * returns: 0 - registered, <0 - not registered + * check if @user is registered in changelogs + * returns a malloced copy of whatever it found, or NULL if nothing matched.
*/ -static int lamigo_check_changelog_user(const char *user) +static char *lamigo_changelog_user_find(const char *user) { char buf[4096]; - char *p, *s, *e; + char *s, *line; glob_t paths; + char *found = NULL; int rc; rc = cfs_get_param_paths(&paths, "mdd/%s/changelog_users", @@ -2707,86 +2716,147 @@ static int lamigo_check_changelog_user(const char *user) if (rc < 0) LX_FATAL("can't get changelog users\n"); - rc = -1; s = buf; + + /* mdd.lustre-MDT0000.changelog_users= + * current_index: 0 + * ID index (idle) mask + * cl4-lamigo 0 (19) mask=MARK,UNLNK,CLOSE + */ /* skip current index line */ - strsep(&s, "\n\r"); + strsep(&s, "\n"); + /* skip ID index.. header */ - strsep(&s, "\n\r"); - /* try to find specified user */ - while ((p = strsep(&s, "\n\r"))) { - e = strsep(&p, " \t"); - if (!strcmp(e, user)) { - rc = 0; - break; + strsep(&s, "\n"); + + /* try to find @user, or the user named "lamigo" if @user == NULL */ + while ((line = strsep(&s, "\n")) != NULL) { + char *s2 = strsep(&line, " \t"); + char *user1_and_name1 = xstrdup(s2); + char *user1 = strsep(&s2, "-"); + char *name1 = s2; + + if (user != NULL) { + if (strcmp(user, user1_and_name1) == 0 || + strcmp(user, user1) == 0) { + found = user1_and_name1; + break; + } + } else { + if (name1 != NULL && strcmp(name1, "lamigo") == 0) { + found = user1_and_name1; + break; + } } + + free(user1_and_name1); } + globfree(&paths); - return rc; + return found; } -static void lamigo_check_changelogs(void) +static void lamigo_changelog_user_find_any(void) { - char buf[4096]; - char user[32]; - int rc; - bool registered = false; + char var_user_path[PATH_MAX]; /* /var/lib/lamigo-MDTxxxx.chlg */ + char var_user_buf[32] = ""; + char *named_user = NULL; + char *opt_user = NULL; + char *var_user = NULL; - lamigo_changelog_check_and_set_mask(); + /* This is a real mess that we've made for ourselves. + * + * Find or register a suitable changelog user for lamigo. + * If there is a user with name "lamigo" then we use that.
+ * Otherwise we try the user in /var/lib/lamigo-MDTxxxx.chlg. + * Otherwise we try the configured user. + * + * Note due to failover pairing, a previously registered user + * may not be in /var/lib/lamigo-MDTxxxx.chlg on the + * current node. */ + assert(lamigo_changelog_user == NULL); -again: - /* - * first, check in /var/lib/lamigo-**.chlg - * then try user from the config file - * if not found - create and write to /var/lib/lamigo-**.chlg - */ - snprintf(buf, sizeof(buf), LAMIGO_USERFILE, opt.o_mdtname); - rc = lamigo_read_file(buf, user, sizeof(user)); - if (rc >= 0) { - rc = lamigo_check_changelog_user(user); - if (!rc) { - /* found, use it */ - opt.o_chlg_user = strdup(user); - LX_DEBUG("found Changelog user '%s' in '%s'\n", - user, buf); - return; - } - } + named_user = lamigo_changelog_user_find(NULL); - if (registered) - LX_FATAL("cannot find just registered Changelog user '%s'\n", user); + if (opt.o_changelog_user != NULL) + opt_user = lamigo_changelog_user_find(opt.o_changelog_user); - /* try one from the config file */ - if (opt.o_chlg_user) { - rc = lamigo_check_changelog_user(opt.o_chlg_user); - if (!rc) { - /* found, use it */ - LX_DEBUG("found Changelog user '%s' from config\n", - opt.o_chlg_user); - return; - } + snprintf(var_user_path, sizeof(var_user_path), LAMIGO_USERFILE, opt.o_mdtname); + + /* lamigo_read_file() strips the trailing newline and adds a + * trailing '\0'.
*/ + lamigo_read_file(var_user_path, var_user_buf, sizeof(var_user_buf)); + if (var_user_buf[0] != '\0') + var_user = lamigo_changelog_user_find(var_user_buf); + + if (named_user != NULL && opt_user != NULL && strcmp(named_user, opt_user) != 0) + LX_ERROR("multiple registered changelog users '%s' and '%s'\n", + named_user, opt_user); + + if (named_user != NULL && var_user != NULL && strcmp(named_user, var_user) != 0) + LX_ERROR("multiple registered changelog users '%s' and '%s'\n", + named_user, var_user); + + if (opt_user != NULL && var_user != NULL && strcmp(opt_user, var_user) != 0) + LX_ERROR("multiple registered changelog users '%s' and '%s'\n", + opt_user, var_user); + + if (named_user != NULL) { + swap(lamigo_changelog_user, named_user); + LX_INFO("using named changelog user '%s'\n", + lamigo_changelog_user); + goto out; + } else if (var_user != NULL) { + swap(lamigo_changelog_user, var_user); + LX_INFO("using changelog user '%s' from '%s'\n", + lamigo_changelog_user, var_user_path); + goto out; + } else if (opt_user != NULL) { + swap(lamigo_changelog_user, opt_user); + LX_INFO("using configured changelog user '%s'\n", + lamigo_changelog_user); + goto out; } - llapi_err_noerrno(LLAPI_MSG_INFO, "register new Changelog user"); - /* not found, try to register own */ - snprintf(buf, sizeof(buf), - "lctl --device %s changelog_register -n >"LAMIGO_USERFILE, - opt.o_mdtname, opt.o_mdtname); - rc = system(buf); +out: + LX_DEBUG_S(lamigo_changelog_user); + LX_DEBUG_S(named_user); + LX_DEBUG_S(opt.o_changelog_user); + LX_DEBUG_S(opt_user); + LX_DEBUG_S(var_user_path); + LX_DEBUG_S(var_user_buf); + LX_DEBUG_S(var_user); + + free(named_user); + free(opt_user); + free(var_user); +} + +static void lamigo_changelog_user_register(void) +{ + int rc; + + lamigo_changelog_user_find_any(); + + if (lamigo_changelog_user != NULL) + return; + + LX_INFO("registering new changelog user\n"); + + rc = systemf("lctl --device %s changelog_register --mask='CLOSE UNLNK' --user=lamigo 
--nameonly > "LAMIGO_USERFILE, + opt.o_mdtname, opt.o_mdtname); if (rc < 0) - LX_FATAL("changelog user '%s' is not registered\n", - opt.o_chlg_user); - - registered = true; - /* if a new changelog user was just registered, either this is the - * first time lamigo was run on the filesystem, or it has been some - * time since it was last run and changes were not logged. Run a - * full filesystem scan to look for any files that need processing. - */ - opt.o_rescan = true; + LX_FATAL("cannot execute 'lctl changelog_register': %s\n", strerror(errno)); + else if (rc != 0) + LX_FATAL("lctl changelog_register cannot register changelog user: status = %d\n", rc); + + lamigo_changelog_user_find_any(); + + if (lamigo_changelog_user != NULL) + return; - goto again; + LX_FATAL("cannot find just registered changelog user\n"); } static unsigned long progress_timestamp; @@ -3094,8 +3164,12 @@ int main(int argc, char **argv) /* refresh source pools */ lamigo_refresh_pools(); + /* register changelog before full scan to avoid missing new files */ - lamigo_check_changelogs(); + lamigo_changelog_user_register(); + + /* ensure that changelog mask includes "CLOSE" and "UNLNK". */ + lamigo_changelog_mask_set(); /* start heat collection and maintaining */ lamigo_alr_init(); diff --git a/lipe/src/lamigo.h b/lipe/src/lamigo.h index c5c8813..1d30c0a 100644 --- a/lipe/src/lamigo.h +++ b/lipe/src/lamigo.h @@ -69,7 +69,8 @@ struct alr_heat { }; struct options { - const char *o_chlg_user; + /* o_changelog_user is just from config. 
We actually use lamigo_changelog_user. */ + const char *o_changelog_user; const char *o_mdtname; const char *o_mntpt; const char *o_src_pool; diff --git a/lipe/src/lx_log.h b/lipe/src/lx_log.h index 02b8160..9715046 100644 --- a/lipe/src/lx_log.h +++ b/lipe/src/lx_log.h @@ -51,6 +51,13 @@ extern char *lx_log_prefix; /* Device name the message is related to */ #define LX_OOM(size) \ LX_OOM_AT(__FILE__, __LINE__, __func__, (size)) +#define LX_DEBUG_B(x) LX_DEBUG("%s = %s\n", #x, (x) ? "true" : "false") +#define LX_DEBUG_D(x) LX_DEBUG("%s = %"PRIdMAX"\n", #x, (intmax_t)x) +#define LX_DEBUG_P(x) LX_DEBUG("%s = %p\n", #x, x) +#define LX_DEBUG_S(x) LX_DEBUG("%s = '%s'\n", #x, x) +#define LX_DEBUG_U(x) LX_DEBUG("%s = %"PRIuMAX"\n", #x, (uintmax_t)x) +#define LX_DEBUG_X(x) LX_DEBUG("%s = %"PRIxMAX"\n", #x, (uintmax_t)x) + static inline void *xmalloc1(const char *file, int line, const char *func, size_t size) { void *ptr = malloc(size); diff --git a/lustre/tests/hot-pools.sh b/lustre/tests/hot-pools.sh index f9d621d..3283823 100755 --- a/lustre/tests/hot-pools.sh +++ b/lustre/tests/hot-pools.sh @@ -174,7 +174,10 @@ init_hot_pools_env() { $MOUNT" # enable changelog on MDT(s) - changelog_register + if ${INIT_HOT_POOLS_CHANGELOG:-true}; then + changelog_register + fi + # create OST pools create_ost_pools @@ -1154,7 +1157,7 @@ test_5() { } run_test 5 "lamigo: start with bad command line options" -test_6() { +test_6a() { local bad_user="foo" local real_user @@ -1167,11 +1170,35 @@ test_6() { real_user=$(get_lamigo_chlg) echo "New Changelog user is '$real_user'" stack_trap "__changelog_deregister mds1 $real_user" + stack_trap "rm -f ${LAMIGO_USERFILE[0]}" + sleep 5 + dump_one_lamigo_stats + verify_one_lamigo_param 0 chlg_user "$real_user" +} +run_test 6a "lamigo: start with bad changelog user" + +test_6b() { + local bad_user="foo" + local real_user + + INIT_HOT_POOLS_CHANGELOG=false init_hot_pools_env + + LAMIGO_USR= start_one_lamigo_cmd + check_one_lamigo_is_started || error
"failed to start lamigo" + stack_trap stop_one_lamigo_cmd + + real_user=$(get_lamigo_chlg) + echo "New Changelog user is '$real_user'" + stack_trap "__changelog_deregister mds1 $real_user" + stack_trap "rm -f ${LAMIGO_USERFILE[0]}" sleep 5 dump_one_lamigo_stats verify_one_lamigo_param 0 chlg_user "$real_user" + + [[ "$real_user" =~ cl[0-9]+-lamigo ]] || + error "non named changelog user '$real_user'" } -run_test 6 "lamigo: start with bad changelog user" +run_test 6b "lamigo: start with no changelog user" test_7() { init_hot_pools_env -- 1.8.3.1