From b67ed0c6c815b1816b82001149df1eb30a30d053 Mon Sep 17 00:00:00 2001 From: "John L. Hammond" Date: Wed, 22 Sep 2021 14:15:52 -0500 Subject: [PATCH] EX-3889 lipe: lamigo error reporting and signal handling Add new macros LAMIGO_{FATAL,ERROR,WARN,INFO,DEBUG}() to replace the existing calls to llapi_error() and llapi_printf(). Replace almost all open coded calls to exit() with LAMIGO_FATAL(). Handle signals (SIGTERM, SIGUSR1, SIGUSR2) from a dedicated thread. Add x{malloc,calloc,strdup}() macros that call LAMIGO_FATAL() on OOM conditions. In main() replace the while (!stop) loop with a non-breaking while (1) loop. Test-Parameters: trivial testlist=hot-pools Signed-off-by: John L. Hammond Change-Id: Idc31da6eca847305ca16b9992a7fb22aa4d0f112 Reviewed-on: https://review.whamcloud.com/45026 Tested-by: jenkins Reviewed-by: Jian Yu Reviewed-by: Alex Zhuravlev Reviewed-by: Alexandre Ioffe Reviewed-on: https://review.whamcloud.com/45210 --- lipe/src/lamigo.c | 862 ++++++++++++++++------------------------------ lipe/src/lamigo.h | 83 ++++- lipe/src/lamigo_alr.c | 62 +--- lipe/src/lamigo_hash.c | 20 +- lipe/src/lamigo_hash.h | 2 +- lustre/tests/hot-pools.sh | 2 +- 6 files changed, 403 insertions(+), 628 deletions(-) diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index 246cccd..e111e18 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -90,6 +90,9 @@ #define LAMIGO_HEAT_FILE "/var/run/lamigo-%s.heat" #define LAMIGO_PIDFILE "/var/run/lamigo-%s.pid" +int lamigo_log_level = LLAPI_MSG_INFO; +const char *lamigo_mdt_name = "NONE"; + static LIPE_LIST_HEAD(lamigo_rule_list); __u64 lamigo_rule_attrs; /* attributes needed to evalute the rules */ @@ -148,7 +151,7 @@ static void usage(void) DEF_HOT_AFTER_IDLE, DEF_SRC_FREE, DEF_TGT_FREE); - exit(0); + exit(EXIT_SUCCESS); } #define container_of(ptr, type, member) ({ \ @@ -201,7 +204,6 @@ enum amigo_resync_type { }; struct options opt = { - .o_verbose = LLAPI_MSG_INFO, .o_min_age = DEF_MIN_AGE, .o_cache_size = DEF_CACHE_SIZE, 
.o_chlg_clear_frequency = 4096, @@ -426,14 +428,7 @@ static void lamigo_dump_history(FILE *out) } } -static void lamigo_sigterm_handler(int sig) -{ - psignal(sig, "exiting"); - - _exit(0); -} - -static void lamigo_sigusr1_handler(int sig) +static void lamigo_dump_stats_file(void) { struct resync_agent *a; struct pool_list *pl; @@ -442,7 +437,7 @@ static void lamigo_sigusr1_handler(int sig) FILE *f; int i; - llapi_printf(LLAPI_MSG_DEBUG, "dump to %s\n", opt.o_dump_file); + LAMIGO_DEBUG("dumping stats to '%s'\n", opt.o_dump_file); if (opt.o_dump_file == NULL) return; f = fopen(opt.o_dump_file, "w"); @@ -569,16 +564,16 @@ static void lamigo_sigusr1_handler(int sig) fclose(f); } -static void lamigo_sigusr2_handler(int sig) +static void lamigo_dump_heat_file(void) { FILE *f; - llapi_printf(LLAPI_MSG_DEBUG, "heat to %s\n", opt.o_heat_file); + LAMIGO_DEBUG("dumping heat to '%s'\n", opt.o_heat_file); if (opt.o_heat_file == NULL) return; f = fopen(opt.o_heat_file, "w"); if (!f) { - llapi_printf(LLAPI_MSG_DEBUG, "can't open heat file\n"); + LAMIGO_ERROR("cannot open heat file '%s': %s\n", opt.o_heat_file, strerror(errno)); return; } lamigo_alr_dump_heat_table(f); @@ -604,24 +599,6 @@ static int lamigo_init_cache(void) return 0; } -static void lamigo_cleanup(void) -{ - struct resync_agent *agent; - - fid_hash_free(&head.lh_hash); - lipe_list_for_each_entry(agent, &lamigo_agent_list, rag_list) { - struct resync_ssh_session *rss, *tmp; - - lipe_list_for_each_entry_safe(rss, tmp, - &agent->rag_ssh_list, - rss_list) { - lipe_ssh_context_destroy(&rss->rss_ctx); - lipe_list_del(&rss->rss_list); - free(rss); - } - } -} - static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus) { struct resync_ssh_session *rss; @@ -656,7 +633,6 @@ void *lamigo_replicate_one(void *args) int resync = rj->rj_resync; char cmd[PATH_MAX * 2]; int status = INT_MAX; - enum llapi_message_level msg_level; int rc; if (rj->rj_setprefer) { @@ -665,7 +641,7 @@ void 
*lamigo_replicate_one(void *args) "'%s/.lustre/fid/"DFID"' > /dev/null 2>&1", rj->rj_pool, agent->rag_mountpoint, PFID(&rj->rj_fid)); - llapi_printf(LLAPI_MSG_DEBUG, "set prefer on "DFID"\n", + LAMIGO_DEBUG("set prefer on "DFID"\n", PFID(&rj->rj_fid)); } else if (resync == AMIGO_RESYNC_EXTEND) { int i; @@ -688,7 +664,7 @@ void *lamigo_replicate_one(void *args) agent->rag_mountpoint, PFID(&rj->rj_fid)); } else { - llapi_err_noerrno(LLAPI_MSG_ERROR, "unknown resync: %d", resync); + LAMIGO_ERROR("unknown resync: %d\n", resync); rc = -EINVAL; goto out; } @@ -696,18 +672,15 @@ void *lamigo_replicate_one(void *args) /* rc < 0 means an ssh error. Otherwise command exit status is * in status. Mask common exit statuses. */ rc = lamigo_exec_cmd(agent, cmd, &status); + LAMIGO_DEBUG("exec command '%s' on '%s': rc = %d, status = %d\n", + cmd, agent->rag_hostname, rc, status); if (rc < 0 || /* 1 from setprefer (see EX-3591) */ (rj->rj_setprefer && status != 0 && status != 1) || /* EBUSY from mirror extend/resync */ (!rj->rj_setprefer && status != 0 && status != EBUSY)) - msg_level = LLAPI_MSG_ERROR; - else - msg_level = LLAPI_MSG_DEBUG; - - llapi_error(msg_level|LLAPI_MSG_NO_ERRNO, 0, - "error executing command '%s' on '%s': rc = %d, status = %d", - cmd, agent->rag_hostname, rc, status); + LAMIGO_ERROR("command '%s' on '%s' failed: rc = %d, status = %d\n", + cmd, agent->rag_hostname, rc, status); out: /* notify the main thread about completion */ write(lamigo_sigpipe[1], &rc, 1); @@ -742,13 +715,13 @@ static int lamigo_spawn_replication(struct resync_job *rj) } } if (!a) { - llapi_printf(LLAPI_MSG_DEBUG, "no good agent\n"); + LAMIGO_DEBUG("no good agent\n"); return -EBUSY; } rj->rj_agent = a; rj->rj_start = time(NULL); - llapi_printf(LLAPI_MSG_DEBUG, "new job %s for "DFID" spawned on %s\n", + LAMIGO_DEBUG("new job %s for "DFID" spawned on %s\n", resync == AMIGO_RESYNC_EXTEND ? 
"extend" : "resync", PFID(&rj->rj_fid), rj->rj_agent->rag_hostname); @@ -813,8 +786,7 @@ static int lamigo_get_objects(struct lov_user_md_v3 *v3, } else { *objects = NULL; *stripes = 0; - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "unsupported LOV magic %x", v3->lmm_magic); + LAMIGO_ERROR("unsupported LOV magic %x\n", v3->lmm_magic); return -EINVAL; } return 0; @@ -957,8 +929,7 @@ static int lamigo_striping_is_in_sync(struct lov_user_md *lum, } if (opt.o_src_dom && v3->lmm_pattern == LOV_PATTERN_MDT) { - llapi_printf(LLAPI_MSG_DEBUG, - "DoM component"); + LAMIGO_DEBUG("DoM component"); onsrc++; continue; } @@ -1051,7 +1022,7 @@ lamigo_check_user_rules(struct lipe_object_attrs *attrs, rc = lipe_rule_evaluate(rule, attrs, sysattrs, &result); if (rc) { - llapi_error(LLAPI_MSG_ERROR, rc, "rule failed"); + LAMIGO_ERROR("cannot evaluate rule: %s\n", strerror(-rc)); return AMIGO_RESYNC_NONE; } if (!result) @@ -1155,7 +1126,7 @@ static int lamigo_get_attrs(const struct lu_fid *fid, snprintf(attrs->loa_fid_str, sizeof(attrs->loa_fid_str), DFID_NOBRACE, PFID(&attrs->loa_fid)); attrs->loa_attr_bits |= LIPE_OBJECT_ATTR_LMAEA; - llapi_printf(LLAPI_MSG_DEBUG, "got LMA: %d\n", rc); + LAMIGO_DEBUG("got LMA: %d\n", rc); } } @@ -1231,9 +1202,8 @@ static int lamigo_is_in_sync(struct lu_fid *fid, */ resync = lamigo_check_user_rules(&attrs, &sysattrs); if (resync == AMIGO_RESYNC_NONE) { - llapi_printf(LLAPI_MSG_DEBUG, - "skip "DFID" due to rules\n", - PFID(fid)); + LAMIGO_DEBUG("skip "DFID" due to rules\n", + PFID(fid)); stats.s_skip_by_rule++; goto out; } @@ -1243,7 +1213,7 @@ static int lamigo_is_in_sync(struct lu_fid *fid, out: lamigo_hist_add(fid, resync); - llapi_printf(LLAPI_MSG_DEBUG, "check "DFID" stripes=%d: resync=%d\n", + LAMIGO_DEBUG("check "DFID" stripes=%d: resync=%d\n", PFID(fid), mo->mo_stripes, resync); return resync; @@ -1287,12 +1257,7 @@ static void lamigo_check_bad_agents(void) (void *)a); if (rc) return; - rj = calloc(1, sizeof(struct resync_job)); - if (rj == NULL) 
{ - llapi_err_noerrno(LLAPI_MSG_ERROR, - "can't allocate for a test job"); - return; - } + rj = xcalloc(1, sizeof(*rj)); rj->rj_check_job = 1; rj->rj_pid = pid; rj->rj_agent = a; @@ -1342,12 +1307,7 @@ void lamigo_schedule_setprefer(struct resync_job *rj, void *cbdata, int rc) if (rc) return; - srj = calloc(1, sizeof(*srj)); - if (srj == NULL) { - llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job"); - return; - } - + srj = xcalloc(1, sizeof(*srj)); srj->rj_fid = rj->rj_fid; srj->rj_setprefer = 1; /* XXX: few src pools? */ @@ -1367,14 +1327,14 @@ static int lamigo_update_one(struct fid_rec *f) /* cold pool is close to full, skip replication */ /* do this check before expensive layout fetching, rules, etc */ stats.s_skip_tgt_closed++; - llapi_printf(LLAPI_MSG_DEBUG, "pool %s closed for "DFID"\n", + LAMIGO_DEBUG("pool %s closed for "DFID"\n", tgt_pools->pl_pool, PFID(&f->fr_fh.fh_fid)); return 0; } if (are_agents_busy()) { /* all the agents are busy */ - llapi_printf(LLAPI_MSG_DEBUG, "no agents avilable (max: %d)\n", lamigo_max_jobs); + LAMIGO_DEBUG("no agents avilable (max: %d)\n", lamigo_max_jobs); return 1; } @@ -1386,7 +1346,7 @@ static int lamigo_update_one(struct fid_rec *f) return 0; } if (ah.ah_hot && alr_period - ah.ah_hot <= 1) { - llapi_printf(LLAPI_MSG_DEBUG, + LAMIGO_DEBUG( "skip hot "DFID" in %u, now %lu\n", PFID(&f->fr_fh.fh_fid), ah.ah_hot, alr_period); @@ -1402,11 +1362,7 @@ static int lamigo_update_one(struct fid_rec *f) return 0; } - rj = calloc(1, sizeof(struct resync_job)); - if (rj == NULL) { - llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job"); - return 1; - } + rj = xcalloc(1, sizeof(*rj)); rj->rj_fid = f->fr_fh.fh_fid; rj->rj_stripes = mo.mo_stripes; rj->rj_index = f->fr_index; @@ -1461,8 +1417,7 @@ static int lamigo_check_sync(void) struct resync_job, rj_list); lipe_list_del(&rj->rj_list); rc = lamigo_submit_job(rj); - llapi_printf(LLAPI_MSG_DEBUG, - "tried to resubmit failed job %p: rc=%d\n", rj, rc); + 
LAMIGO_DEBUG("tried to resubmit failed job %p: rc=%d\n", rj, rc); if (rc != 0) return rc; } @@ -1532,14 +1487,7 @@ static int lamigo_process_record(struct changelog_rec *rec) fh = fid_hash_find(&head.lh_hash, &rec->cr_tfid); if (fh == NULL) { - f = calloc(sizeof(struct fid_rec), 1); - if (f == NULL) { - rc = -ENOMEM; - llapi_error(LLAPI_MSG_ERROR, rc, - "failed to alloc memory for fid_rec"); - return rc; - } - + f = xcalloc(1, sizeof(*f)); f->fr_fh.fh_fid = rec->cr_tfid; f->fr_index = index; f->fr_time = rec->cr_time; @@ -1634,7 +1582,7 @@ static void lamigo_check_and_clear_changelog(void) index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency) return; - llapi_printf(LLAPI_MSG_DEBUG, "CLEAR upto %llu in %s (%llu last)\n", + LAMIGO_DEBUG("CLEAR upto %llu in %s (%llu last)\n", index, opt.o_chlg_user, lamigo_last_processed_idx); lamigo_last_cleared_index = index; rc = llapi_changelog_clear(opt.o_mdtname, opt.o_chlg_user, index); @@ -1648,10 +1596,9 @@ static void lamigo_check_and_clear_changelog(void) static void lamigo_job_fini(struct resync_job *rj, intptr_t retval) { - llapi_printf(LLAPI_MSG_DEBUG, - "job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n", - rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start, - retval, rj->rj_agent->rag_bad); + LAMIGO_DEBUG("job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n", + rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start, + retval, rj->rj_agent->rag_bad); rj->rj_done_timestamp = time(NULL); @@ -1660,16 +1607,16 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval) if (retval == 0) { /* the agent is back */ if (rj->rj_agent->rag_bad) { - llapi_printf(LLAPI_MSG_DEBUG, "agent %s is back\n", - rj->rj_agent->rag_hostname); + LAMIGO_DEBUG("agent %s is back\n", + rj->rj_agent->rag_hostname); rj->rj_agent->rag_bad = false; lamigo_max_jobs += rj->rj_agent->rag_maxjobs; } } else { /* the agent is still bad */ if (rj->rj_agent->rag_bad == false) { - llapi_printf(LLAPI_MSG_DEBUG, "agent %s is bad\n", - 
rj->rj_agent->rag_hostname); + LAMIGO_DEBUG("agent %s is bad\n", + rj->rj_agent->rag_hostname); assert(lamigo_max_jobs >= rj->rj_agent->rag_maxjobs); lamigo_max_jobs -= rj->rj_agent->rag_maxjobs; @@ -1744,13 +1691,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) struct resync_agent *a; int i; - a = calloc(1, sizeof(*a)); - if (!a) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "can't allocate memory for agent"); - exit(1); - } - + a = xcalloc(1, sizeof(*a)); a->rag_index = lamigo_agent_count; a->rag_hostname = strdup(host); a->rag_mountpoint = strdup(mnt); @@ -1758,22 +1699,16 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) char *endptr; a->rag_maxjobs = strtol(jobs, &endptr, 10); - if (*endptr != '\0') { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "invalid jobs: '%s' (1-2048 expected)", - jobs); - exit(1); - } + if (*endptr != '\0') + LAMIGO_FATAL("invalid jobs: '%s' (1-2048 expected)\n", jobs); } else { a->rag_maxjobs = DEF_AGENT_JOBS; } - if (a->rag_maxjobs < 1 || a->rag_maxjobs > 2048) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "invalid jobs per agent: %d (1-2048 expected)", - a->rag_maxjobs); - exit(1); - } + if (a->rag_maxjobs < 1 || a->rag_maxjobs > 2048) + LAMIGO_FATAL("invalid jobs per agent: %d (1-2048 expected)\n", + a->rag_maxjobs); + lipe_list_add(&a->rag_list, &lamigo_agent_list); a->rag_jobs = 0; @@ -1785,27 +1720,17 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs) /* ssh context per job, and one more for agent heartbeat */ for (i = 0; i < a->rag_maxjobs + 1; i++) { - struct resync_ssh_session *rss = calloc(1, sizeof(*rss)); + struct resync_ssh_session *rss = xcalloc(1, sizeof(*rss)); int rc; - if (!rss) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "can't allocate memory for agent ssh session\n"); - exit(1); - } - rc = lipe_ssh_context_init(&rss->rss_ctx, a->rag_hostname); - if (rc < 0) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "cannot create SSH context for '%s'\n", - 
a->rag_hostname); - exit(1); - } - + if (rc < 0) + LAMIGO_FATAL("cannot create SSH context for '%s': rc = %d\n", + a->rag_hostname, rc); lipe_list_add(&rss->rss_list, &a->rag_ssh_list); } - llapi_printf(LLAPI_MSG_DEBUG, "AGENT: %s %s %d\n", a->rag_hostname, + LAMIGO_DEBUG("AGENT: %s %s %d\n", a->rag_hostname, a->rag_mountpoint, a->rag_maxjobs); lamigo_agent_count++; @@ -1917,16 +1842,8 @@ struct pool_list *lamigo_alloc_pool(char *pool) { struct pool_list *pl; - pl = calloc(sizeof(*pl), 1); - if (pl == NULL) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't allocate pool"); - exit(1); - } - pl->pl_pool = strdup(pool); - if (pl->pl_pool == NULL) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't allocate pool name"); - exit(1); - } + pl = xcalloc(sizeof(*pl), 1); + pl->pl_pool = xstrdup(pool); pl->pl_ostnr = 0; pl->pl_osts = NULL; pthread_rwlock_init(&pl->pl_lock, NULL); @@ -1963,38 +1880,29 @@ void lamigo_refresh_osts_from_pool(struct pool_list *pl) int oldlevel; rc = cfs_get_param_paths(&paths, "lod/%s-*/numobd", fsname); - if (rc != 0) { - llapi_error(LLAPI_MSG_FATAL, errno, - "can't find numobd fs '%s'", fsname); - exit(1); - } + if (rc != 0) + LAMIGO_FATAL("cannot read OBD count from 'lod/%s-*/numobd': %s\n", + fsname, strerror(errno)); + for (i = 0; i < paths.gl_pathc; i++) { rc = lamigo_read_file(paths.gl_pathv[i], data, sizeof(data)); if (rc >= 0) { char *endptr; obdcount = strtol(data, &endptr, 10); - if (*endptr != '\0') { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "invalid numobd: '%s'", data); - exit(1); - } + if (*endptr != '\0') + LAMIGO_FATAL("invalid OBD count '%s'\n", data); + break; } } globfree(&paths); - if (obdcount < 0) { - llapi_error(LLAPI_MSG_FATAL, errno, "can't find fs '%s'", fsname); - exit(1); - } + if (obdcount < 0) + LAMIGO_FATAL("cannot find filesystem '%s'\n", fsname); bufsize = sizeof(struct obd_uuid) * obdcount; - buffer = malloc(bufsize + sizeof(*list) * obdcount); - if (buffer == NULL) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't get mem for 
pool members"); - exit(1); - } + buffer = xmalloc(bufsize + sizeof(*list) * obdcount); list = (char **) (buffer + bufsize); snprintf(poolname, sizeof(poolname), "%s.%s", fsname, pl->pl_pool); oldlevel = llapi_msg_get_level(); @@ -2013,11 +1921,7 @@ void lamigo_refresh_osts_from_pool(struct pool_list *pl) goto out; } if (pl->pl_osts == NULL) - pl->pl_osts = malloc(sizeof(int) * nb); - if (pl->pl_osts == NULL) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't allocate mem for OST ind"); - exit(1); - } + pl->pl_osts = xmalloc(sizeof(int) * nb); fslen = strlen(fsname); for (i = 0; i < nb; i++) { @@ -2088,62 +1992,41 @@ void lamigo_process_opt(int c, char *optarg) break; case LAMIGO_OPT_OFD_INTERVAL: opt.o_alr_ofd_interval = atoi(optarg); - if (opt.o_alr_ofd_interval < 1) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "invalid ofd interval '%s'", optarg); - exit(1); - } + if (opt.o_alr_ofd_interval < 1) + LAMIGO_FATAL("invalid ofd interval '%s'\n", optarg); break; case LAMIGO_OPT_HOT_FRACTION: opt.o_alr_hot_fraction = atoi(optarg); if (opt.o_alr_hot_fraction < 1 || - opt.o_alr_hot_fraction > 100) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "invalid hot fraction '%s'", optarg); - exit(1); - } + opt.o_alr_hot_fraction > 100) + LAMIGO_FATAL("invalid hot fraction '%s'\n", optarg); break; case LAMIGO_OPT_HOT_AFTER_IDLE: opt.o_alr_hot_after_idle = atoi(optarg); if (opt.o_alr_hot_after_idle < 1 || - opt.o_alr_hot_after_idle >= opt.o_alr_periods) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "invalid hot-after-idle '%s'", optarg); - exit(1); - } + opt.o_alr_hot_after_idle >= opt.o_alr_periods) + LAMIGO_FATAL("invalid hot-after-idle '%s'\n", optarg); break; case LAMIGO_OPT_MIRROR_CMD: opt.o_mirror_cmd = strdup(optarg); break; case LAMIGO_OPT_POOL_REFRESH: opt.o_pool_refresh = strtol(optarg, &endptr, 10); - if (*endptr != '\0' || opt.o_pool_refresh < 1) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "bad pool refresh interval '%s'", optarg); - exit(1); - } + if (*endptr != '\0' || 
opt.o_pool_refresh < 1) + LAMIGO_FATAL("invalid pool refresh interval '%s'\n", optarg); break; case LAMIGO_OPT_PROGRESS_INTV: opt.o_progress_interval = strtol(optarg, &endptr, 10); - if (*endptr != '\0' || opt.o_progress_interval < 1) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "bad progress interval '%s'", optarg); - exit(1); - } + if (*endptr != '\0' || opt.o_progress_interval < 1) + LAMIGO_FATAL("invalid progress interval '%s'\n", optarg); break; case LAMIGO_OPT_ALR_EXTRA_ARGS: opt.o_alr_extra_args = optarg; break; case LAMIGO_OPT_SRC_FREE: opt.o_src_free = atoi(optarg); - if (opt.o_src_free < 1 || opt.o_src_free > 99) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "bad source free space '%s'", optarg); - exit(1); - } + if (opt.o_src_free < 1 || opt.o_src_free > 99) + LAMIGO_FATAL("invalid source free space '%s'\n", optarg); break; case LAMIGO_OPT_SRC_DOM: opt.o_src_dom = 1; @@ -2153,26 +2036,19 @@ void lamigo_process_opt(int c, char *optarg) break; case LAMIGO_OPT_TGT_FREE: opt.o_tgt_free = atoi(optarg); - if (opt.o_tgt_free < 1 || opt.o_tgt_free > 99) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "bad target free space '%s'", optarg); - exit(1); - } + if (opt.o_tgt_free < 1 || opt.o_tgt_free > 99) + LAMIGO_FATAL("invalid target free space '%s'\n", optarg); break; case LAMIGO_OPT_VERSION: lipe_version(); - exit(0); + exit(EXIT_SUCCESS); case 'a': opt.o_min_age = strtol(optarg, &endptr, 10); - if (*endptr != '\0' || opt.o_min_age < 5) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "bad value for -a %s", optarg); - exit(1); - } + if (*endptr != '\0' || opt.o_min_age < 5) + LAMIGO_FATAL("invalid value for -a '%s'\n", optarg); break; case 'b': + lamigo_log_level = LLAPI_MSG_MAX; llapi_msg_set_level(LLAPI_MSG_MAX); break; case 'c': { @@ -2180,12 +2056,8 @@ void lamigo_process_opt(int c, char *optarg) rc = strsize2int(&cache_size, optarg); if (rc < 0 || cache_size <= 0 || - (cache_size >= 100 && cache_size < 1<<20)) { - rc = 
-EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "bad value for -c '%s'", optarg); - exit(1); - } + (cache_size >= 100 && cache_size < 1<<20)) + LAMIGO_FATAL("invalid cache size '%s'\n", optarg); /* For value < 100, it is taken as the percentage of * total memory instead. @@ -2194,7 +2066,7 @@ void lamigo_process_opt(int c, char *optarg) opt.o_cache_size = get_fid_cache_size(cache_size); else opt.o_cache_size = cache_size; - llapi_printf(LLAPI_MSG_INFO, "Cache size: %lu\n", opt.o_cache_size); + LAMIGO_INFO("cache size: %lu\n", opt.o_cache_size); break; } case 'f': @@ -2204,11 +2076,9 @@ void lamigo_process_opt(int c, char *optarg) host = strsep(&optarg, ":"); mnt = strsep(&optarg, ":"); jobs = strsep(&optarg, ":"); - if (!host || !mnt) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "invalid agent definition"); - exit(1); - } + if (!host || !mnt) + LAMIGO_FATAL("invalid agent definition\n"); + lamigo_add_agent(host, mnt, jobs); break; case 'h': @@ -2219,32 +2089,24 @@ void lamigo_process_opt(int c, char *optarg) enable_heat = 0; } else { opt.o_alr_heat_fn = atoi(optarg); - if (opt.o_alr_heat_fn < 0 || opt.o_alr_heat_fn > 1) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "invalid heat function '%s'", - optarg); - exit(1); - } + if (opt.o_alr_heat_fn < 0 || opt.o_alr_heat_fn > 1) + LAMIGO_FATAL("invalid heat function '%s'\n", optarg); } break; case 'I': opt.o_alr_hot_after_idle = atoi(optarg); break; case 'm': - opt.o_mdtname = strdup(optarg); + lamigo_mdt_name = xstrdup(optarg); + opt.o_mdtname = xstrdup(optarg); break; case 'M': opt.o_mntpt = strdup(optarg); break; case 'n': opt.o_num_threads = strtoul(optarg, NULL, 0); - if (opt.o_num_threads < 1) { - rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "invalid thread number: %d", - opt.o_num_threads); - exit(1); - } + if (opt.o_num_threads < 1) + LAMIGO_FATAL("invalid thread number: %d\n", opt.o_num_threads); break; case 'o': lamigo_add_alr_agent(optarg); @@ -2262,7 +2124,7 @@ void lamigo_process_opt(int c, char *optarg) 
opt.o_chlg_user = strdup(optarg); break; case 'v': - opt.o_verbose++; + lamigo_log_level++; break; case 'w': opt.o_dump_file = strdup(optarg); @@ -2272,13 +2134,11 @@ void lamigo_process_opt(int c, char *optarg) break; default: rc = -EINVAL; - llapi_error(LLAPI_MSG_ERROR, rc, - "%s: unknown option '-%c'\n", - program_invocation_short_name, - optopt); - fprintf(stderr, "Try '%s --help' for more information.\n", - program_invocation_short_name); - exit(1); + fprintf(stderr, + "%s: unrecognized option '-%c'\n" + "Try '%s --help' for more information.\n", + program_invocation_short_name, optopt, program_invocation_short_name); + exit(EXIT_FAILURE + 1); break; } } @@ -2304,11 +2164,8 @@ static void count_bracket_recursion(const char *str, int *counter) (*counter)++; else if (*p == '}') (*counter)--; - if (*counter < 0) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "invalid rule string"); - exit(1); - } + if (*counter < 0) + LAMIGO_FATAL("invalid rule '%s'\n", str); p++; } } @@ -2322,7 +2179,7 @@ char *stracat(char *src, char *dst) len += strlen(src); if (dst) len += strlen(dst); - n = malloc(len + 1); + n = xmalloc(len + 1); if (src) strcpy(n, src); if (dst) @@ -2379,11 +2236,9 @@ static void load_config(char *name) FILE *f; f = fopen(name, "r"); - if (!f) { - llapi_error(LLAPI_MSG_FATAL, errno, - "can't open config file %s", name); - exit(1); - } + if (!f) + LAMIGO_FATAL("cannot open config file '%s': %s\n", name, strerror(errno)); + while (!feof(f)) { struct option *opt; char *s, *t; @@ -2427,15 +2282,12 @@ static void load_config(char *name) opt->has_arg == optional_argument) { optarg = strsep(&s, "\n "); if (!optarg && - opt->has_arg == required_argument) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "no argument for %s", t); - exit(1); - } + opt->has_arg == required_argument) + LAMIGO_FATAL("option '%s' requires an argument\n", t); } else { optarg = NULL; } - llapi_printf(LLAPI_MSG_DEBUG, "conf: %s %s\n", t, optarg); + LAMIGO_DEBUG("conf: %s %s\n", t, optarg); 
lamigo_process_opt(opt->val, optarg); } @@ -2456,7 +2308,7 @@ void lamigo_parse_opts(int argc, char **argv) fprintf(stderr, "Try '%s --help' for more information.\n", program_invocation_short_name); - exit(1); + exit(EXIT_FAILURE + 1); } if (strcmp(options[opt_index].name, "mountpoint") == 0) llapi_err_noerrno(LLAPI_MSG_WARN, @@ -2464,66 +2316,48 @@ void lamigo_parse_opts(int argc, char **argv) lamigo_process_opt(c, optarg); } - if (!opt.o_mntpt) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "%s: no mount point specified\n", argv[0]); - exit(1); - } + if (!opt.o_mntpt) + LAMIGO_FATAL("no mount point specified\n"); rc = llapi_search_fsname(opt.o_mntpt, fsname); - if (rc < 0) { - llapi_error(LLAPI_MSG_ERROR, rc, - "cannot find a Lustre file system mounted at '%s'", - opt.o_mntpt); - exit(1); - } + if (rc < 0) + LAMIGO_FATAL("cannot find a Lustre file system mounted at '%s'\n", + opt.o_mntpt); + + if (!opt.o_mdtname) + LAMIGO_FATAL("no MDT specified\n"); - if (!opt.o_mdtname) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "no MDT specified"); - exit(1); - } rc = cfs_get_param_paths(&paths, "mdt/%s/uuid", opt.o_mdtname); - if (rc != 0) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't find MDT %s", opt.o_mdtname); - exit(1); - } + if (rc != 0) + LAMIGO_FATAL("cannot find MDT uuid from 'mdt/%s/uuid': %s\n", + opt.o_mdtname, strerror(errno)); + globfree(&paths); snprintf(buf, sizeof(buf), "%s/.lustre/fid", opt.o_mntpt); open_by_fid_fd = open(buf, O_RDONLY); - if (open_by_fid_fd < 0) { - llapi_error(LLAPI_MSG_FATAL, errno, "can't open '%s'", buf); - exit(1); - } + if (open_by_fid_fd < 0) + LAMIGO_FATAL("cannot open '%s': %s\n", buf, strerror(errno)); if (src_pools == NULL) { lamigo_parse_pool(DEF_SOURCE_POOL); - llapi_err_noerrno(LLAPI_MSG_FATAL, - "source pools aren't defined, use %s", - DEF_SOURCE_POOL); + LAMIGO_WARN("source pools aren't defined, using '%s'\n", DEF_SOURCE_POOL); } if (opt.o_tgt_pool == NULL) { opt.o_tgt_pool = DEF_TARGET_POOL; - llapi_err_noerrno(LLAPI_MSG_INFO, - 
"target pool not defined, use %s", - opt.o_tgt_pool); + LAMIGO_WARN("target pool is not defined, using %s\n", opt.o_tgt_pool); } opt.o_tgt_pool_len = strlen(opt.o_tgt_pool); - if (lamigo_lookup_pool(opt.o_tgt_pool)) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "target pool '%s' cannot also be source pool", + if (lamigo_lookup_pool(opt.o_tgt_pool)) + LAMIGO_FATAL("target pool '%s' cannot also be source pool\n", opt.o_tgt_pool); - exit(1); - } - if (lipe_list_empty(&lamigo_agent_list)) { - llapi_err_noerrno(LLAPI_MSG_ERROR, "no agents configured?"); - exit(1); - } + if (lipe_list_empty(&lamigo_agent_list)) + LAMIGO_FATAL("no agents configured\n"); - llapi_printf(LLAPI_MSG_DEBUG, "target pool: %s/%d\n", opt.o_tgt_pool, + LAMIGO_DEBUG("target pool: %s/%d\n", opt.o_tgt_pool, opt.o_tgt_pool_len); tgt_pools = lamigo_alloc_pool(opt.o_tgt_pool); @@ -2541,11 +2375,8 @@ void lamigo_parse_opts(int argc, char **argv) opt.o_batch_sync_cnt = opt.o_cached_fid_hiwm / 2; rc = pipe2(lamigo_sigpipe, O_NONBLOCK); - if (rc < 0) { - llapi_error(LLAPI_MSG_FATAL, errno, - "cannot create sigpipe"); - exit(1); - } + if (rc < 0) + LAMIGO_FATAL("cannot create sigpipe: %s\n", strerror(errno)); } static void lamigo_wait_for_job_completion(int timeout) @@ -2575,11 +2406,7 @@ static int lamigo_create_job(struct lu_fid *fid, { struct resync_job *rj; - rj = calloc(1, sizeof(struct resync_job)); - if (rj == NULL) { - llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job"); - return 1; - } + rj = xcalloc(1, sizeof(*rj)); rj->rj_fid = *fid; rj->rj_stripes = mo->mo_stripes; rj->rj_resync = resync; @@ -2818,7 +2645,7 @@ int lamigo_rescan(void) diff_timevals(&result.sr_time_start, &result.sr_time_end, &result.sr_time_diff); - llapi_printf(LLAPI_MSG_DEBUG, "finished scanning in %d.%06u seconds\n", + LAMIGO_DEBUG("finished scanning in %d.%06u seconds\n", (int)result.sr_time_diff.tv_sec, (unsigned int)result.sr_time_diff.tv_usec); @@ -2833,10 +2660,9 @@ static void 
lamigo_changelog_check_and_set_mask(void) rc = cfs_get_param_paths(&paths, "mdd/%s/changelog_mask", opt.o_mdtname); - if (rc != 0 || paths.gl_pathc != 1) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't find changelog mask"); - exit(1); - } + if (rc != 0 || paths.gl_pathc != 1) + LAMIGO_FATAL("cannot find changelog mask: %s\n", strerror(errno)); + rc = lamigo_read_file(paths.gl_pathv[0], buf, sizeof(buf)); globfree(&paths); @@ -2856,12 +2682,9 @@ static void lamigo_changelog_check_and_set_mask(void) "lctl set_param -n mdd.%s.changelog_mask=+\"CLOSE UNLNK\"", opt.o_mdtname); rc = system(buf); - if (rc < 0) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "can't enable CLOSE/UNLNK in changelog: rc=%d", - rc); - exit(1); - } + if (rc < 0) + LAMIGO_FATAL("cannot enable CLOSE/UNLNK in changelog: rc = %d\n", rc); + llapi_err_noerrno(LLAPI_MSG_INFO, "enable CLOSE/UNLNK in changelog"); } @@ -2878,15 +2701,13 @@ static int lamigo_check_changelog_user(const char *user) rc = cfs_get_param_paths(&paths, "mdd/%s/changelog_users", opt.o_mdtname); - if (rc != 0 || paths.gl_pathc != 1) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't find changelog users"); - exit(1); - } + if (rc != 0 || paths.gl_pathc != 1) + LAMIGO_FATAL("can't find changelog users\n"); + rc = lamigo_read_file(paths.gl_pathv[0], buf, sizeof(buf)); - if (rc < 0) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't get changelog users"); - exit(1); - } + if (rc < 0) + LAMIGO_FATAL("can't get changelog users\n"); + rc = -1; s = buf; /* skip current index line */ @@ -2928,28 +2749,21 @@ again: if (!rc) { /* found, use it */ opt.o_chlg_user = strdup(user); - llapi_printf(LLAPI_MSG_DEBUG, - "found Changelog user '%s' in '%s'\n", + LAMIGO_DEBUG("found Changelog user '%s' in '%s'\n", user, buf); return; } } - if (registered) { - /* can't find just registered changelog user */ - llapi_err_noerrno(LLAPI_MSG_FATAL, - "can't find registered Changelog user '%s'", - user); - exit(1); - } + if (registered) + LAMIGO_FATAL("cannot find just 
registered Changelog user '%s'\n", user); /* try one from the config file */ if (opt.o_chlg_user) { rc = lamigo_check_changelog_user(opt.o_chlg_user); if (!rc) { /* found, use it */ - llapi_printf(LLAPI_MSG_DEBUG, - "found Changelog user '%s' from config\n", + LAMIGO_DEBUG("found Changelog user '%s' from config\n", opt.o_chlg_user); return; } @@ -2961,12 +2775,10 @@ again: "lctl --device %s changelog_register -n >"LAMIGO_USERFILE, opt.o_mdtname, opt.o_mdtname); rc = system(buf); - if (rc < 0) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "Changelog user '%s' is not registered", + if (rc < 0) + LAMIGO_FATAL("changelog user '%s' is not registered\n", opt.o_chlg_user); - exit(1); - } + registered = true; /* if a new changelog user was just registered, either this is the * first time lamigo was run on the filesystem, or it has been some @@ -2992,13 +2804,12 @@ void lamigo_show_progress(void) return; progress_last_processed = stats.s_processed; - llapi_printf(LLAPI_MSG_INFO, - "%lu processed, %lu replicated, %lu busy, %lu in queue, " - "%lu hot skipped, %lu ro2hot, %lu rw2hot, %lu rw2cold\n", - stats.s_processed, stats.s_replicated, stats.s_busy, - stats.s_skip_hot, stats.s_replicate_ro2hot, - stats.s_replicate_rw2hot, stats.s_replicate_rw2cold, - head.lh_cached_count); + LAMIGO_INFO("%lu processed, %lu replicated, %lu busy, %lu in queue, " + "%lu hot skipped, %lu ro2hot, %lu rw2hot, %lu rw2cold\n", + stats.s_processed, stats.s_replicated, stats.s_busy, + stats.s_skip_hot, stats.s_replicate_ro2hot, + stats.s_replicate_rw2hot, stats.s_replicate_rw2cold, + head.lh_cached_count); } static void lamigo_lock_pidfile(void) @@ -3008,10 +2819,9 @@ static void lamigo_lock_pidfile(void) snprintf(buf, sizeof(buf), LAMIGO_PIDFILE, opt.o_mdtname); fd = open(buf, O_RDWR | O_CREAT, 0600); - if (fd < 0) { - llapi_error(LLAPI_MSG_FATAL, errno, "can't create pidfile"); - exit(1); - } + if (fd < 0) + LAMIGO_FATAL("cannot create pidfile '%s': %s\n", buf, strerror(errno)); + rc = flock(fd, 
LOCK_EX | LOCK_NB); if (rc < 0) { sz = read(fd, buf, sizeof(buf)); @@ -3020,24 +2830,18 @@ static void lamigo_lock_pidfile(void) sz = 0; if (sz > 0) buf[sz] = 0; - llapi_err_noerrno(LLAPI_MSG_FATAL, - "another lamigo is running, locked by %s", + LAMIGO_FATAL("another lamigo is running, locked by %s\n", sz > 0 ? buf : "[unknown]"); - exit(1); } rc = ftruncate(fd, 0); - if (rc < 0) { - llapi_error(LLAPI_MSG_FATAL, errno, "cannot truncate pidfile"); - exit(1); - } + if (rc < 0) + LAMIGO_FATAL("cannot truncate pidfile: %s\n", strerror(errno)); sz = snprintf(buf, sizeof(buf), "%d\n", getpid()); rc = write(fd, buf, sz); - if (rc < 0 || rc != sz) { - llapi_error(LLAPI_MSG_FATAL, rc, "can't write pidfile"); - exit(1); - } + if (rc < 0 || rc != sz) + LAMIGO_ERROR("cannot write pidfile: %s\n", rc < 0 ? strerror(errno) : "short write"); } static void lamigo_process_changelog(void) @@ -3085,14 +2889,11 @@ again: if (rc < 0) { int i; - llapi_error(LLAPI_MSG_ERROR, rc, - "failed to process record"); + LAMIGO_ERROR("cannot to process changelog record: %s\n", strerror(-rc)); rc = llapi_changelog_fini(&chglog_hdlr); - if (rc) { - llapi_error(LLAPI_MSG_FATAL, rc, - "cannot fini changelog"); - exit(1); - } + if (rc) + LAMIGO_FATAL("cannot fini changelog: %s\n", strerror(-rc)); + i = 0; do { /* do not reopen too frequently */ @@ -3106,13 +2907,11 @@ again: CHANGELOG_FLAG_EXTRA_FLAGS, opt.o_mdtname, 0); } while (i++ < 5 && rc != 0); - if (rc) { - llapi_error(LLAPI_MSG_ERROR, rc, - "unable to reopen changelog of MDT [%s]", - opt.o_mdtname); - exit(1); - } - llapi_printf(LLAPI_MSG_DEBUG, "Reopened changelog\n"); + + if (rc) + LAMIGO_FATAL("cannot reopen changelog: %s\n", strerror(-rc)); + + LAMIGO_DEBUG("Reopened changelog\n"); goto again; } } @@ -3126,32 +2925,24 @@ void lamigo_parse_rules(const char *rule_str, const char *filename) int i, rc; tok = json_tokener_new(); - if (!tok) { - llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO , -1, - "cannot allocate json token"); - exit(1); - } + 
if (!tok) + LAMIGO_OOM(-1); obj_top = json_tokener_parse_ex(tok, rule_str, strlen(rule_str)); if (obj_top == NULL) { enum json_tokener_error jerr; jerr = json_tokener_get_error(tok); - llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, -1, - "cannot parse rules in %s: %s - %s", - filename, rule_str, json_tokener_error_desc(jerr)); - exit(1); + + LAMIGO_FATAL("cannot parse rule '%s' in '%s': %s\n", + rule_str, filename, json_tokener_error_desc(jerr)); } rc = json_object_object_get_ex(obj_top, LIPE_CONFIG_RULES, &obj_rules); - if (!rc) { - llapi_error(LLAPI_MSG_ERROR, rc, "no rules in %s", filename); - exit(1); - } - if (json_object_get_type(obj_rules) != json_type_array) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "rules are not an array in %s", filename); - exit(1); - } + if (!rc) + LAMIGO_FATAL("no rules in '%s'\n", filename); + + if (json_object_get_type(obj_rules) != json_type_array) + LAMIGO_FATAL("rules in '%s' are not an array\n", filename); for (i = 0; i < json_object_array_length(obj_rules); i++) { struct json_object *obj_action, *obj_expr; @@ -3161,71 +2952,47 @@ void lamigo_parse_rules(const char *rule_str, const char *filename) obj_rule = json_object_array_get_idx(obj_rules, i); - if (!obj_rule) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "failed to get rule #%d", i); - exit(1); - } + if (!obj_rule) + LAMIGO_FATAL("failed to get rule #%d\n", i); rc = json_object_object_get_ex(obj_rule, LIPE_CONFIG_ACTION, &obj_action); - if (!rc) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "no action in rule %s", + if (!rc) + LAMIGO_FATAL("no action in rule %s\n", json_object_to_json_string(obj_rule)); - exit(1); - } + action = json_object_get_string(obj_action); - if (!action) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "invalid action in rule %s", + if (!action) + LAMIGO_FATAL("invalid action in rule %s\n", json_object_to_json_string(obj_rule)); - exit(1); - } + rc = json_object_object_get_ex(obj_rule, LIPE_CONFIG_EXPRESSION, &obj_expr); - if (!rc) { - 
llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "no expression in rule %s", - json_object_to_json_string(obj_rule)); - exit(1); - } + if (!rc) + LAMIGO_FATAL("no expression in rule %s\n", + json_object_to_json_string(obj_rule)); + expr = json_object_get_string(obj_expr); - if (!expr) { - llapi_error(LLAPI_MSG_ERROR, -EINVAL, - "invalid expression in rule %s", - json_object_to_json_string(obj_rule)); - exit(1); - } + if (!expr) + LAMIGO_FATAL("invalid expression in rule %s\n", + json_object_to_json_string(obj_rule)); - LIPE_ALLOC_PTR(lr); - if (!lr) { - llapi_error(LLAPI_MSG_ERROR, rc, - "cannot allocate rule %s", - json_object_to_json_string(obj_rule)); - exit(1); - } + lr = xcalloc(1, sizeof(*lr)); if (!strcmp(action, "skip")) lr->lr_action.la_action = LAT_COUNTER_INC; else if (!strcmp(action, "mirror")) lr->lr_action.la_action = LAT_SHELL_CMD_FID; - else { - llapi_error(LLAPI_MSG_ERROR, 0, - "unknown action '%s' in rule %s", action, - json_object_to_json_string(obj_rule)); - exit(1); - } + else + LAMIGO_FATAL("unknown action '%s' in rule '%s'\n", + action, json_object_to_json_string(obj_rule)); LIPE_INIT_LIST_HEAD(&lr->lr_values); rc = lipe_policy_value_init(&lr->lr_values, &lr->lr_expression, &valid, expr); - if (rc) { - llapi_error(LLAPI_MSG_ERROR, rc, - "cannot parse expression in rule %s", - json_object_to_json_string(obj_rule)); - exit(1); - } + if (rc) + LAMIGO_FATAL("cannot parse expression in rule %s\n", strerror(-rc)); + lipe_list_add_tail(&lr->lr_linkage, &lamigo_rule_list); lamigo_rule_attrs |= lr->lr_attr_bits; } @@ -3233,39 +3000,65 @@ void lamigo_parse_rules(const char *rule_str, const char *filename) json_object_put(obj_top); } -static void lamigo_register_signal_handlers(void) +static void *lamigo_signal_thread_start(void *arg) { - struct sigaction sigterm_action = { - .sa_handler = &lamigo_sigterm_handler, - }; - struct sigaction sigusr1_action = { - .sa_handler = &lamigo_sigusr1_handler, - .sa_flags = SA_RESTART, - }; - struct sigaction sigusr2_action = 
{ - .sa_handler = &lamigo_sigusr2_handler, - .sa_flags = SA_RESTART, - }; + sigset_t *set = arg; + int sig; + int rc; + + while (1) { + rc = sigwait(set, &sig); + /* + * RETURN VALUE + * + * On success, sigwait() returns 0. On error, it + * returns a positive error number (listed in + * ERRORS). + * + * ERRORS + * EINVAL set contains an invalid signal number. + */ + if (rc != 0) { + LAMIGO_ERROR("signal wait failed: %s\n", strerror(rc)); + continue; + } - sigemptyset(&sigterm_action.sa_mask); - sigemptyset(&sigusr1_action.sa_mask); - sigemptyset(&sigusr2_action.sa_mask); + LAMIGO_DEBUG("received signal %d\n", sig); - sigaction(SIGTERM, &sigterm_action, NULL); - sigaction(SIGUSR1, &sigusr1_action, NULL); - sigaction(SIGUSR2, &sigusr2_action, NULL); + switch (sig) { + case SIGUSR1: + lamigo_dump_stats_file(); + break; + case SIGUSR2: + lamigo_dump_heat_file(); + break; + default: + LAMIGO_INFO("received signal %d, exiting\n", sig); + exit(EXIT_SUCCESS); + } + } } int main(int argc, char **argv) { - int rc; - bool stop = 0; - int ret = 0; - pthread_t pid; + pthread_t lamigo_refresh_statfs_thread_id; + pthread_t lamigo_signal_thread_id; + sigset_t sigset; + int rc; - /* Ignore SIGUSR1 and SIGUSR2 until we are setup. */ - signal(SIGUSR1, SIG_IGN); - signal(SIGUSR2, SIG_IGN); + /* We will handle signals in a dedicated thread. 
*/ + sigemptyset(&sigset); + sigaddset(&sigset, SIGTERM); + sigaddset(&sigset, SIGUSR1); + sigaddset(&sigset, SIGUSR2); + + rc = pthread_sigmask(SIG_BLOCK, &sigset, NULL); + if (rc != 0) + LAMIGO_FATAL("cannot set signal mask: %s\n", strerror(rc)); + + rc = pthread_create(&lamigo_signal_thread_id, NULL, &lamigo_signal_thread_start, &sigset); + if (rc != 0) + LAMIGO_FATAL("cannot start signal thread: %s\n", strerror(rc)); lipe_version_init(); ssh_threads_set_callbacks(ssh_threads_get_pthread()); @@ -3273,7 +3066,7 @@ int main(int argc, char **argv) setlinebuf(stdout); setlinebuf(stderr); - llapi_msg_set_level(opt.o_verbose); + llapi_msg_set_level(lamigo_log_level); lamigo_parse_opts(argc, argv); @@ -3281,15 +3074,12 @@ int main(int argc, char **argv) * followed by the MDT name ("lamigo lustre-MDT0000"). */ llapi_set_command_name(opt.o_mdtname); - llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, - "version %s-%s, revision %s", + LAMIGO_INFO("version %s-%s, revision %s\n", PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION); rc = lamigo_init_cache(); - if (rc < 0) { - llapi_err_noerrno(LLAPI_MSG_FATAL, "can't init cache\n"); - exit(1); - } + if (rc < 0) + LAMIGO_FATAL("cannot init cache\n"); /* create and lock pidfile to protect against another instance */ lamigo_lock_pidfile(); @@ -3297,9 +3087,8 @@ int main(int argc, char **argv) /* wait till the target pool got one OST at least */ lamigo_refresh_osts_from_pool(tgt_pools); while (tgt_pools->pl_ostnr == 0) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "Target pool %s is empty, waiting...", - tgt_pools->pl_pool); + LAMIGO_ERROR("target pool '%s' is empty, waiting %d seconds\n", + tgt_pools->pl_pool, opt.o_pool_refresh); sleep(opt.o_pool_refresh); lamigo_refresh_osts_from_pool(tgt_pools); } @@ -3312,40 +3101,30 @@ int main(int argc, char **argv) /* start heat collection and maintaining */ lamigo_alr_init(); - rc = pthread_create(&pid, NULL, lamigo_refresh_statfs_thread, NULL); - if (rc) { - llapi_error(LLAPI_MSG_FATAL, rc, - 
"unable to start statfs thread"); - exit(1); - } + rc = pthread_create(&lamigo_refresh_statfs_thread_id, NULL, lamigo_refresh_statfs_thread, NULL); + if (rc != 0) + LAMIGO_FATAL("cannot start statfs thread: %s\n", strerror(rc)); if (opt.o_rescan) { /* scan the whole MDT and replicate matched files */ - ret = lamigo_rescan(); + rc = lamigo_rescan(); + if (rc < 0) + LAMIGO_FATAL("cannot scan device: %s\n", strerror(-rc)); } - llapi_printf(LLAPI_MSG_DEBUG, "Start receiving records\n"); + LAMIGO_DEBUG("Start receiving records\n"); rc = llapi_changelog_start(&chglog_hdlr, CHANGELOG_FLAG_FOLLOW | CHANGELOG_FLAG_BLOCK | CHANGELOG_FLAG_JOBID | CHANGELOG_FLAG_EXTRA_FLAGS, opt.o_mdtname, 0); - if (rc) { - /* XXX: probably keep trying in some cases? */ - llapi_error(LLAPI_MSG_ERROR, rc, - "unable to open changelog of MDT [%s]", - opt.o_mdtname); - ret = rc; - goto out; - } - - llapi_printf(LLAPI_MSG_INFO, "started\n"); - - lamigo_register_signal_handlers(); + if (rc < 0) + LAMIGO_FATAL("cannot open changelog: %s\n", strerror(-rc)); - while (!stop) { + LAMIGO_INFO("started\n"); + while (1) { if (head.lh_cached_count < opt.o_cached_fid_hiwm) lamigo_process_changelog(); else @@ -3357,8 +3136,8 @@ int main(int argc, char **argv) if (!are_agents_busy()) { rc = lamigo_check_sync(); if (rc < 0) { - stop = true; - ret = rc; + LAMIGO_ERROR("check sync failed: rc = %d\n", rc); + sleep(1); } } @@ -3367,28 +3146,6 @@ int main(int argc, char **argv) lamigo_check_bad_agents(); lamigo_show_progress(); } - - /* wait for all jobs to complete */ - while (lamigo_jobs_running) { - lamigo_wait_for_job_completion(10); - lamigo_check_jobs(); - lamigo_check_and_clear_changelog(); - } - - rc = llapi_changelog_fini(&chglog_hdlr); - if (rc) { - llapi_error(LLAPI_MSG_ERROR, rc, - "unable to close changelog of MDT [%s]", - opt.o_mdtname); - ret = rc; - } - -out: - lamigo_cleanup(); - llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, "exited\n"); - lipe_version_fini(); - - return ret; } void 
lamigo_alr_mirror_cb(struct resync_job *rj, void *cbdata, int rc) @@ -3404,11 +3161,7 @@ static void lamigo_new_job_for_hot(struct lu_fid *fid, enum amigo_resync_type sy struct resync_job *rj; int rc; - rj = calloc(1, sizeof(struct resync_job)); - if (rj == NULL) { - llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job"); - return; - } + rj = xcalloc(1, sizeof(*rj)); rj->rj_fid = *fid; rj->rj_stripes = stripes; rj->rj_index = 0; @@ -3431,16 +3184,14 @@ static int lamigo_check_hot_one(struct alr_heat *ht) struct mirror_opts mo = { 0 }; int sync; - llapi_printf(LLAPI_MSG_DEBUG, - "check hot "DFID": H: %Lu/%Lu, P: %Lu/%Lu, " + LAMIGO_DEBUG("check hot "DFID": H: %Lu/%Lu, P: %Lu/%Lu, " "L %d, I %d %s\n", PFID(&ht->ah_fid), ht->ah_heat[0], ht->ah_heat[1], ht->ah_pools[0], ht->ah_pools[1], ht->ah_livetime, ht->ah_idle, ht->ah_mark ? "M" : ""); if (ht->ah_mark & ALR_TAG_PROCESSED) { /* already tried to replicate */ - llapi_printf(LLAPI_MSG_DEBUG, - DFID" tried to replicate already\n", + LAMIGO_DEBUG(DFID" tried to replicate already\n", PFID(&ht->ah_fid)); return 0; } @@ -3453,7 +3204,7 @@ static int lamigo_check_hot_one(struct alr_heat *ht) if (ht->ah_heat[0] && ht->ah_heat[1] == 0 && ht->ah_pools[0] == 0 && ht->ah_pools[1]) { sync = lamigo_is_in_sync(&ht->ah_fid, tgt_pools, src_pools, &mo); - llapi_printf(LLAPI_MSG_DEBUG, "try to replicate RO "DFID": %d\n", + LAMIGO_DEBUG("try to replicate RO "DFID": %d\n", PFID(&ht->ah_fid), sync); if (sync != AMIGO_RESYNC_NONE) { lamigo_new_job_for_hot(&ht->ah_fid, sync, src_pools, @@ -3472,7 +3223,7 @@ static int lamigo_check_hot_one(struct alr_heat *ht) if (ht->ah_idle > 0 && ht->ah_heat[1] && ht->ah_pools[0] == 0 && ht->ah_pools[1]) { sync = lamigo_is_in_sync(&ht->ah_fid, tgt_pools, src_pools, &mo); - llapi_printf(LLAPI_MSG_DEBUG, "try to replicate RW "DFID": %d\n", + LAMIGO_DEBUG("try to replicate RW "DFID": %d\n", PFID(&ht->ah_fid), sync); if (sync != AMIGO_RESYNC_NONE) { lamigo_new_job_for_hot(&ht->ah_fid, sync, src_pools, 
@@ -3498,7 +3249,7 @@ static void lamigo_check_hot_on_cold(struct alr_heat *ht) ht->ah_pools[1] == 0 && ht->ah_pools[0]) { sync = lamigo_is_in_sync(&ht->ah_fid, src_pools, tgt_pools, &mo); - llapi_printf(LLAPI_MSG_DEBUG, + LAMIGO_DEBUG( "replicate idling hot to CP "DFID": %d\n", PFID(&ht->ah_fid), sync); if (sync != AMIGO_RESYNC_NONE) { @@ -3524,13 +3275,7 @@ struct alr_heat *lamigo_get_hot(int period, int *nr) return NULL; /* XXX: limit number of hot files to check? */ - ht = calloc(*nr + 1, sizeof(*ht)); - if (!ht) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "allocation for ht failed"); - return NULL; - } - + ht = xcalloc(*nr + 1, sizeof(*ht)); i = lamigo_alr_get_hot_files(period, ht, *nr, ALR_TAG_NO_ACCT | ALR_TAG_REPLICATED); if (i == 0) { @@ -3561,7 +3306,7 @@ static void lamigo_check_hot(void) if (src_pools->pl_open) { /* get most recent hot files */ ht = lamigo_get_hot(alr_hot_period, &nr); - llapi_printf(LLAPI_MSG_DEBUG, "check hot in period %lu - %d\n", + LAMIGO_DEBUG("check hot in period %lu - %d\n", alr_hot_period, nr); if (ht) { for (i = 0; i < nr; i++) @@ -3576,15 +3321,14 @@ static void lamigo_check_hot(void) /* now check hot idling files - the files we found hot and * skipped replication. 
now it's time to try again */ ht = lamigo_get_hot(alr_hot_period - opt.o_alr_hot_after_idle, &nr); - llapi_printf(LLAPI_MSG_DEBUG, "check idle in period %lu - %d\n", + LAMIGO_DEBUG("check idle in period %lu - %d\n", alr_hot_period - 3, nr); if (!ht) goto out; for (i = 0; i < nr; i++) { struct alr_heat *ah = ht + i; - llapi_printf(LLAPI_MSG_DEBUG, - "idle "DFID": P: %Lu/%Lu, live %d, idle %d\n", + LAMIGO_DEBUG("idle "DFID": P: %Lu/%Lu, live %d, idle %d\n", PFID(&ah->ah_fid), ah->ah_pools[0], ah->ah_pools[1], ah->ah_livetime, ah->ah_idle); if (src_pools->pl_open) @@ -3614,7 +3358,7 @@ static __u64 lamigo_read_osp_param(const int ostidx, const char *param) fd = open(path, O_RDONLY); if (fd < 0) { - llapi_error(LLAPI_MSG_ERROR, errno, "cannot open '%s'", path); + LAMIGO_ERROR("cannot open '%s': %s\n", path, strerror(errno)); /* 0 means non-available OST */ return 0; } @@ -3622,7 +3366,7 @@ static __u64 lamigo_read_osp_param(const int ostidx, const char *param) if (rc > 0) retval = strtoul(buf, NULL, 10); if (rc < 0) - llapi_error(LLAPI_MSG_ERROR, errno, "cannot read '%s'", path); + LAMIGO_ERROR("cannot read '%s': %s\n", path, strerror(errno)); close(fd); /* report zero if something went wrong @@ -3647,8 +3391,7 @@ static void lamigo_refresh_pool_statfs(struct pool_list *pl, int threshold) /* check OSP is active */ active = lamigo_read_osp_param(ostidx, "active"); status = lamigo_read_osp_param(ostidx, "prealloc_status"); - llapi_printf(LLAPI_MSG_DEBUG, - "statfs for %d%s/%d: %llu from %llu\n", + LAMIGO_DEBUG("statfs for %d%s/%d: %llu from %llu\n", ostidx, active ? 
"(active)" : "(inactive)", (int)status, kbavail, kbtotal); if (!active || status) @@ -3665,8 +3408,7 @@ static void lamigo_refresh_pool_statfs(struct pool_list *pl, int threshold) pl->pl_open = true; /* whether pool is good for replicas */ - llapi_printf(LLAPI_MSG_DEBUG, - "statfs for %s %s pool: %llu from %llu, thresh %llu\n", + LAMIGO_DEBUG("statfs for %s %s pool: %llu from %llu, thresh %llu\n", pl->pl_open ? "open" : "closed", pl->pl_pool, tavail, ttotal, ttotal * threshold / 100); } @@ -3700,19 +3442,15 @@ static void *lamigo_refresh_statfs_thread(void *arg) int rc; str = strstr(opt.o_mdtname, "-MDT"); - if (!str) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "failed to get MDT index from %s\n", - opt.o_mdtname); - exit(1); - } + if (!str) + LAMIGO_FATAL("cannot get MDT index from '%s'\n", opt.o_mdtname); + lamigo_mdtidx = strtoul(str + 4, NULL, 16); rc = cfs_get_param_paths(&paths, "osp"); - if (rc != 0) { - llapi_error(LLAPI_MSG_FATAL, rc, "can't find OSP root"); - exit(1); - } + if (rc != 0) + LAMIGO_FATAL("cannot find OSP root: %s\n", strerror(errno)); + osproot = strdup(paths.gl_pathv[0]); globfree(&paths); diff --git a/lipe/src/lamigo.h b/lipe/src/lamigo.h index 87db237..343c641 100644 --- a/lipe/src/lamigo.h +++ b/lipe/src/lamigo.h @@ -5,6 +5,11 @@ #ifndef _LAMIGO_H_ #define _LAMIGO_H_ +#include +#include +#include +#include + void lamigo_add_alr_agent(const char *host); void lamigo_alr_init(void); @@ -71,7 +76,6 @@ struct options { int o_src_pool_len; char *o_tgt_pool; int o_tgt_pool_len; - int o_verbose; int o_min_age; unsigned long o_cached_fid_hiwm; /* high watermark */ unsigned long o_cache_size; @@ -102,7 +106,7 @@ struct options { int o_src_dom; char *o_heat_file; }; -extern struct options opt; +extern struct options opt; /* opt is not a good global variable name. 
*/ extern int enable_heat; extern unsigned long alr_period; @@ -115,4 +119,79 @@ void lamigo_alr_dump_heat_table(FILE *file); void lamigo_parse_rules(const char *rule_str, const char *filename); +extern int lamigo_log_level; /* enum llapi_message_level */ +extern const char *lamigo_mdt_name; + +/* lamigo runs as a systemd service. So whatever it prints to stderr + * (or stdout) will be collected by journald and added to the + * logs. journald will add a lamigo[$PID] prefix. So when we print a + * debug message we don't need an lamigo prefix but we do want to + * include the MDT name. In this way we get + * + * Sep 22 12:51:10 $HOSTNAME lamigo[24074]: lustre-MDT0001: blah blah +*/ +#define LAMIGO_PRINT(level, fmt, args...) \ + do { \ + if (level <= lamigo_log_level) \ + fprintf(stderr, "%s: " fmt, lamigo_mdt_name, ##args); \ + } while (0) + +#define LAMIGO_DEBUG(fmt, args...) \ + LAMIGO_PRINT(LLAPI_MSG_DEBUG, "DEBUG: " fmt, ##args) + +#define LAMIGO_INFO(fmt, args...) \ + LAMIGO_PRINT(LLAPI_MSG_INFO, "INFO: " fmt, ##args) + +#define LAMIGO_WARN(fmt, args...) \ + LAMIGO_PRINT(LLAPI_MSG_WARN, "WARN: " fmt, ##args) + +#define LAMIGO_ERROR(fmt, args...) \ + LAMIGO_PRINT(LLAPI_MSG_ERROR, "ERROR: " fmt, ##args) + +#define LAMIGO_FATAL(fmt, args...)
\ + do { \ + LAMIGO_PRINT(LLAPI_MSG_FATAL, "FATAL: " fmt, ##args); \ + exit(EXIT_FAILURE); \ + } while (0) + +#define LAMIGO_OOM_AT(file, line, func, size) \ + LAMIGO_FATAL("out of memory at (%s:%d:%s), size = %zd\n", (file), (line), (func), (ssize_t)(size)) + +#define LAMIGO_OOM(size) \ + LAMIGO_OOM_AT(__FILE__, __LINE__, __func__, (size)) + +static inline void *xmalloc1(const char *file, int line, const char *func, size_t size) +{ + void *ptr = malloc(size); + + if (ptr == NULL && size != 0) + LAMIGO_OOM_AT(file, line, func, size); + + return ptr; +} + +static inline void *xcalloc1(const char *file, int line, const char *func, size_t nmemb, size_t size) +{ + void *ptr = calloc(nmemb, size); + + if (ptr == NULL && (nmemb * size) != 0) + LAMIGO_OOM_AT(file, line, func, (nmemb * size)); + + return ptr; +} + +static inline void *xstrdup1(const char *file, int line, const char *func, const char *s) +{ + void *ptr = strdup(s); + + if (ptr == NULL) + LAMIGO_OOM_AT(file, line, func, strlen(s) + 1); + + return ptr; +} + +#define xmalloc(size) (xmalloc1(__FILE__, __LINE__, __func__, (size))) +#define xcalloc(nmemb, size) (xcalloc1(__FILE__, __LINE__, __func__, (nmemb), (size))) +#define xstrdup(s) (xstrdup1(__FILE__, __LINE__, __func__, (s))) + #endif diff --git a/lipe/src/lamigo_alr.c b/lipe/src/lamigo_alr.c index 2e61fd2..1e675a9 100644 --- a/lipe/src/lamigo_alr.c +++ b/lipe/src/lamigo_alr.c @@ -145,12 +145,7 @@ static void lamigo_alr_update_one(struct lu_fid *fid, enum alr_rw rw, { struct alr_rec_temp *t; - t = calloc(sizeof(*t), 1); - if (!t) { - llapi_error(LLAPI_MSG_FATAL, -ENOMEM, - "failed to alloc memory for alr_rec_temp"); - exit(1); - } + t = xcalloc(sizeof(*t), 1); t->art_fid = *fid; t->art_ops = ops; t->art_rw = rw; @@ -494,13 +489,7 @@ static void lamigo_alr_update_heat_all(void) p->alp_avg[0] = asum[0] / nr; p->alp_avg[1] = asum[1] / nr; - sa = malloc(sizeof(*sa) * nr); - if (!sa) { - /* XXX: better handling */ - 
llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0, - "cannot allocate for sorting\n"); - exit(1); - } + sa = xmalloc(sizeof(*sa) * nr); i = 0; lipe_list_for_each_entry(f, &p->alp_list, ar_link) { assert(i < nr); @@ -571,12 +560,7 @@ void lamigo_alr_process_temp_one(struct lu_fid *fid, int rw, __u64 ops, f = container_of(fh, struct alr_rec, ar_fh); if (f == NULL) { - f = calloc(1, sizeof(*f)); - if (f == NULL) { - llapi_error(LLAPI_MSG_ERROR, -ENOMEM, - "failed to alloc memory for alr_rec"); - return; - } + f = xcalloc(1, sizeof(*f)); f->ar_fh.fh_fid = *fid; f->ar_start = alr_period; @@ -843,12 +827,9 @@ void lamigo_alr_init(void) char *str; str = strstr(opt.o_mdtname, "-MDT"); - if (!str) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "failed to get MDT index from %s\n", - opt.o_mdtname); - exit(1); - } + if (!str) + LAMIGO_FATAL("cannot get MDT index from '%s'\n", opt.o_mdtname); + mdtidx = strtoul(str + 4, NULL, 16); if (opt.o_alr_ofd_interval == 0) { /* not passed as an option */ @@ -858,15 +839,11 @@ void lamigo_alr_init(void) } rc = fid_hash_init(&alr_head.alh_hash); - if (rc) { - llapi_err_noerrno(LLAPI_MSG_ERROR, - "failed to alloc memory for hash (%zu).", - sizeof(struct hlist_head) * FID_HASH_ENTRIES); - exit(1); - } + if (rc) + LAMIGO_OOM(-1); - alr_head.alh_period = calloc(sizeof(*alr_head.alh_period), - opt.o_alr_periods); + alr_head.alh_period = xcalloc(sizeof(*alr_head.alh_period), + opt.o_alr_periods); for (i = 0; i < opt.o_alr_periods; i++) { struct alr_period *p = &alr_head.alh_period[i]; p->alp_nr = 0; @@ -880,20 +857,13 @@ void lamigo_alr_init(void) lipe_list_for_each_entry(ala, &alr_agent_list, ala_list) { rc = pthread_create(&ala->ala_pid, NULL, lamigo_alr_data_collection_thread, ala); - if (rc) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "cannot start access reader: rc=%d\n", - rc); - exit(1); - } + if (rc) + LAMIGO_FATAL("cannot start access log reader: %s\n", strerror(rc)); } rc = pthread_create(&pid, NULL, lamigo_alr_heat_thread, NULL); - if (rc) { 
- llapi_err_noerrno(LLAPI_MSG_FATAL, - "failed to start heat-maint thread\n"); - exit(1); - } + if (rc) + LAMIGO_FATAL("cannot start heat-maint thread: %s\n", strerror(rc)); } void lamigo_add_alr_agent(const char *host) @@ -901,9 +871,7 @@ void lamigo_add_alr_agent(const char *host) struct alr_agent *ala; int rc; - ala = calloc(1, sizeof(*ala)); - assert(ala != NULL); - + ala = xcalloc(1, sizeof(*ala)); ala->ala_host = strdup(host); assert(ala->ala_host != NULL); diff --git a/lipe/src/lamigo_hash.c b/lipe/src/lamigo_hash.c index 57cd8f8..88fbef5 100644 --- a/lipe/src/lamigo_hash.c +++ b/lipe/src/lamigo_hash.c @@ -1,6 +1,9 @@ #include +#include "lamigo.h" #include "lamigo_hash.h" +int fid_hash_shift = 14; + /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define GOLDEN_RATIO_PRIME_32 0x9e370001UL /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ @@ -177,21 +180,8 @@ int fid_hash_init(struct fid_hash_head *hash) { int i; - hash->fhh_hash = malloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES); - if (hash->fhh_hash == NULL) { - llapi_err_noerrno(LLAPI_MSG_FATAL, - "failed to alloc memory for hash (%zu).", - sizeof(struct hlist_head) * FID_HASH_ENTRIES); - return -ENOMEM; - } - hash->fhh_mutex = malloc(sizeof(pthread_mutex_t) * FID_HASH_ENTRIES); - if (hash->fhh_mutex == NULL) { - free(hash->fhh_hash); - llapi_err_noerrno(LLAPI_MSG_FATAL, - "failed to alloc memory for mutex (%zu).", - sizeof(pthread_mutex_t) * FID_HASH_ENTRIES); - return -ENOMEM; - } + hash->fhh_hash = xmalloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES); + hash->fhh_mutex = xmalloc(sizeof(pthread_mutex_t) * FID_HASH_ENTRIES); for (i = 0; i < FID_HASH_ENTRIES; i++) { INIT_HLIST_HEAD(&hash->fhh_hash[i]); diff --git a/lipe/src/lamigo_hash.h b/lipe/src/lamigo_hash.h index cd78f43..e7bd4ae 100644 --- a/lipe/src/lamigo_hash.h +++ b/lipe/src/lamigo_hash.h @@ -21,7 +21,7 @@ struct fid_hash_head { pthread_mutex_t *fhh_mutex; }; -static int fid_hash_shift = 14; +extern int fid_hash_shift; #define 
FID_HASH_ENTRIES (1 << fid_hash_shift) #define FID_ON_HASH(f) (!hlist_unhashed(&(f)->fh_node)) diff --git a/lustre/tests/hot-pools.sh b/lustre/tests/hot-pools.sh index 18fe2c1..f9d621d 100755 --- a/lustre/tests/hot-pools.sh +++ b/lustre/tests/hot-pools.sh @@ -1188,7 +1188,7 @@ test_7() { sleep $LAMIGO_AGE do_facet $facet "cat $log_file" | - grep -q "Target pool $tgt_pool is empty, waiting" || + grep --ignore-case -q "target pool '$tgt_pool' is empty, waiting" || error "failed to use default pool '$tgt_pool'" } run_test 7 "lamigo: start with no OST pools" -- 1.8.3.1