Whamcloud - gitweb
EX-3889 lipe: lamigo error reporting and signal handling
author: John L. Hammond <jhammond@whamcloud.com>
Wed, 22 Sep 2021 19:15:52 +0000 (14:15 -0500)
committer: John L. Hammond <jhammond@whamcloud.com>
Fri, 14 Jan 2022 21:35:24 +0000 (21:35 +0000)
Add new macros LAMIGO_{FATAL,ERROR,WARN,INFO,DEBUG}() to replace the
existing calls to llapi_error() and llapi_printf(). Replace almost all
open-coded calls to exit() with LAMIGO_FATAL(). Handle signals
(SIGTERM, SIGUSR1, SIGUSR2) from a dedicated thread. Add
x{malloc,calloc,strdup}() macros that call LAMIGO_FATAL() on OOM
conditions. In main() replace the while (!stop) loop with a
non-breaking while (1) loop.

Test-Parameters: trivial testlist=hot-pools
Signed-off-by: John L. Hammond <jhammond@whamcloud.com>
Change-Id: Idc31da6eca847305ca16b9992a7fb22aa4d0f112
Reviewed-on: https://review.whamcloud.com/45026
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Alexandre Ioffe <aioffe@ddn.com>
Reviewed-on: https://review.whamcloud.com/45210

lipe/src/lamigo.c
lipe/src/lamigo.h
lipe/src/lamigo_alr.c
lipe/src/lamigo_hash.c
lipe/src/lamigo_hash.h
lustre/tests/hot-pools.sh

index 246cccd..e111e18 100644 (file)
@@ -90,6 +90,9 @@
 #define LAMIGO_HEAT_FILE       "/var/run/lamigo-%s.heat"
 #define LAMIGO_PIDFILE "/var/run/lamigo-%s.pid"
 
+int lamigo_log_level = LLAPI_MSG_INFO;
+const char *lamigo_mdt_name = "NONE";
+
 static LIPE_LIST_HEAD(lamigo_rule_list);
 __u64 lamigo_rule_attrs; /* attributes needed to evalute the rules */
 
@@ -148,7 +151,7 @@ static void usage(void)
               DEF_HOT_AFTER_IDLE,
               DEF_SRC_FREE,
               DEF_TGT_FREE);
-       exit(0);
+       exit(EXIT_SUCCESS);
 }
 
 #define container_of(ptr, type, member) ({                      \
@@ -201,7 +204,6 @@ enum amigo_resync_type {
 };
 
 struct options opt = {
-       .o_verbose = LLAPI_MSG_INFO,
        .o_min_age = DEF_MIN_AGE,
        .o_cache_size = DEF_CACHE_SIZE,
        .o_chlg_clear_frequency = 4096,
@@ -426,14 +428,7 @@ static void lamigo_dump_history(FILE *out)
        }
 }
 
-static void lamigo_sigterm_handler(int sig)
-{
-       psignal(sig, "exiting");
-
-       _exit(0);
-}
-
-static void lamigo_sigusr1_handler(int sig)
+static void lamigo_dump_stats_file(void)
 {
        struct resync_agent *a;
        struct pool_list *pl;
@@ -442,7 +437,7 @@ static void lamigo_sigusr1_handler(int sig)
        FILE *f;
        int i;
 
-       llapi_printf(LLAPI_MSG_DEBUG, "dump to %s\n", opt.o_dump_file);
+       LAMIGO_DEBUG("dumping stats to '%s'\n", opt.o_dump_file);
        if (opt.o_dump_file == NULL)
                return;
        f = fopen(opt.o_dump_file, "w");
@@ -569,16 +564,16 @@ static void lamigo_sigusr1_handler(int sig)
        fclose(f);
 }
 
-static void lamigo_sigusr2_handler(int sig)
+static void lamigo_dump_heat_file(void)
 {
        FILE *f;
 
-       llapi_printf(LLAPI_MSG_DEBUG, "heat to %s\n", opt.o_heat_file);
+       LAMIGO_DEBUG("dumping heat to '%s'\n", opt.o_heat_file);
        if (opt.o_heat_file == NULL)
                return;
        f = fopen(opt.o_heat_file, "w");
        if (!f) {
-               llapi_printf(LLAPI_MSG_DEBUG, "can't open heat file\n");
+               LAMIGO_ERROR("cannot open heat file '%s': %s\n", opt.o_heat_file, strerror(errno));
                return;
        }
        lamigo_alr_dump_heat_table(f);
@@ -604,24 +599,6 @@ static int lamigo_init_cache(void)
        return 0;
 }
 
-static void lamigo_cleanup(void)
-{
-       struct resync_agent *agent;
-
-       fid_hash_free(&head.lh_hash);
-       lipe_list_for_each_entry(agent, &lamigo_agent_list, rag_list) {
-               struct resync_ssh_session *rss, *tmp;
-
-               lipe_list_for_each_entry_safe(rss, tmp,
-                                             &agent->rag_ssh_list,
-                                             rss_list) {
-                       lipe_ssh_context_destroy(&rss->rss_ctx);
-                       lipe_list_del(&rss->rss_list);
-                       free(rss);
-               }
-       }
-}
-
 static int lamigo_exec_cmd(struct resync_agent *a, const char *cmd, int *pstatus)
 {
        struct resync_ssh_session *rss;
@@ -656,7 +633,6 @@ void *lamigo_replicate_one(void *args)
        int resync = rj->rj_resync;
        char cmd[PATH_MAX * 2];
        int status = INT_MAX;
-       enum llapi_message_level msg_level;
        int rc;
 
        if (rj->rj_setprefer) {
@@ -665,7 +641,7 @@ void *lamigo_replicate_one(void *args)
                         "'%s/.lustre/fid/"DFID"' > /dev/null 2>&1", rj->rj_pool,
                         agent->rag_mountpoint,
                         PFID(&rj->rj_fid));
-               llapi_printf(LLAPI_MSG_DEBUG, "set prefer on "DFID"\n",
+               LAMIGO_DEBUG("set prefer on "DFID"\n",
                             PFID(&rj->rj_fid));
        } else if (resync == AMIGO_RESYNC_EXTEND) {
                int i;
@@ -688,7 +664,7 @@ void *lamigo_replicate_one(void *args)
                         agent->rag_mountpoint,
                         PFID(&rj->rj_fid));
        } else {
-               llapi_err_noerrno(LLAPI_MSG_ERROR, "unknown resync: %d", resync);
+               LAMIGO_ERROR("unknown resync: %d\n", resync);
                rc = -EINVAL;
                goto out;
        }
@@ -696,18 +672,15 @@ void *lamigo_replicate_one(void *args)
        /* rc < 0 means an ssh error. Otherwise command exit status is
         * in status. Mask common exit statuses. */
        rc = lamigo_exec_cmd(agent, cmd, &status);
+       LAMIGO_DEBUG("exec command '%s' on '%s': rc = %d, status = %d\n",
+                    cmd, agent->rag_hostname, rc, status);
        if (rc < 0 ||
            /* 1 from setprefer (see EX-3591) */
            (rj->rj_setprefer && status != 0 && status != 1) ||
            /* EBUSY from mirror extend/resync */
            (!rj->rj_setprefer && status != 0 && status != EBUSY))
-               msg_level = LLAPI_MSG_ERROR;
-       else
-               msg_level = LLAPI_MSG_DEBUG;
-
-       llapi_error(msg_level|LLAPI_MSG_NO_ERRNO, 0,
-                   "error executing command '%s' on '%s': rc = %d, status = %d",
-                   cmd, agent->rag_hostname, rc, status);
+               LAMIGO_ERROR("command '%s' on '%s' failed: rc = %d, status = %d\n",
+                            cmd, agent->rag_hostname, rc, status);
 out:
        /* notify the main thread about completion */
        write(lamigo_sigpipe[1], &rc, 1);
@@ -742,13 +715,13 @@ static int lamigo_spawn_replication(struct resync_job *rj)
                }
        }
        if (!a) {
-               llapi_printf(LLAPI_MSG_DEBUG, "no good agent\n");
+               LAMIGO_DEBUG("no good agent\n");
                return -EBUSY;
        }
        rj->rj_agent = a;
        rj->rj_start = time(NULL);
 
-       llapi_printf(LLAPI_MSG_DEBUG, "new job %s for "DFID" spawned on %s\n",
+       LAMIGO_DEBUG("new job %s for "DFID" spawned on %s\n",
                resync == AMIGO_RESYNC_EXTEND ? "extend" : "resync",
                PFID(&rj->rj_fid), rj->rj_agent->rag_hostname);
 
@@ -813,8 +786,7 @@ static int lamigo_get_objects(struct lov_user_md_v3 *v3,
        } else {
                *objects = NULL;
                *stripes = 0;
-               llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                           "unsupported LOV magic %x", v3->lmm_magic);
+               LAMIGO_ERROR("unsupported LOV magic %x\n", v3->lmm_magic);
                return -EINVAL;
        }
        return 0;
@@ -957,8 +929,7 @@ static int lamigo_striping_is_in_sync(struct lov_user_md *lum,
                }
 
                if (opt.o_src_dom && v3->lmm_pattern == LOV_PATTERN_MDT) {
-                       llapi_printf(LLAPI_MSG_DEBUG,
-                                       "DoM component");
+                       LAMIGO_DEBUG("DoM component");
                        onsrc++;
                        continue;
                }
@@ -1051,7 +1022,7 @@ lamigo_check_user_rules(struct lipe_object_attrs *attrs,
 
                rc = lipe_rule_evaluate(rule, attrs, sysattrs, &result);
                if (rc) {
-                       llapi_error(LLAPI_MSG_ERROR, rc, "rule failed");
+                       LAMIGO_ERROR("cannot evaluate rule: %s\n", strerror(-rc));
                        return AMIGO_RESYNC_NONE;
                }
                if (!result)
@@ -1155,7 +1126,7 @@ static int lamigo_get_attrs(const struct lu_fid *fid,
                        snprintf(attrs->loa_fid_str, sizeof(attrs->loa_fid_str),
                                 DFID_NOBRACE, PFID(&attrs->loa_fid));
                        attrs->loa_attr_bits |= LIPE_OBJECT_ATTR_LMAEA;
-                       llapi_printf(LLAPI_MSG_DEBUG, "got LMA: %d\n", rc);
+                       LAMIGO_DEBUG("got LMA: %d\n", rc);
                }
        }
 
@@ -1231,9 +1202,8 @@ static int lamigo_is_in_sync(struct lu_fid *fid,
         */
        resync = lamigo_check_user_rules(&attrs, &sysattrs);
        if (resync == AMIGO_RESYNC_NONE) {
-               llapi_printf(LLAPI_MSG_DEBUG,
-                           "skip "DFID" due to rules\n",
-                           PFID(fid));
+               LAMIGO_DEBUG("skip "DFID" due to rules\n",
+                            PFID(fid));
                stats.s_skip_by_rule++;
                goto out;
        }
@@ -1243,7 +1213,7 @@ static int lamigo_is_in_sync(struct lu_fid *fid,
 out:
        lamigo_hist_add(fid, resync);
 
-       llapi_printf(LLAPI_MSG_DEBUG, "check "DFID" stripes=%d: resync=%d\n",
+       LAMIGO_DEBUG("check "DFID" stripes=%d: resync=%d\n",
                     PFID(fid), mo->mo_stripes, resync);
 
        return resync;
@@ -1287,12 +1257,7 @@ static void lamigo_check_bad_agents(void)
                                    (void *)a);
                if (rc)
                        return;
-               rj = calloc(1, sizeof(struct resync_job));
-               if (rj == NULL) {
-                       llapi_err_noerrno(LLAPI_MSG_ERROR,
-                                         "can't allocate for a test job");
-                       return;
-               }
+               rj = xcalloc(1, sizeof(*rj));
                rj->rj_check_job = 1;
                rj->rj_pid = pid;
                rj->rj_agent = a;
@@ -1342,12 +1307,7 @@ void lamigo_schedule_setprefer(struct resync_job *rj, void *cbdata, int rc)
        if (rc)
                return;
 
-       srj = calloc(1, sizeof(*srj));
-       if (srj == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job");
-               return;
-       }
-
+       srj = xcalloc(1, sizeof(*srj));
        srj->rj_fid = rj->rj_fid;
        srj->rj_setprefer = 1;
        /* XXX: few src pools? */
@@ -1367,14 +1327,14 @@ static int lamigo_update_one(struct fid_rec *f)
                /* cold pool is close to full, skip replication */
                /* do this check before expensive layout fetching, rules, etc */
                stats.s_skip_tgt_closed++;
-               llapi_printf(LLAPI_MSG_DEBUG, "pool %s closed for "DFID"\n",
+               LAMIGO_DEBUG("pool %s closed for "DFID"\n",
                                tgt_pools->pl_pool, PFID(&f->fr_fh.fh_fid));
                return 0;
        }
 
        if (are_agents_busy()) {
                /* all the agents are busy */
-               llapi_printf(LLAPI_MSG_DEBUG, "no agents avilable (max: %d)\n", lamigo_max_jobs);
+               LAMIGO_DEBUG("no agents avilable (max: %d)\n", lamigo_max_jobs);
                return 1;
        }
 
@@ -1386,7 +1346,7 @@ static int lamigo_update_one(struct fid_rec *f)
                        return 0;
                }
                if (ah.ah_hot && alr_period - ah.ah_hot <= 1) {
-                       llapi_printf(LLAPI_MSG_DEBUG,
+                       LAMIGO_DEBUG(
                                     "skip hot "DFID" in %u, now %lu\n",
                                     PFID(&f->fr_fh.fh_fid), ah.ah_hot,
                                     alr_period);
@@ -1402,11 +1362,7 @@ static int lamigo_update_one(struct fid_rec *f)
                return 0;
        }
 
-       rj = calloc(1, sizeof(struct resync_job));
-       if (rj == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job");
-               return 1;
-       }
+       rj = xcalloc(1, sizeof(*rj));
        rj->rj_fid = f->fr_fh.fh_fid;
        rj->rj_stripes = mo.mo_stripes;
        rj->rj_index = f->fr_index;
@@ -1461,8 +1417,7 @@ static int lamigo_check_sync(void)
                                     struct resync_job, rj_list);
                lipe_list_del(&rj->rj_list);
                rc = lamigo_submit_job(rj);
-               llapi_printf(LLAPI_MSG_DEBUG,
-                            "tried to resubmit failed job %p: rc=%d\n", rj, rc);
+               LAMIGO_DEBUG("tried to resubmit failed job %p: rc=%d\n", rj, rc);
                if (rc != 0)
                        return rc;
        }
@@ -1532,14 +1487,7 @@ static int lamigo_process_record(struct changelog_rec *rec)
 
        fh = fid_hash_find(&head.lh_hash, &rec->cr_tfid);
        if (fh == NULL) {
-               f = calloc(sizeof(struct fid_rec), 1);
-               if (f == NULL) {
-                       rc = -ENOMEM;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "failed to alloc memory for fid_rec");
-                       return rc;
-               }
-
+               f = xcalloc(1, sizeof(*f));
                f->fr_fh.fh_fid = rec->cr_tfid;
                f->fr_index = index;
                f->fr_time = rec->cr_time;
@@ -1634,7 +1582,7 @@ static void lamigo_check_and_clear_changelog(void)
            index - lamigo_last_cleared_index < opt.o_chlg_clear_frequency)
                return;
 
-       llapi_printf(LLAPI_MSG_DEBUG, "CLEAR upto %llu in %s (%llu last)\n",
+       LAMIGO_DEBUG("CLEAR upto %llu in %s (%llu last)\n",
                     index, opt.o_chlg_user, lamigo_last_processed_idx);
        lamigo_last_cleared_index = index;
        rc = llapi_changelog_clear(opt.o_mdtname, opt.o_chlg_user, index);
@@ -1648,10 +1596,9 @@ static void lamigo_check_and_clear_changelog(void)
 
 static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
 {
-       llapi_printf(LLAPI_MSG_DEBUG,
-                    "job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n",
-                   rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start,
-                   retval, rj->rj_agent->rag_bad);
+       LAMIGO_DEBUG("job %lu on "DFID" done in %lu: %"PRIdPTR" (%d)\n",
+                    rj->rj_pid, PFID(&rj->rj_fid), time(NULL) - rj->rj_start,
+                    retval, rj->rj_agent->rag_bad);
 
        rj->rj_done_timestamp = time(NULL);
 
@@ -1660,16 +1607,16 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
                if (retval == 0) {
                        /* the agent is back */
                        if (rj->rj_agent->rag_bad) {
-                               llapi_printf(LLAPI_MSG_DEBUG, "agent %s is back\n",
-                                           rj->rj_agent->rag_hostname);
+                               LAMIGO_DEBUG("agent %s is back\n",
+                                            rj->rj_agent->rag_hostname);
                                rj->rj_agent->rag_bad = false;
                                lamigo_max_jobs += rj->rj_agent->rag_maxjobs;
                        }
                } else {
                        /* the agent is still bad */
                        if (rj->rj_agent->rag_bad == false) {
-                               llapi_printf(LLAPI_MSG_DEBUG, "agent %s is bad\n",
-                                           rj->rj_agent->rag_hostname);
+                               LAMIGO_DEBUG("agent %s is bad\n",
+                                            rj->rj_agent->rag_hostname);
 
                                assert(lamigo_max_jobs >= rj->rj_agent->rag_maxjobs);
                                lamigo_max_jobs -= rj->rj_agent->rag_maxjobs;
@@ -1744,13 +1691,7 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
        struct resync_agent *a;
        int i;
 
-       a = calloc(1, sizeof(*a));
-       if (!a) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                 "can't allocate memory for agent");
-               exit(1);
-       }
-
+       a = xcalloc(1, sizeof(*a));
        a->rag_index = lamigo_agent_count;
        a->rag_hostname = strdup(host);
        a->rag_mountpoint = strdup(mnt);
@@ -1758,22 +1699,16 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
                char *endptr;
 
                a->rag_maxjobs = strtol(jobs, &endptr, 10);
-               if (*endptr != '\0') {
-                       llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                         "invalid jobs: '%s' (1-2048 expected)",
-                                         jobs);
-                       exit(1);
-               }
+               if (*endptr != '\0')
+                       LAMIGO_FATAL("invalid jobs: '%s' (1-2048 expected)\n", jobs);
        } else {
                a->rag_maxjobs = DEF_AGENT_JOBS;
        }
 
-       if (a->rag_maxjobs < 1 || a->rag_maxjobs > 2048) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                 "invalid jobs per agent: %d (1-2048 expected)",
-                                 a->rag_maxjobs);
-               exit(1);
-       }
+       if (a->rag_maxjobs < 1 || a->rag_maxjobs > 2048)
+               LAMIGO_FATAL("invalid jobs per agent: %d (1-2048 expected)\n",
+                            a->rag_maxjobs);
+
        lipe_list_add(&a->rag_list, &lamigo_agent_list);
 
        a->rag_jobs = 0;
@@ -1785,27 +1720,17 @@ static void lamigo_add_agent(const char *host, const char *mnt, char *jobs)
 
        /* ssh context per job, and one more for agent heartbeat */
        for (i = 0; i < a->rag_maxjobs + 1; i++) {
-               struct resync_ssh_session *rss = calloc(1, sizeof(*rss));
+               struct resync_ssh_session *rss = xcalloc(1, sizeof(*rss));
                int rc;
 
-               if (!rss) {
-                       llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                    "can't allocate memory for agent ssh session\n");
-                       exit(1);
-               }
-
                rc = lipe_ssh_context_init(&rss->rss_ctx, a->rag_hostname);
-               if (rc < 0) {
-                       llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                   "cannot create SSH context for '%s'\n",
-                                   a->rag_hostname);
-                       exit(1);
-               }
-
+               if (rc < 0)
+                       LAMIGO_FATAL("cannot create SSH context for '%s': rc = %d\n",
+                                    a->rag_hostname, rc);
                lipe_list_add(&rss->rss_list, &a->rag_ssh_list);
        }
 
-       llapi_printf(LLAPI_MSG_DEBUG, "AGENT: %s %s %d\n", a->rag_hostname,
+       LAMIGO_DEBUG("AGENT: %s %s %d\n", a->rag_hostname,
                     a->rag_mountpoint, a->rag_maxjobs);
 
        lamigo_agent_count++;
@@ -1917,16 +1842,8 @@ struct pool_list *lamigo_alloc_pool(char *pool)
 {
        struct pool_list *pl;
 
-       pl = calloc(sizeof(*pl), 1);
-       if (pl == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't allocate pool");
-               exit(1);
-       }
-       pl->pl_pool = strdup(pool);
-       if (pl->pl_pool == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't allocate pool name");
-               exit(1);
-       }
+       pl = xcalloc(sizeof(*pl), 1);
+       pl->pl_pool = xstrdup(pool);
        pl->pl_ostnr = 0;
        pl->pl_osts = NULL;
        pthread_rwlock_init(&pl->pl_lock, NULL);
@@ -1963,38 +1880,29 @@ void lamigo_refresh_osts_from_pool(struct pool_list *pl)
        int oldlevel;
 
        rc = cfs_get_param_paths(&paths, "lod/%s-*/numobd", fsname);
-       if (rc != 0) {
-               llapi_error(LLAPI_MSG_FATAL, errno,
-                           "can't find numobd fs '%s'", fsname);
-               exit(1);
-       }
+       if (rc != 0)
+               LAMIGO_FATAL("cannot read OBD count from 'lod/%s-*/numobd': %s\n",
+                            fsname, strerror(errno));
+
        for (i = 0; i < paths.gl_pathc; i++) {
                rc = lamigo_read_file(paths.gl_pathv[i], data, sizeof(data));
                if (rc >= 0) {
                        char *endptr;
 
                        obdcount = strtol(data, &endptr, 10);
-                       if (*endptr != '\0') {
-                               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                                 "invalid numobd: '%s'", data);
-                               exit(1);
-                       }
+                       if (*endptr != '\0')
+                               LAMIGO_FATAL("invalid OBD count '%s'\n", data);
+
                        break;
                }
        }
        globfree(&paths);
 
-       if (obdcount < 0) {
-               llapi_error(LLAPI_MSG_FATAL, errno, "can't find fs '%s'", fsname);
-               exit(1);
-       }
+       if (obdcount < 0)
+               LAMIGO_FATAL("cannot find filesystem '%s'\n", fsname);
 
        bufsize = sizeof(struct obd_uuid) * obdcount;
-       buffer = malloc(bufsize + sizeof(*list) * obdcount);
-       if (buffer == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't get mem for pool members");
-               exit(1);
-       }
+       buffer = xmalloc(bufsize + sizeof(*list) * obdcount);
        list = (char **) (buffer + bufsize);
        snprintf(poolname, sizeof(poolname), "%s.%s", fsname, pl->pl_pool);
        oldlevel = llapi_msg_get_level();
@@ -2013,11 +1921,7 @@ void lamigo_refresh_osts_from_pool(struct pool_list *pl)
                goto out;
        }
        if (pl->pl_osts == NULL)
-               pl->pl_osts = malloc(sizeof(int) * nb);
-       if (pl->pl_osts == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't allocate mem for OST ind");
-               exit(1);
-       }
+               pl->pl_osts = xmalloc(sizeof(int) * nb);
 
        fslen = strlen(fsname);
        for (i = 0; i < nb; i++) {
@@ -2088,62 +1992,41 @@ void lamigo_process_opt(int c, char *optarg)
                break;
        case LAMIGO_OPT_OFD_INTERVAL:
                opt.o_alr_ofd_interval = atoi(optarg);
-               if (opt.o_alr_ofd_interval < 1) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "invalid ofd interval '%s'", optarg);
-                       exit(1);
-               }
+               if (opt.o_alr_ofd_interval < 1)
+                       LAMIGO_FATAL("invalid ofd interval '%s'\n", optarg);
                break;
        case LAMIGO_OPT_HOT_FRACTION:
                opt.o_alr_hot_fraction = atoi(optarg);
                if (opt.o_alr_hot_fraction < 1 ||
-                   opt.o_alr_hot_fraction > 100) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "invalid hot fraction '%s'", optarg);
-                       exit(1);
-               }
+                   opt.o_alr_hot_fraction > 100)
+                       LAMIGO_FATAL("invalid hot fraction '%s'\n", optarg);
                break;
        case LAMIGO_OPT_HOT_AFTER_IDLE:
                opt.o_alr_hot_after_idle = atoi(optarg);
                if (opt.o_alr_hot_after_idle < 1 ||
-                   opt.o_alr_hot_after_idle >= opt.o_alr_periods) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "invalid hot-after-idle '%s'", optarg);
-                       exit(1);
-               }
+                   opt.o_alr_hot_after_idle >= opt.o_alr_periods)
+                       LAMIGO_FATAL("invalid hot-after-idle '%s'\n", optarg);
                break;
        case LAMIGO_OPT_MIRROR_CMD:
                opt.o_mirror_cmd = strdup(optarg);
                break;
        case LAMIGO_OPT_POOL_REFRESH:
                opt.o_pool_refresh = strtol(optarg, &endptr, 10);
-               if (*endptr != '\0' || opt.o_pool_refresh < 1) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "bad pool refresh interval '%s'", optarg);
-                       exit(1);
-               }
+               if (*endptr != '\0' || opt.o_pool_refresh < 1)
+                       LAMIGO_FATAL("invalid pool refresh interval '%s'\n", optarg);
                break;
        case LAMIGO_OPT_PROGRESS_INTV:
                opt.o_progress_interval = strtol(optarg, &endptr, 10);
-               if (*endptr != '\0' || opt.o_progress_interval < 1) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "bad progress interval '%s'", optarg);
-                       exit(1);
-               }
+               if (*endptr != '\0' || opt.o_progress_interval < 1)
+                       LAMIGO_FATAL("invalid progress interval '%s'\n", optarg);
                break;
        case LAMIGO_OPT_ALR_EXTRA_ARGS:
                opt.o_alr_extra_args = optarg;
                break;
        case LAMIGO_OPT_SRC_FREE:
                opt.o_src_free = atoi(optarg);
-               if (opt.o_src_free < 1 || opt.o_src_free > 99) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "bad source free space '%s'", optarg);
-                       exit(1);
-               }
+               if (opt.o_src_free < 1 || opt.o_src_free > 99)
+                       LAMIGO_FATAL("invalid source free space '%s'\n", optarg);
                break;
        case LAMIGO_OPT_SRC_DOM:
                opt.o_src_dom = 1;
@@ -2153,26 +2036,19 @@ void lamigo_process_opt(int c, char *optarg)
                break;
        case LAMIGO_OPT_TGT_FREE:
                opt.o_tgt_free = atoi(optarg);
-               if (opt.o_tgt_free < 1 || opt.o_tgt_free > 99) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "bad target free space '%s'", optarg);
-                       exit(1);
-               }
+               if (opt.o_tgt_free < 1 || opt.o_tgt_free > 99)
+                       LAMIGO_FATAL("invalid target free space '%s'\n", optarg);
                break;
        case LAMIGO_OPT_VERSION:
                lipe_version();
-               exit(0);
+               exit(EXIT_SUCCESS);
        case 'a':
                opt.o_min_age = strtol(optarg, &endptr, 10);
-               if (*endptr != '\0' || opt.o_min_age < 5) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "bad value for -a %s", optarg);
-                       exit(1);
-               }
+               if (*endptr != '\0' || opt.o_min_age < 5)
+                       LAMIGO_FATAL("invalid value for -a '%s'\n", optarg);
                break;
        case 'b':
+               lamigo_log_level = LLAPI_MSG_MAX;
                llapi_msg_set_level(LLAPI_MSG_MAX);
                break;
        case 'c': {
@@ -2180,12 +2056,8 @@ void lamigo_process_opt(int c, char *optarg)
 
                rc = strsize2int(&cache_size, optarg);
                if (rc < 0 || cache_size <= 0 ||
-                   (cache_size >= 100 && cache_size < 1<<20)) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "bad value for -c '%s'", optarg);
-                       exit(1);
-               }
+                   (cache_size >= 100 && cache_size < 1<<20))
+                       LAMIGO_FATAL("invalid cache size '%s'\n", optarg);
 
                /* For value < 100, it is taken as the percentage of
                 * total memory instead.
@@ -2194,7 +2066,7 @@ void lamigo_process_opt(int c, char *optarg)
                        opt.o_cache_size = get_fid_cache_size(cache_size);
                else
                        opt.o_cache_size = cache_size;
-               llapi_printf(LLAPI_MSG_INFO, "Cache size: %lu\n", opt.o_cache_size);
+               LAMIGO_INFO("cache size: %lu\n", opt.o_cache_size);
                break;
                }
        case 'f':
@@ -2204,11 +2076,9 @@ void lamigo_process_opt(int c, char *optarg)
                host = strsep(&optarg, ":");
                mnt = strsep(&optarg, ":");
                jobs = strsep(&optarg, ":");
-               if (!host || !mnt) {
-                       llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                         "invalid agent definition");
-                       exit(1);
-               }
+               if (!host || !mnt)
+                       LAMIGO_FATAL("invalid agent definition\n");
+
                lamigo_add_agent(host, mnt, jobs);
                break;
        case 'h':
@@ -2219,32 +2089,24 @@ void lamigo_process_opt(int c, char *optarg)
                        enable_heat = 0;
                } else {
                        opt.o_alr_heat_fn = atoi(optarg);
-                       if (opt.o_alr_heat_fn < 0 || opt.o_alr_heat_fn > 1) {
-                               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                                 "invalid heat function '%s'",
-                                                 optarg);
-                               exit(1);
-                       }
+                       if (opt.o_alr_heat_fn < 0 || opt.o_alr_heat_fn > 1)
+                               LAMIGO_FATAL("invalid heat function '%s'\n", optarg);
                }
                break;
        case 'I':
                opt.o_alr_hot_after_idle = atoi(optarg);
                break;
        case 'm':
-               opt.o_mdtname = strdup(optarg);
+               lamigo_mdt_name = xstrdup(optarg);
+               opt.o_mdtname = xstrdup(optarg);
                break;
        case 'M':
                opt.o_mntpt = strdup(optarg);
                break;
        case 'n':
                opt.o_num_threads = strtoul(optarg, NULL, 0);
-               if (opt.o_num_threads < 1) {
-                       rc = -EINVAL;
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "invalid thread number: %d",
-                                   opt.o_num_threads);
-                       exit(1);
-               }
+               if (opt.o_num_threads < 1)
+                       LAMIGO_FATAL("invalid thread number: %d\n", opt.o_num_threads);
                break;
        case 'o':
                lamigo_add_alr_agent(optarg);
@@ -2262,7 +2124,7 @@ void lamigo_process_opt(int c, char *optarg)
                opt.o_chlg_user = strdup(optarg);
                break;
        case 'v':
-               opt.o_verbose++;
+               lamigo_log_level++;
                break;
        case 'w':
                opt.o_dump_file = strdup(optarg);
@@ -2272,13 +2134,11 @@ void lamigo_process_opt(int c, char *optarg)
                break;
        default:
                rc = -EINVAL;
-               llapi_error(LLAPI_MSG_ERROR, rc,
-                           "%s: unknown option '-%c'\n",
-                           program_invocation_short_name,
-                           optopt);
-               fprintf(stderr, "Try '%s --help' for more information.\n",
-                       program_invocation_short_name);
-               exit(1);
+               fprintf(stderr,
+                       "%s: unrecognized option '-%c'\n"
+                       "Try '%s --help' for more information.\n",
+                       program_invocation_short_name, optopt, program_invocation_short_name);
+               exit(EXIT_FAILURE + 1);
                break;
        }
 }
@@ -2304,11 +2164,8 @@ static void count_bracket_recursion(const char *str, int *counter)
                        (*counter)++;
                else if (*p == '}')
                        (*counter)--;
-               if (*counter < 0) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "invalid rule string");
-                       exit(1);
-               }
+               if (*counter < 0)
+                       LAMIGO_FATAL("invalid rule '%s'\n", str);
                p++;
        }
 }
@@ -2322,7 +2179,7 @@ char *stracat(char *src, char *dst)
                len += strlen(src);
        if (dst)
                len += strlen(dst);
-       n = malloc(len + 1);
+       n = xmalloc(len + 1);
        if (src)
                strcpy(n, src);
        if (dst)
@@ -2379,11 +2236,9 @@ static void load_config(char *name)
        FILE *f;
 
        f = fopen(name, "r");
-       if (!f) {
-               llapi_error(LLAPI_MSG_FATAL, errno,
-                           "can't open config file %s", name);
-               exit(1);
-       }
+       if (!f)
+               LAMIGO_FATAL("cannot open config file '%s': %s\n", name, strerror(errno));
+
        while (!feof(f)) {
                struct option *opt;
                char *s, *t;
@@ -2427,15 +2282,12 @@ static void load_config(char *name)
                    opt->has_arg == optional_argument) {
                        optarg = strsep(&s, "\n ");
                        if (!optarg &&
-                            opt->has_arg == required_argument) {
-                               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                                 "no argument for %s", t);
-                               exit(1);
-                       }
+                           opt->has_arg == required_argument)
+                               LAMIGO_FATAL("option '%s' requires an argument\n", t);
                } else {
                        optarg = NULL;
                }
-               llapi_printf(LLAPI_MSG_DEBUG, "conf: %s %s\n", t, optarg);
+               LAMIGO_DEBUG("conf: %s %s\n", t, optarg ? optarg : "");
                lamigo_process_opt(opt->val, optarg);
        }
 
@@ -2456,7 +2308,7 @@ void lamigo_parse_opts(int argc, char **argv)
                        fprintf(stderr,
                                "Try '%s --help' for more information.\n",
                                program_invocation_short_name);
-                       exit(1);
+                       exit(EXIT_FAILURE + 1);
                }
                if (strcmp(options[opt_index].name, "mountpoint") == 0)
                        llapi_err_noerrno(LLAPI_MSG_WARN,
@@ -2464,66 +2316,48 @@ void lamigo_parse_opts(int argc, char **argv)
                lamigo_process_opt(c, optarg);
        }
 
-       if (!opt.o_mntpt) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                                 "%s: no mount point specified\n", argv[0]);
-               exit(1);
-       }
+       if (!opt.o_mntpt)
+               LAMIGO_FATAL("no mount point specified\n");
 
        rc = llapi_search_fsname(opt.o_mntpt, fsname);
-       if (rc < 0) {
-               llapi_error(LLAPI_MSG_ERROR, rc,
-                           "cannot find a Lustre file system mounted at '%s'",
-                           opt.o_mntpt);
-               exit(1);
-       }
+       if (rc < 0)
+               LAMIGO_FATAL("cannot find a Lustre file system mounted at '%s'\n",
+                            opt.o_mntpt);
+
+       if (!opt.o_mdtname)
+               LAMIGO_FATAL("no MDT specified\n");
 
-       if (!opt.o_mdtname) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "no MDT specified");
-               exit(1);
-       }
        rc = cfs_get_param_paths(&paths, "mdt/%s/uuid", opt.o_mdtname);
-       if (rc != 0) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't find MDT %s", opt.o_mdtname);
-               exit(1);
-       }
+       if (rc != 0)
+               LAMIGO_FATAL("cannot find MDT uuid from 'mdt/%s/uuid': %s\n",
+                            opt.o_mdtname, strerror(errno));
+
        globfree(&paths);
 
        snprintf(buf, sizeof(buf), "%s/.lustre/fid", opt.o_mntpt);
        open_by_fid_fd = open(buf, O_RDONLY);
-       if (open_by_fid_fd < 0) {
-               llapi_error(LLAPI_MSG_FATAL, errno, "can't open '%s'", buf);
-               exit(1);
-       }
+       if (open_by_fid_fd < 0)
+               LAMIGO_FATAL("cannot open '%s': %s\n", buf, strerror(errno));
 
        if (src_pools == NULL) {
                lamigo_parse_pool(DEF_SOURCE_POOL);
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                            "source pools aren't defined, use %s",
-                            DEF_SOURCE_POOL);
+               LAMIGO_WARN("source pools aren't defined, using '%s'\n", DEF_SOURCE_POOL);
        }
 
        if (opt.o_tgt_pool == NULL) {
                opt.o_tgt_pool = DEF_TARGET_POOL;
-               llapi_err_noerrno(LLAPI_MSG_INFO,
-                            "target pool not defined, use %s",
-                            opt.o_tgt_pool);
+               LAMIGO_WARN("target pool is not defined, using %s\n", opt.o_tgt_pool);
        }
        opt.o_tgt_pool_len = strlen(opt.o_tgt_pool);
 
-       if (lamigo_lookup_pool(opt.o_tgt_pool)) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                            "target pool '%s' cannot also be source pool",
+       if (lamigo_lookup_pool(opt.o_tgt_pool))
+               LAMIGO_FATAL("target pool '%s' cannot also be source pool\n",
                             opt.o_tgt_pool);
-               exit(1);
-       }
 
-       if (lipe_list_empty(&lamigo_agent_list)) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR, "no agents configured?");
-               exit(1);
-       }
+       if (lipe_list_empty(&lamigo_agent_list))
+               LAMIGO_FATAL("no agents configured\n");
 
-       llapi_printf(LLAPI_MSG_DEBUG, "target pool: %s/%d\n", opt.o_tgt_pool,
+       LAMIGO_DEBUG("target pool: %s/%d\n", opt.o_tgt_pool,
                     opt.o_tgt_pool_len);
        tgt_pools = lamigo_alloc_pool(opt.o_tgt_pool);
 
@@ -2541,11 +2375,8 @@ void lamigo_parse_opts(int argc, char **argv)
        opt.o_batch_sync_cnt = opt.o_cached_fid_hiwm / 2;
 
        rc = pipe2(lamigo_sigpipe, O_NONBLOCK);
-       if (rc < 0) {
-               llapi_error(LLAPI_MSG_FATAL, errno,
-                            "cannot create sigpipe");
-               exit(1);
-       }
+       if (rc < 0)
+               LAMIGO_FATAL("cannot create sigpipe: %s\n", strerror(errno));
 }
 
 static void lamigo_wait_for_job_completion(int timeout)
@@ -2575,11 +2406,7 @@ static int lamigo_create_job(struct lu_fid *fid,
 {
        struct resync_job *rj;
 
-       rj = calloc(1, sizeof(struct resync_job));
-       if (rj == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job");
-               return 1;
-       }
+       rj = xcalloc(1, sizeof(*rj));
        rj->rj_fid = *fid;
        rj->rj_stripes = mo->mo_stripes;
        rj->rj_resync = resync;
@@ -2818,7 +2645,7 @@ int lamigo_rescan(void)
        diff_timevals(&result.sr_time_start, &result.sr_time_end,
                      &result.sr_time_diff);
 
-       llapi_printf(LLAPI_MSG_DEBUG, "finished scanning in %d.%06u seconds\n",
+       LAMIGO_DEBUG("finished scanning in %d.%06u seconds\n",
                     (int)result.sr_time_diff.tv_sec,
                     (unsigned int)result.sr_time_diff.tv_usec);
 
@@ -2833,10 +2660,9 @@ static void lamigo_changelog_check_and_set_mask(void)
 
        rc = cfs_get_param_paths(&paths, "mdd/%s/changelog_mask",
                                 opt.o_mdtname);
-       if (rc != 0 || paths.gl_pathc != 1) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't find changelog mask");
-               exit(1);
-       }
+       if (rc != 0 || paths.gl_pathc != 1)
+               LAMIGO_FATAL("cannot find changelog mask: %s\n", strerror(errno));
+
        rc = lamigo_read_file(paths.gl_pathv[0], buf, sizeof(buf));
        globfree(&paths);
 
@@ -2856,12 +2682,9 @@ static void lamigo_changelog_check_and_set_mask(void)
                 "lctl set_param -n mdd.%s.changelog_mask=+\"CLOSE UNLNK\"",
                 opt.o_mdtname);
        rc = system(buf);
-       if (rc < 0) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                            "can't enable CLOSE/UNLNK in changelog: rc=%d",
-                            rc);
-               exit(1);
-       }
+       if (rc < 0)
+               LAMIGO_FATAL("cannot enable CLOSE/UNLNK in changelog: rc = %d\n", rc);
+
        llapi_err_noerrno(LLAPI_MSG_INFO, "enable CLOSE/UNLNK in changelog");
 }
 
@@ -2878,15 +2701,13 @@ static int lamigo_check_changelog_user(const char *user)
 
        rc = cfs_get_param_paths(&paths, "mdd/%s/changelog_users",
                                 opt.o_mdtname);
-       if (rc != 0 || paths.gl_pathc != 1) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't find changelog users");
-               exit(1);
-       }
+       if (rc != 0 || paths.gl_pathc != 1)
+               LAMIGO_FATAL("can't find changelog users\n");
+
        rc = lamigo_read_file(paths.gl_pathv[0], buf, sizeof(buf));
-       if (rc < 0) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't get changelog users");
-               exit(1);
-       }
+       if (rc < 0)
+               LAMIGO_FATAL("can't get changelog users\n");
+
        rc = -1;
        s = buf;
        /* skip current index line */
@@ -2928,28 +2749,21 @@ again:
                if (!rc) {
                        /* found, use it */
                        opt.o_chlg_user = strdup(user);
-                       llapi_printf(LLAPI_MSG_DEBUG,
-                                    "found Changelog user '%s' in '%s'\n",
+                       LAMIGO_DEBUG("found Changelog user '%s' in '%s'\n",
                                     user, buf);
                        return;
                }
        }
 
-       if (registered) {
-               /* can't find just registered changelog user */
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                            "can't find registered Changelog user '%s'",
-                            user);
-               exit(1);
-       }
+       if (registered)
+               LAMIGO_FATAL("cannot find just registered Changelog user '%s'\n", user);
 
        /* try one from the config file */
        if (opt.o_chlg_user) {
                rc = lamigo_check_changelog_user(opt.o_chlg_user);
                if (!rc) {
                        /* found, use it */
-                       llapi_printf(LLAPI_MSG_DEBUG,
-                                    "found Changelog user '%s' from config\n",
+                       LAMIGO_DEBUG("found Changelog user '%s' from config\n",
                                     opt.o_chlg_user);
                        return;
                }
@@ -2961,12 +2775,10 @@ again:
                 "lctl --device %s changelog_register -n >"LAMIGO_USERFILE,
                 opt.o_mdtname, opt.o_mdtname);
        rc = system(buf);
-       if (rc < 0) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                            "Changelog user '%s' is not registered",
+       if (rc < 0)
+               LAMIGO_FATAL("changelog user '%s' is not registered\n",
                             opt.o_chlg_user);
-               exit(1);
-       }
+
        registered = true;
        /* if a new changelog user was just registered, either this is the
         * first time lamigo was run on the filesystem, or it has been some
@@ -2992,13 +2804,12 @@ void lamigo_show_progress(void)
                return;
        progress_last_processed = stats.s_processed;
 
-       llapi_printf(LLAPI_MSG_INFO,
-                    "%lu processed, %lu replicated, %lu busy, %lu in queue, "
-                    "%lu hot skipped, %lu ro2hot, %lu rw2hot, %lu rw2cold\n",
-                    stats.s_processed, stats.s_replicated, stats.s_busy,
-                    stats.s_skip_hot, stats.s_replicate_ro2hot,
-                    stats.s_replicate_rw2hot, stats.s_replicate_rw2cold,
-                    head.lh_cached_count);
+       LAMIGO_INFO("%lu processed, %lu replicated, %lu busy, %lu in queue, "
+                   "%lu hot skipped, %lu ro2hot, %lu rw2hot, %lu rw2cold\n",
+                   stats.s_processed, stats.s_replicated, stats.s_busy,
+                   stats.s_skip_hot, stats.s_replicate_ro2hot,
+                   stats.s_replicate_rw2hot, stats.s_replicate_rw2cold,
+                   head.lh_cached_count);
 }
 
 static void lamigo_lock_pidfile(void)
@@ -3008,10 +2819,9 @@ static void lamigo_lock_pidfile(void)
 
        snprintf(buf, sizeof(buf), LAMIGO_PIDFILE, opt.o_mdtname);
        fd = open(buf, O_RDWR | O_CREAT, 0600);
-       if (fd < 0) {
-               llapi_error(LLAPI_MSG_FATAL, errno, "can't create pidfile");
-               exit(1);
-       }
+       if (fd < 0)
+               LAMIGO_FATAL("cannot create pidfile '%s': %s\n", buf, strerror(errno));
+
        rc = flock(fd, LOCK_EX | LOCK_NB);
        if (rc < 0) {
                sz = read(fd, buf, sizeof(buf));
@@ -3020,24 +2830,18 @@ static void lamigo_lock_pidfile(void)
                        sz = 0;
                if (sz > 0)
                        buf[sz] = 0;
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                            "another lamigo is running, locked by %s",
+               LAMIGO_FATAL("another lamigo is running, locked by %s\n",
                             sz > 0 ? buf : "[unknown]");
-               exit(1);
        }
 
        rc = ftruncate(fd, 0);
-       if (rc < 0) {
-               llapi_error(LLAPI_MSG_FATAL, errno, "cannot truncate pidfile");
-               exit(1);
-       }
+       if (rc < 0)
+               LAMIGO_FATAL("cannot truncate pidfile: %s\n", strerror(errno));
 
        sz = snprintf(buf, sizeof(buf), "%d\n", getpid());
        rc = write(fd, buf, sz);
-       if (rc < 0 || rc != sz) {
-               llapi_error(LLAPI_MSG_FATAL, rc, "can't write pidfile");
-               exit(1);
-       }
+       if (rc < 0 || rc != sz)
+               LAMIGO_ERROR("cannot write pidfile: %s\n", rc < 0 ? strerror(errno) : "short write");
 }
 
 static void lamigo_process_changelog(void)
@@ -3085,14 +2889,11 @@ again:
        if (rc < 0) {
                int i;
 
-               llapi_error(LLAPI_MSG_ERROR, rc,
-                           "failed to process record");
+               LAMIGO_ERROR("cannot process changelog record: %s\n", strerror(-rc));
                rc = llapi_changelog_fini(&chglog_hdlr);
-               if (rc) {
-                       llapi_error(LLAPI_MSG_FATAL, rc,
-                                   "cannot fini changelog");
-                       exit(1);
-               }
+               if (rc)
+                       LAMIGO_FATAL("cannot fini changelog: %s\n", strerror(-rc));
+
                i = 0;
                do {
                        /* do not reopen too frequently */
@@ -3106,13 +2907,11 @@ again:
                                                   CHANGELOG_FLAG_EXTRA_FLAGS,
                                                   opt.o_mdtname, 0);
                } while (i++ < 5 && rc != 0);
-               if (rc) {
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "unable to reopen changelog of MDT [%s]",
-                                   opt.o_mdtname);
-                       exit(1);
-               }
-               llapi_printf(LLAPI_MSG_DEBUG, "Reopened changelog\n");
+
+               if (rc)
+                       LAMIGO_FATAL("cannot reopen changelog: %s\n", strerror(-rc));
+
+               LAMIGO_DEBUG("Reopened changelog\n");
                goto again;
        }
 }
@@ -3126,32 +2925,24 @@ void lamigo_parse_rules(const char *rule_str, const char *filename)
        int i, rc;
 
        tok = json_tokener_new();
-       if (!tok) {
-               llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO , -1,
-                           "cannot allocate json token");
-               exit(1);
-       }
+       if (!tok)
+               LAMIGO_OOM(-1);
 
        obj_top = json_tokener_parse_ex(tok, rule_str, strlen(rule_str));
        if (obj_top == NULL) {
                enum json_tokener_error jerr;
                jerr = json_tokener_get_error(tok);
-               llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, -1,
-                           "cannot parse rules in %s: %s - %s",
-                           filename, rule_str, json_tokener_error_desc(jerr));
-               exit(1);
+
+               LAMIGO_FATAL("cannot parse rule '%s' in '%s': %s\n",
+                            rule_str, filename, json_tokener_error_desc(jerr));
        }
 
        rc = json_object_object_get_ex(obj_top, LIPE_CONFIG_RULES, &obj_rules);
-       if (!rc) {
-               llapi_error(LLAPI_MSG_ERROR, rc, "no rules in %s", filename);
-               exit(1);
-       }
-       if (json_object_get_type(obj_rules) != json_type_array) {
-               llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                           "rules are not an array in %s", filename);
-               exit(1);
-       }
+       if (!rc)
+               LAMIGO_FATAL("no rules in '%s'\n", filename);
+
+       if (json_object_get_type(obj_rules) != json_type_array)
+               LAMIGO_FATAL("rules in '%s' are not an array\n", filename);
 
        for (i = 0; i < json_object_array_length(obj_rules); i++) {
                struct json_object *obj_action, *obj_expr;
@@ -3161,71 +2952,47 @@ void lamigo_parse_rules(const char *rule_str, const char *filename)
 
                obj_rule = json_object_array_get_idx(obj_rules, i);
 
-               if (!obj_rule) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "failed to get rule #%d", i);
-                       exit(1);
-               }
+               if (!obj_rule)
+                       LAMIGO_FATAL("failed to get rule #%d\n", i);
 
                rc = json_object_object_get_ex(obj_rule, LIPE_CONFIG_ACTION,
                                               &obj_action);
-               if (!rc) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "no action in rule %s",
+               if (!rc)
+                       LAMIGO_FATAL("no action in rule %s\n",
                                    json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+
                action = json_object_get_string(obj_action);
-               if (!action) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "invalid action in rule %s",
+               if (!action)
+                       LAMIGO_FATAL("invalid action in rule %s\n",
                                    json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+
                rc = json_object_object_get_ex(obj_rule, LIPE_CONFIG_EXPRESSION,
                                               &obj_expr);
-               if (!rc) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "no expression in rule %s",
-                                   json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+               if (!rc)
+                       LAMIGO_FATAL("no expression in rule %s\n",
+                                    json_object_to_json_string(obj_rule));
+
                expr = json_object_get_string(obj_expr);
-               if (!expr) {
-                       llapi_error(LLAPI_MSG_ERROR, -EINVAL,
-                                   "invalid expression in rule %s",
-                                   json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+               if (!expr)
+                       LAMIGO_FATAL("invalid expression in rule %s\n",
+                                    json_object_to_json_string(obj_rule));
 
-               LIPE_ALLOC_PTR(lr);
-               if (!lr) {
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "cannot allocate rule %s",
-                                   json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+               lr = xcalloc(1, sizeof(*lr));
 
                if (!strcmp(action, "skip"))
                        lr->lr_action.la_action = LAT_COUNTER_INC;
                else if (!strcmp(action, "mirror"))
                        lr->lr_action.la_action = LAT_SHELL_CMD_FID;
-               else {
-                       llapi_error(LLAPI_MSG_ERROR, 0,
-                                   "unknown action '%s' in rule %s", action,
-                                   json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+               else
+                       LAMIGO_FATAL("unknown action '%s' in rule '%s'\n",
+                                    action, json_object_to_json_string(obj_rule));
 
                LIPE_INIT_LIST_HEAD(&lr->lr_values);
                rc = lipe_policy_value_init(&lr->lr_values, &lr->lr_expression,
                                            &valid, expr);
-               if (rc) {
-                       llapi_error(LLAPI_MSG_ERROR, rc,
-                                   "cannot parse expression in rule %s",
-                                   json_object_to_json_string(obj_rule));
-                       exit(1);
-               }
+               if (rc)
+                       LAMIGO_FATAL("cannot parse expression in rule %s: %s\n", json_object_to_json_string(obj_rule), strerror(-rc));
+
                lipe_list_add_tail(&lr->lr_linkage, &lamigo_rule_list);
                lamigo_rule_attrs |= lr->lr_attr_bits;
        }
@@ -3233,39 +3000,65 @@ void lamigo_parse_rules(const char *rule_str, const char *filename)
        json_object_put(obj_top);
 }
 
-static void lamigo_register_signal_handlers(void)
+static void *lamigo_signal_thread_start(void *arg)
 {
-       struct sigaction sigterm_action = {
-               .sa_handler = &lamigo_sigterm_handler,
-       };
-       struct sigaction sigusr1_action = {
-               .sa_handler = &lamigo_sigusr1_handler,
-               .sa_flags = SA_RESTART,
-       };
-       struct sigaction sigusr2_action = {
-               .sa_handler = &lamigo_sigusr2_handler,
-               .sa_flags = SA_RESTART,
-       };
+       sigset_t *set = arg;
+       int sig;
+       int rc;
+
+       while (1) {
+               rc = sigwait(set, &sig);
+               /*
+                * RETURN VALUE
+                *
+                *   On success, sigwait() returns 0. On error, it
+                *   returns a positive error number (listed in
+                *   ERRORS).
+                *
+                * ERRORS
+                *   EINVAL set contains an invalid signal number.
+                */
+               if (rc != 0) {
+                       LAMIGO_ERROR("signal wait failed: %s\n", strerror(rc));
+                       continue;
+               }
 
-       sigemptyset(&sigterm_action.sa_mask);
-       sigemptyset(&sigusr1_action.sa_mask);
-       sigemptyset(&sigusr2_action.sa_mask);
+               LAMIGO_DEBUG("received signal %d\n", sig);
 
-       sigaction(SIGTERM, &sigterm_action, NULL);
-       sigaction(SIGUSR1, &sigusr1_action, NULL);
-       sigaction(SIGUSR2, &sigusr2_action, NULL);
+               switch (sig) {
+               case SIGUSR1:
+                       lamigo_dump_stats_file();
+                       break;
+               case SIGUSR2:
+                       lamigo_dump_heat_file();
+                       break;
+               default:
+                       LAMIGO_INFO("received signal %d, exiting\n", sig);
+                       exit(EXIT_SUCCESS);
+               }
+       }
 }
 
 int main(int argc, char **argv)
 {
-       int      rc;
-       bool     stop = 0;
-       int      ret = 0;
-       pthread_t pid;
+       pthread_t lamigo_refresh_statfs_thread_id;
+       pthread_t lamigo_signal_thread_id;
+       sigset_t sigset;
+       int rc;
 
-       /* Ignore SIGUSR1 and SIGUSR2 until we are setup. */
-       signal(SIGUSR1, SIG_IGN);
-       signal(SIGUSR2, SIG_IGN);
+       /*  We will handle signals in a dedicated thread. */
+       sigemptyset(&sigset);
+       sigaddset(&sigset, SIGTERM);
+       sigaddset(&sigset, SIGUSR1);
+       sigaddset(&sigset, SIGUSR2);
+
+       rc = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+       if (rc != 0)
+               LAMIGO_FATAL("cannot set signal mask: %s\n", strerror(rc));
+
+       rc = pthread_create(&lamigo_signal_thread_id, NULL, &lamigo_signal_thread_start, &sigset);
+       if (rc != 0)
+               LAMIGO_FATAL("cannot start signal thread: %s\n", strerror(rc));
 
        lipe_version_init();
        ssh_threads_set_callbacks(ssh_threads_get_pthread());
@@ -3273,7 +3066,7 @@ int main(int argc, char **argv)
 
        setlinebuf(stdout);
        setlinebuf(stderr);
-       llapi_msg_set_level(opt.o_verbose);
+       llapi_msg_set_level(lamigo_log_level);
 
        lamigo_parse_opts(argc, argv);
 
@@ -3281,15 +3074,12 @@ int main(int argc, char **argv)
         * followed by the MDT name ("lamigo lustre-MDT0000"). */
        llapi_set_command_name(opt.o_mdtname);
 
-       llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0,
-                   "version %s-%s, revision %s",
+       LAMIGO_INFO("version %s-%s, revision %s\n",
                    PACKAGE_VERSION, LIPE_RELEASE, LIPE_REVISION);
 
        rc = lamigo_init_cache();
-       if (rc < 0) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL, "can't init cache\n");
-               exit(1);
-       }
+       if (rc < 0)
+               LAMIGO_FATAL("cannot init cache\n");
 
        /* create and lock pidfile to protect against another instance */
        lamigo_lock_pidfile();
@@ -3297,9 +3087,8 @@ int main(int argc, char **argv)
        /* wait till the target pool got one OST at least */
        lamigo_refresh_osts_from_pool(tgt_pools);
        while (tgt_pools->pl_ostnr == 0) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                                 "Target pool %s is empty, waiting...",
-                                 tgt_pools->pl_pool);
+               LAMIGO_ERROR("target pool '%s' is empty, waiting %d seconds\n",
+                            tgt_pools->pl_pool, opt.o_pool_refresh);
                sleep(opt.o_pool_refresh);
                lamigo_refresh_osts_from_pool(tgt_pools);
        }
@@ -3312,40 +3101,30 @@ int main(int argc, char **argv)
        /* start heat collection and maintaining */
        lamigo_alr_init();
 
-       rc = pthread_create(&pid, NULL, lamigo_refresh_statfs_thread, NULL);
-       if (rc) {
-               llapi_error(LLAPI_MSG_FATAL, rc,
-                           "unable to start statfs thread");
-               exit(1);
-       }
+       rc = pthread_create(&lamigo_refresh_statfs_thread_id, NULL, lamigo_refresh_statfs_thread, NULL);
+       if (rc != 0)
+               LAMIGO_FATAL("cannot start statfs thread: %s\n", strerror(rc));
 
        if (opt.o_rescan) {
                /* scan the whole MDT and replicate matched files */
-               ret = lamigo_rescan();
+               rc = lamigo_rescan();
+               if (rc < 0)
+                       LAMIGO_FATAL("cannot scan device: %s\n", strerror(-rc));
        }
 
-       llapi_printf(LLAPI_MSG_DEBUG, "Start receiving records\n");
+       LAMIGO_DEBUG("Start receiving records\n");
        rc = llapi_changelog_start(&chglog_hdlr,
                                   CHANGELOG_FLAG_FOLLOW |
                                   CHANGELOG_FLAG_BLOCK |
                                   CHANGELOG_FLAG_JOBID |
                                   CHANGELOG_FLAG_EXTRA_FLAGS,
                                   opt.o_mdtname, 0);
-       if (rc) {
-               /* XXX: probably keep trying in some cases? */
-               llapi_error(LLAPI_MSG_ERROR, rc,
-                           "unable to open changelog of MDT [%s]",
-                           opt.o_mdtname);
-               ret = rc;
-               goto out;
-       }
-
-       llapi_printf(LLAPI_MSG_INFO, "started\n");
-
-       lamigo_register_signal_handlers();
+       if (rc < 0)
+               LAMIGO_FATAL("cannot open changelog: %s\n", strerror(-rc));
 
-       while (!stop) {
+       LAMIGO_INFO("started\n");
 
+       while (1) {
                if (head.lh_cached_count < opt.o_cached_fid_hiwm)
                        lamigo_process_changelog();
                else
@@ -3357,8 +3136,8 @@ int main(int argc, char **argv)
                if (!are_agents_busy()) {
                        rc = lamigo_check_sync();
                        if (rc < 0) {
-                               stop = true;
-                               ret = rc;
+                               LAMIGO_ERROR("check sync failed: rc = %d\n", rc);
+                               sleep(1);
                        }
                }
 
@@ -3367,28 +3146,6 @@ int main(int argc, char **argv)
                lamigo_check_bad_agents();
                lamigo_show_progress();
        }
-
-       /* wait for all jobs to complete */
-       while (lamigo_jobs_running) {
-               lamigo_wait_for_job_completion(10);
-               lamigo_check_jobs();
-               lamigo_check_and_clear_changelog();
-       }
-
-       rc = llapi_changelog_fini(&chglog_hdlr);
-       if (rc) {
-               llapi_error(LLAPI_MSG_ERROR, rc,
-                               "unable to close changelog of MDT [%s]",
-                               opt.o_mdtname);
-               ret = rc;
-       }
-
-out:
-       lamigo_cleanup();
-       llapi_error(LLAPI_MSG_INFO|LLAPI_MSG_NO_ERRNO, 0, "exited\n");
-       lipe_version_fini();
-
-       return ret;
 }
 
 void lamigo_alr_mirror_cb(struct resync_job *rj, void *cbdata, int rc)
@@ -3404,11 +3161,7 @@ static void lamigo_new_job_for_hot(struct lu_fid *fid, enum amigo_resync_type sy
        struct resync_job *rj;
        int rc;
 
-       rj = calloc(1, sizeof(struct resync_job));
-       if (rj == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR, "can't allocate for a job");
-               return;
-       }
+       rj = xcalloc(1, sizeof(*rj));
        rj->rj_fid = *fid;
        rj->rj_stripes = stripes;
        rj->rj_index = 0;
@@ -3431,16 +3184,14 @@ static int lamigo_check_hot_one(struct alr_heat *ht)
        struct mirror_opts mo = { 0 };
        int sync;
 
-       llapi_printf(LLAPI_MSG_DEBUG,
-                    "check hot "DFID": H: %Lu/%Lu, P: %Lu/%Lu, "
+       LAMIGO_DEBUG("check hot "DFID": H: %Lu/%Lu, P: %Lu/%Lu, "
                     "L %d, I %d %s\n", PFID(&ht->ah_fid),
                     ht->ah_heat[0], ht->ah_heat[1], ht->ah_pools[0],
                     ht->ah_pools[1], ht->ah_livetime, ht->ah_idle,
                     ht->ah_mark ? "M" : "");
        if (ht->ah_mark & ALR_TAG_PROCESSED) {
                /* already tried to replicate */
-               llapi_printf(LLAPI_MSG_DEBUG,
-                            DFID" tried to replicate already\n",
+               LAMIGO_DEBUG(DFID" tried to replicate already\n",
                             PFID(&ht->ah_fid));
                return 0;
        }
@@ -3453,7 +3204,7 @@ static int lamigo_check_hot_one(struct alr_heat *ht)
        if (ht->ah_heat[0] && ht->ah_heat[1] == 0 &&
            ht->ah_pools[0] == 0 && ht->ah_pools[1]) {
                sync = lamigo_is_in_sync(&ht->ah_fid, tgt_pools, src_pools, &mo);
-               llapi_printf(LLAPI_MSG_DEBUG, "try to replicate RO "DFID": %d\n",
+               LAMIGO_DEBUG("try to replicate RO "DFID": %d\n",
                            PFID(&ht->ah_fid), sync);
                if (sync != AMIGO_RESYNC_NONE) {
                        lamigo_new_job_for_hot(&ht->ah_fid, sync, src_pools,
@@ -3472,7 +3223,7 @@ static int lamigo_check_hot_one(struct alr_heat *ht)
        if (ht->ah_idle > 0 && ht->ah_heat[1] &&
            ht->ah_pools[0] == 0 && ht->ah_pools[1]) {
                sync = lamigo_is_in_sync(&ht->ah_fid, tgt_pools, src_pools, &mo);
-               llapi_printf(LLAPI_MSG_DEBUG, "try to replicate RW "DFID": %d\n",
+               LAMIGO_DEBUG("try to replicate RW "DFID": %d\n",
                            PFID(&ht->ah_fid), sync);
                if (sync != AMIGO_RESYNC_NONE) {
                        lamigo_new_job_for_hot(&ht->ah_fid, sync, src_pools,
@@ -3498,7 +3249,7 @@ static void lamigo_check_hot_on_cold(struct alr_heat *ht)
            ht->ah_pools[1] == 0 && ht->ah_pools[0]) {
                sync = lamigo_is_in_sync(&ht->ah_fid, src_pools,
                                         tgt_pools, &mo);
-               llapi_printf(LLAPI_MSG_DEBUG,
+               LAMIGO_DEBUG(
                             "replicate idling hot to CP "DFID": %d\n",
                            PFID(&ht->ah_fid), sync);
                if (sync != AMIGO_RESYNC_NONE) {
@@ -3524,13 +3275,7 @@ struct alr_heat *lamigo_get_hot(int period, int *nr)
                return NULL;
 
        /* XXX: limit number of hot files to check? */
-       ht = calloc(*nr + 1, sizeof(*ht));
-       if (!ht) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                               "allocation for ht failed");
-               return NULL;
-       }
-
+       ht = xcalloc(*nr + 1, sizeof(*ht));
        i = lamigo_alr_get_hot_files(period, ht, *nr,
                                    ALR_TAG_NO_ACCT | ALR_TAG_REPLICATED);
        if (i == 0) {
@@ -3561,7 +3306,7 @@ static void lamigo_check_hot(void)
        if (src_pools->pl_open) {
                /* get most recent hot files */
                ht = lamigo_get_hot(alr_hot_period, &nr);
-               llapi_printf(LLAPI_MSG_DEBUG, "check hot in period %lu - %d\n",
+               LAMIGO_DEBUG("check hot in period %lu - %d\n",
                                alr_hot_period, nr);
                if (ht) {
                        for (i = 0; i < nr; i++)
@@ -3576,15 +3321,14 @@ static void lamigo_check_hot(void)
        /* now check hot idling files - the files we found hot and
         * skipped replication. now it's time to try again */
        ht = lamigo_get_hot(alr_hot_period - opt.o_alr_hot_after_idle, &nr);
-       llapi_printf(LLAPI_MSG_DEBUG, "check idle in period %lu - %d\n",
+       LAMIGO_DEBUG("check idle in period %lu - %d\n",
                alr_hot_period - 3, nr);
        if (!ht)
                goto out;
 
        for (i = 0; i < nr; i++) {
                struct alr_heat *ah = ht + i;
-               llapi_printf(LLAPI_MSG_DEBUG,
-                             "idle "DFID": P: %Lu/%Lu, live %d, idle %d\n",
+               LAMIGO_DEBUG("idle "DFID": P: %Lu/%Lu, live %d, idle %d\n",
                              PFID(&ah->ah_fid), ah->ah_pools[0], ah->ah_pools[1],
                              ah->ah_livetime, ah->ah_idle);
                if (src_pools->pl_open)
@@ -3614,7 +3358,7 @@ static __u64 lamigo_read_osp_param(const int ostidx, const char *param)
 
        fd = open(path, O_RDONLY);
        if (fd < 0) {
-               llapi_error(LLAPI_MSG_ERROR, errno, "cannot open '%s'", path);
+               LAMIGO_ERROR("cannot open '%s': %s\n", path, strerror(errno));
                /* 0 means non-available OST */
                return 0;
        }
@@ -3622,7 +3366,7 @@ static __u64 lamigo_read_osp_param(const int ostidx, const char *param)
        if (rc > 0)
                retval = strtoul(buf, NULL, 10);
        if (rc < 0)
-               llapi_error(LLAPI_MSG_ERROR, errno, "cannot read '%s'", path);
+               LAMIGO_ERROR("cannot read '%s': %s\n", path, strerror(errno));
        close(fd);
 
        /* report zero if something went wrong
@@ -3647,8 +3391,7 @@ static void lamigo_refresh_pool_statfs(struct pool_list *pl, int threshold)
                /* check OSP is active */
                active = lamigo_read_osp_param(ostidx, "active");
                status = lamigo_read_osp_param(ostidx, "prealloc_status");
-               llapi_printf(LLAPI_MSG_DEBUG,
-                       "statfs for %d%s/%d: %llu from %llu\n",
+               LAMIGO_DEBUG("statfs for %d%s/%d: %llu from %llu\n",
                        ostidx, active ? "(active)" : "(inactive)",
                        (int)status, kbavail, kbtotal);
                if (!active || status)
@@ -3665,8 +3408,7 @@ static void lamigo_refresh_pool_statfs(struct pool_list *pl, int threshold)
                pl->pl_open = true;
 
        /* whether pool is good for replicas */
-       llapi_printf(LLAPI_MSG_DEBUG,
-                    "statfs for %s %s pool: %llu from %llu, thresh %llu\n",
+       LAMIGO_DEBUG("statfs for %s %s pool: %llu from %llu, thresh %llu\n",
                     pl->pl_open ? "open" : "closed", pl->pl_pool, tavail,
                     ttotal, ttotal * threshold / 100);
 }
@@ -3700,19 +3442,15 @@ static void *lamigo_refresh_statfs_thread(void *arg)
        int rc;
 
        str = strstr(opt.o_mdtname, "-MDT");
-       if (!str) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                               "failed to get MDT index from %s\n",
-                               opt.o_mdtname);
-               exit(1);
-       }
+       if (!str)
+               LAMIGO_FATAL("cannot get MDT index from '%s'\n", opt.o_mdtname);
+
        lamigo_mdtidx = strtoul(str + 4, NULL, 16);
 
        rc = cfs_get_param_paths(&paths, "osp");
-       if (rc != 0) {
-               llapi_error(LLAPI_MSG_FATAL, rc, "can't find OSP root");
-               exit(1);
-       }
+       if (rc != 0)
+               LAMIGO_FATAL("cannot find OSP root: %s\n", strerror(errno));
+
        osproot = strdup(paths.gl_pathv[0]);
        globfree(&paths);
 
index 87db237..343c641 100644 (file)
@@ -5,6 +5,11 @@
 #ifndef _LAMIGO_H_
 #define _LAMIGO_H_
 
+#include <stdio.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <lustre/lustreapi.h>
+
 void lamigo_add_alr_agent(const char *host);
 void lamigo_alr_init(void);
 
@@ -71,7 +76,6 @@ struct options {
        int              o_src_pool_len;
        char            *o_tgt_pool;
        int              o_tgt_pool_len;
-       int              o_verbose;
        int              o_min_age;
        unsigned long    o_cached_fid_hiwm; /* high watermark */
        unsigned long    o_cache_size;
@@ -102,7 +106,7 @@ struct options {
        int              o_src_dom;
        char            *o_heat_file;
 };
-extern struct options opt;
+extern struct options opt; /* opt is not a good global variable name. */
 extern int enable_heat;
 
 extern unsigned long alr_period;
@@ -115,4 +119,79 @@ void lamigo_alr_dump_heat_table(FILE *file);
 
 void lamigo_parse_rules(const char *rule_str, const char *filename);
 
+extern int lamigo_log_level; /* enum llapi_message_level */
+extern const char *lamigo_mdt_name;
+
+/* lamigo runs as a systemd service, so whatever it prints to stderr
+ * (or stdout) is collected by journald, which adds a lamigo[$PID]
+ * prefix. We therefore do not need a lamigo prefix of our own but we
+ * do want to include the MDT name. In this way we get
+ *
+ *   Sep 22 12:51:10 $HOSTNAME lamigo[24074]: lustre-MDT0001: blah blah
+ *
+ * Messages with level > lamigo_log_level are not printed. */
+#define LAMIGO_PRINT(level, fmt, args...)                              \
+       do {                                                            \
+               if ((level) <= lamigo_log_level)                        \
+                       fprintf(stderr, "%s: " fmt, lamigo_mdt_name, ##args); \
+       } while (0)
+/* Per-level wrappers around LAMIGO_PRINT(); each prepends its level tag. */
+#define LAMIGO_DEBUG(fmt, args...)                             \
+       LAMIGO_PRINT(LLAPI_MSG_DEBUG, "DEBUG: " fmt, ##args)
+
+#define LAMIGO_INFO(fmt, args...)                              \
+       LAMIGO_PRINT(LLAPI_MSG_INFO, "INFO: " fmt, ##args)
+
+#define LAMIGO_WARN(fmt, args...)                              \
+       LAMIGO_PRINT(LLAPI_MSG_WARN, "WARN: " fmt, ##args)
+
+#define LAMIGO_ERROR(fmt, args...)                             \
+       LAMIGO_PRINT(LLAPI_MSG_ERROR, "ERROR: " fmt, ##args)
+/* Print at FATAL level (subject to lamigo_log_level), then exit(EXIT_FAILURE). */
+#define LAMIGO_FATAL(fmt, args...)                                     \
+       do {                                                            \
+               LAMIGO_PRINT(LLAPI_MSG_FATAL, "FATAL: " fmt, ##args);   \
+               exit(EXIT_FAILURE);                                     \
+       } while (0)
+/* Fatal out-of-memory report naming the call site; size is printed as ssize_t. */
+#define LAMIGO_OOM_AT(file, line, func, size)                          \
+       LAMIGO_FATAL("out of memory at (%s:%d:%s), size = %zd\n", (file), (line), (func), (ssize_t)(size))
+/* Same report, with the call site taken from where the macro is expanded. */
+#define LAMIGO_OOM(size) \
+       LAMIGO_OOM_AT(__FILE__, __LINE__, __func__, (size))
+
+static inline void *xmalloc1(const char *file, int line, const char *func, size_t size)
+{
+       void *ptr = malloc(size);
+       /* OOM is fatal (LAMIGO_OOM_AT() exits); NULL from a zero-size request is not. */
+       if (ptr == NULL && size != 0)
+               LAMIGO_OOM_AT(file, line, func, size);
+
+       return ptr;
+}
+
+static inline void *xcalloc1(const char *file, int line, const char *func, size_t nmemb, size_t size)
+{
+       void *ptr = calloc(nmemb, size);
+       /* OOM is fatal. Test the factors: nmemb * size can wrap to 0 and hide a failure. */
+       if (ptr == NULL && nmemb != 0 && size != 0)
+               LAMIGO_OOM_AT(file, line, func, (nmemb * size));
+
+       return ptr;
+}
+
+static inline void *xstrdup1(const char *file, int line, const char *func, const char *s)
+{
+       void *ptr = strdup(s);
+       /* strdup() failure is fatal; the size reported is strlen(s) plus the NUL. */
+       if (ptr == NULL)
+               LAMIGO_OOM_AT(file, line, func, strlen(s) + 1);
+
+       return ptr;
+}
+/* Allocation wrappers that pass the caller's file, line and function along. */
+#define xmalloc(size) (xmalloc1(__FILE__, __LINE__, __func__, (size)))
+#define xcalloc(nmemb, size) (xcalloc1(__FILE__, __LINE__, __func__, (nmemb), (size)))
+#define xstrdup(s) (xstrdup1(__FILE__, __LINE__, __func__, (s)))
+
 #endif
index 2e61fd2..1e675a9 100644 (file)
@@ -145,12 +145,7 @@ static void lamigo_alr_update_one(struct lu_fid *fid, enum alr_rw rw,
 {
        struct alr_rec_temp *t;
 
-       t = calloc(sizeof(*t), 1);
-       if (!t) {
-               llapi_error(LLAPI_MSG_FATAL, -ENOMEM,
-                           "failed to alloc memory for alr_rec_temp");
-               exit(1);
-       }
+       t = xcalloc(sizeof(*t), 1);
        t->art_fid = *fid;
        t->art_ops = ops;
        t->art_rw = rw;
@@ -494,13 +489,7 @@ static void lamigo_alr_update_heat_all(void)
        p->alp_avg[0] = asum[0] / nr;
        p->alp_avg[1] = asum[1] / nr;
 
-       sa = malloc(sizeof(*sa) * nr);
-       if (!sa) {
-               /* XXX: better handling */
-               llapi_error(LLAPI_MSG_FATAL|LLAPI_MSG_NO_ERRNO, 0,
-                           "cannot allocate for sorting\n");
-               exit(1);
-       }
+       sa = xmalloc(sizeof(*sa) * nr);
        i = 0;
        lipe_list_for_each_entry(f, &p->alp_list, ar_link) {
                assert(i < nr);
@@ -571,12 +560,7 @@ void lamigo_alr_process_temp_one(struct lu_fid *fid, int rw, __u64 ops,
                f = container_of(fh, struct alr_rec, ar_fh);
 
        if (f == NULL) {
-               f = calloc(1, sizeof(*f));
-               if (f == NULL) {
-                       llapi_error(LLAPI_MSG_ERROR, -ENOMEM,
-                                       "failed to alloc memory for alr_rec");
-                       return;
-               }
+               f = xcalloc(1, sizeof(*f));
                f->ar_fh.fh_fid = *fid;
                f->ar_start = alr_period;
 
@@ -843,12 +827,9 @@ void lamigo_alr_init(void)
        char *str;
 
        str = strstr(opt.o_mdtname, "-MDT");
-       if (!str) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                                 "failed to get MDT index from %s\n",
-                                 opt.o_mdtname);
-               exit(1);
-       }
+       if (!str)
+               LAMIGO_FATAL("cannot get MDT index from '%s'\n", opt.o_mdtname);
+
        mdtidx = strtoul(str + 4, NULL, 16);
        if (opt.o_alr_ofd_interval == 0) {
                /* not passed as an option */
@@ -858,15 +839,11 @@ void lamigo_alr_init(void)
        }
 
        rc = fid_hash_init(&alr_head.alh_hash);
-       if (rc) {
-               llapi_err_noerrno(LLAPI_MSG_ERROR,
-                                 "failed to alloc memory for hash (%zu).",
-                                 sizeof(struct hlist_head) * FID_HASH_ENTRIES);
-               exit(1);
-       }
+       if (rc)
+               LAMIGO_OOM(-1);
 
-       alr_head.alh_period = calloc(sizeof(*alr_head.alh_period),
-                                    opt.o_alr_periods);
+       alr_head.alh_period = xcalloc(sizeof(*alr_head.alh_period),
+                                     opt.o_alr_periods);
        for (i = 0; i < opt.o_alr_periods; i++) {
                struct alr_period *p = &alr_head.alh_period[i];
                p->alp_nr = 0;
@@ -880,20 +857,13 @@ void lamigo_alr_init(void)
        lipe_list_for_each_entry(ala, &alr_agent_list, ala_list) {
                rc = pthread_create(&ala->ala_pid, NULL,
                                    lamigo_alr_data_collection_thread, ala);
-               if (rc) {
-                       llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                         "cannot start access reader: rc=%d\n",
-                                         rc);
-                       exit(1);
-               }
+               if (rc)
+                       LAMIGO_FATAL("cannot start access log reader: %s\n", strerror(rc));
        }
 
        rc = pthread_create(&pid, NULL, lamigo_alr_heat_thread, NULL);
-       if (rc) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                 "failed to start heat-maint thread\n");
-               exit(1);
-       }
+       if (rc)
+               LAMIGO_FATAL("cannot start heat-maint thread: %s\n", strerror(rc));
 }
 
 void lamigo_add_alr_agent(const char *host)
@@ -901,9 +871,7 @@ void lamigo_add_alr_agent(const char *host)
        struct alr_agent *ala;
        int rc;
 
-       ala = calloc(1, sizeof(*ala));
-       assert(ala != NULL);
-
+       ala = xcalloc(1, sizeof(*ala));
        ala->ala_host = strdup(host);
        assert(ala->ala_host != NULL);
 
index 57cd8f8..88fbef5 100644 (file)
@@ -1,6 +1,9 @@
 #include <assert.h>
+#include "lamigo.h"
 #include "lamigo_hash.h"
 
+int fid_hash_shift = 14;
+
 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
 #define GOLDEN_RATIO_PRIME_32 0x9e370001UL
 /*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
@@ -177,21 +180,8 @@ int fid_hash_init(struct fid_hash_head *hash)
 {
        int i;
 
-       hash->fhh_hash = malloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES);
-       if (hash->fhh_hash == NULL) {
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                "failed to alloc memory for hash (%zu).",
-                                sizeof(struct hlist_head) * FID_HASH_ENTRIES);
-               return -ENOMEM;
-       }
-       hash->fhh_mutex = malloc(sizeof(pthread_mutex_t) * FID_HASH_ENTRIES);
-       if (hash->fhh_mutex == NULL) {
-               free(hash->fhh_hash);
-               llapi_err_noerrno(LLAPI_MSG_FATAL,
-                                "failed to alloc memory for mutex (%zu).",
-                                sizeof(pthread_mutex_t) * FID_HASH_ENTRIES);
-               return -ENOMEM;
-       }
+       hash->fhh_hash = xmalloc(sizeof(struct hlist_head) * FID_HASH_ENTRIES);
+       hash->fhh_mutex = xmalloc(sizeof(pthread_mutex_t) * FID_HASH_ENTRIES);
 
        for (i = 0; i < FID_HASH_ENTRIES; i++) {
                INIT_HLIST_HEAD(&hash->fhh_hash[i]);
index cd78f43..e7bd4ae 100644 (file)
@@ -21,7 +21,7 @@ struct fid_hash_head {
        pthread_mutex_t         *fhh_mutex;
 };
 
-static int fid_hash_shift = 14;
+extern int fid_hash_shift;
 
 #define FID_HASH_ENTRIES       (1 << fid_hash_shift)
 #define FID_ON_HASH(f)         (!hlist_unhashed(&(f)->fh_node))
index 18fe2c1..f9d621d 100755 (executable)
@@ -1188,7 +1188,7 @@ test_7() {
 
        sleep $LAMIGO_AGE
        do_facet $facet "cat $log_file" |
-               grep -q "Target pool $tgt_pool is empty, waiting" ||
+               grep --ignore-case -q "target pool '$tgt_pool' is empty, waiting" ||
                error "failed to use default pool '$tgt_pool'"
 }
 run_test 7 "lamigo: start with no OST pools"