Whamcloud - gitweb
EX-6497 lipe: Refine stats field name in lamigo
author Alexandre Ioffe <aioffe@ddn.com>
Thu, 8 Dec 2022 06:45:35 +0000 (22:45 -0800)
committer Andreas Dilger <adilger@whamcloud.com>
Tue, 13 Dec 2022 18:57:01 +0000 (18:57 +0000)
Corrected the "processed" INFO message that lamigo
prints periodically:
- Added two additional fields:
  "running" - number of currently running jobs, such as replication
  "delayed" - current number of failed and other (such as set flag)
  jobs which are waiting to be run on the next lamigo cycle
- "in queue" field is changed to "awaiting". This is current number
  of files in the internal cache. These files are waiting to be
  processed (replicated).

Test-Parameters: trivial testlist=hot-pools
Signed-off-by: Alexandre Ioffe <aioffe@ddn.com>
Change-Id: Iacf0199cfcf56edcbb8ad91e0e4b62c7451900f5
Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49344
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Colin Faber <cfaber@ddn.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lipe/src/lamigo.c

index ece683e..6c2eae6 100644 (file)
@@ -324,7 +324,7 @@ struct fid_rec {
 };
 
 static LIPE_LIST_HEAD(lamigo_job_list);
-static LIPE_LIST_HEAD(lamigo_failed_job_list); /* failed jobs to repeat */
+static LIPE_LIST_HEAD(lamigo_failed_job_list); /* failed/other jobs to run */
 struct lamigo_head {
        struct fid_hash_head    lh_hash;
        struct lipe_list_head   lh_list; /* ordered list by record index */
@@ -347,7 +347,8 @@ static __u64 lamigo_last_cleared_index;
 static LIPE_LIST_HEAD(lamigo_agent_list);
 static int lamigo_agent_count;
 static int lamigo_max_jobs; /* max jobs for all agents */
-static int lamigo_jobs_running; /* jobs running at the moment */
+static int lamigo_jobs_running; /* jobs in lamigo_job_list */
+static int lamigo_jobs_delayed; /* jobs in lamigo_failed_job_list */
 static char fsname[MAX_OBD_NAME + 1];
 
 static void *chglog_hdlr;
@@ -1444,6 +1445,7 @@ static int lamigo_submit_job(struct resync_job *rj)
        if (rc < 0) {
                LX_ERROR("cannot spawn a new job: %s\n", strerror(-rc));
                lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list);
+               lamigo_jobs_delayed++;
                return 1;
        }
        /*
@@ -1476,6 +1478,7 @@ static void lamigo_schedule_setprefer(struct resync_job *rj, void *cbdata, int r
        srj->rj_pool = fast_pools->pl_pool;
 
        lipe_list_add_tail(&srj->rj_list, &lamigo_failed_job_list);
+       lamigo_jobs_delayed++;
 }
 
 static int lamigo_update_one(struct fid_rec *f)
@@ -1536,7 +1539,6 @@ static int lamigo_update_one(struct fid_rec *f)
                rj->rj_callback = lamigo_alr_mirror_cb;
        }
 
-
        return lamigo_submit_job(rj);
 }
 
@@ -1576,6 +1578,7 @@ static int lamigo_check_sync(void)
                rj = lipe_list_entry(lamigo_failed_job_list.next,
                                     struct resync_job, rj_list);
                lipe_list_del(&rj->rj_list);
+               lamigo_jobs_delayed--;
                rc = lamigo_submit_job(rj);
                LX_DEBUG("tried to resubmit failed job %p: rc=%d\n", rj, rc);
                if (rc != 0)
@@ -1815,6 +1818,7 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval)
                rj->rj_pid = 0;
                rj->rj_agent = NULL;
                lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list);
+               lamigo_jobs_delayed++;
                rj = NULL;
        } else if (retval == 127) {
                /* likely invalid setup on the agent (missing lfs?) */
@@ -3096,10 +3100,14 @@ static void lamigo_show_progress(void)
                return;
        progress_last_processed = stats.s_processed;
 
-       LX_INFO("%lu processed, %lu replicated, %lu busy, %lu in queue, %lu hot skipped, %lu ro2hot, %lu rw2hot, %lu rw2cold\n",
+       LX_INFO("%lu processed, %lu replicated, %lu busy, %lu running, "
+               "%lu delayed, %lu awaiting, %lu hot skipped, %lu ro2hot, "
+               "%lu rw2hot, %lu rw2cold\n",
                stats.s_processed,
                stats.s_replicated,
                stats.s_busy,
+               lamigo_jobs_running,
+               lamigo_jobs_delayed,
                head.lh_cached_count,
                stats.s_skip_hot,
                stats.s_replicate_ro2hot,
@@ -3481,6 +3489,7 @@ static void lamigo_submit_sync(const struct lu_fid *fid, enum amigo_resync_type
        if (rc == 1) {
                /* probably a dedicated list would be better */
                lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list);
+               lamigo_jobs_delayed++;
        }
 }