From 5dc2a5bc9dd1889b6c6a4a955d09007092d2db7d Mon Sep 17 00:00:00 2001 From: Alexandre Ioffe Date: Wed, 7 Dec 2022 22:45:35 -0800 Subject: [PATCH] EX-6497 lipe: Refine stats field name in lamigo Corrected the "processed" INFO message periodically printed by lamigo: - Added two additional fields: "running" - number of currently running jobs such as replication "delayed" - current number of failed and other (such as set flag) jobs which are waiting to be run on the next lamigo cycle - "in queue" field is changed to "awaiting". This is the current number of files in the internal cache. These files are waiting to be processed (replicated) Test-Parameters: trivial testlist=hot-pools Signed-off-by: Alexandre Ioffe Change-Id: Iacf0199cfcf56edcbb8ad91e0e4b62c7451900f5 Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/49344 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Colin Faber Reviewed-by: Andreas Dilger --- lipe/src/lamigo.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/lipe/src/lamigo.c b/lipe/src/lamigo.c index ece683e..6c2eae6 100644 --- a/lipe/src/lamigo.c +++ b/lipe/src/lamigo.c @@ -324,7 +324,7 @@ struct fid_rec { }; static LIPE_LIST_HEAD(lamigo_job_list); -static LIPE_LIST_HEAD(lamigo_failed_job_list); /* failed jobs to repeat */ +static LIPE_LIST_HEAD(lamigo_failed_job_list); /* failed/other jobs to run */ struct lamigo_head { struct fid_hash_head lh_hash; struct lipe_list_head lh_list; /* ordered list by record index */ @@ -347,7 +347,8 @@ static __u64 lamigo_last_cleared_index; static LIPE_LIST_HEAD(lamigo_agent_list); static int lamigo_agent_count; static int lamigo_max_jobs; /* max jobs for all agents */ -static int lamigo_jobs_running; /* jobs running at the moment */ +static int lamigo_jobs_running; /* jobs in lamigo_job_list */ +static int lamigo_jobs_delayed; /* jobs in lamigo_failed_job_list */ static char fsname[MAX_OBD_NAME + 1]; static void *chglog_hdlr; @@ -1444,6 +1445,7 @@ static int 
lamigo_submit_job(struct resync_job *rj) if (rc < 0) { LX_ERROR("cannot spawn a new job: %s\n", strerror(-rc)); lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list); + lamigo_jobs_delayed++; return 1; } /* @@ -1476,6 +1478,7 @@ static void lamigo_schedule_setprefer(struct resync_job *rj, void *cbdata, int r srj->rj_pool = fast_pools->pl_pool; lipe_list_add_tail(&srj->rj_list, &lamigo_failed_job_list); + lamigo_jobs_delayed++; } static int lamigo_update_one(struct fid_rec *f) @@ -1536,7 +1539,6 @@ static int lamigo_update_one(struct fid_rec *f) rj->rj_callback = lamigo_alr_mirror_cb; } - return lamigo_submit_job(rj); } @@ -1576,6 +1578,7 @@ static int lamigo_check_sync(void) rj = lipe_list_entry(lamigo_failed_job_list.next, struct resync_job, rj_list); lipe_list_del(&rj->rj_list); + lamigo_jobs_delayed--; rc = lamigo_submit_job(rj); LX_DEBUG("tried to resubmit failed job %p: rc=%d\n", rj, rc); if (rc != 0) @@ -1815,6 +1818,7 @@ static void lamigo_job_fini(struct resync_job *rj, intptr_t retval) rj->rj_pid = 0; rj->rj_agent = NULL; lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list); + lamigo_jobs_delayed++; rj = NULL; } else if (retval == 127) { /* likely invalid setup on the agent (missing lfs?) 
*/ @@ -3096,10 +3100,14 @@ static void lamigo_show_progress(void) return; progress_last_processed = stats.s_processed; - LX_INFO("%lu processed, %lu replicated, %lu busy, %lu in queue, %lu hot skipped, %lu ro2hot, %lu rw2hot, %lu rw2cold\n", + LX_INFO("%lu processed, %lu replicated, %lu busy, %lu running, " + "%lu delayed, %lu awaiting, %lu hot skipped, %lu ro2hot, " + "%lu rw2hot, %lu rw2cold\n", stats.s_processed, stats.s_replicated, stats.s_busy, + lamigo_jobs_running, + lamigo_jobs_delayed, head.lh_cached_count, stats.s_skip_hot, stats.s_replicate_ro2hot, @@ -3481,6 +3489,7 @@ static void lamigo_submit_sync(const struct lu_fid *fid, enum amigo_resync_type if (rc == 1) { /* probably a dedicated list would be better */ lipe_list_add_tail(&rj->rj_list, &lamigo_failed_job_list); + lamigo_jobs_delayed++; } } -- 1.8.3.1