From 24a26fb3b203adcc7f26381b2cd3f66bae8668e9 Mon Sep 17 00:00:00 2001 From: Lei Feng Date: Fri, 24 Dec 2021 01:10:47 -0500 Subject: [PATCH] EX-4408 lipe: collect some statistic data with lpcc_purge Collect the number of cached files, min/max/avg file size, min/max/avg age of files in LPCC. Scan cache device forcefully even if it is not full enough to collect the statistics data in time. Change-Id: Id716d4689c83ecc5754e41734e44e7c051d36a8e Signed-off-by: Lei Feng Test-Parameters: trivial testlist=sanity-pcc Reviewed-on: https://review.whamcloud.com/45937 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- lipe/lpcc | 3 + lipe/lpcc.conf | 1 + lipe/man/lpcc.conf.5 | 7 +- lipe/src/lpcc_purge.c | 176 +++++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 163 insertions(+), 24 deletions(-) diff --git a/lipe/lpcc b/lipe/lpcc index 960e1db..e3d0584 100755 --- a/lipe/lpcc +++ b/lipe/lpcc @@ -46,6 +46,7 @@ class LpccService: lpcc_purge_high_usage = 90 lpcc_purge_low_usage = 75 lpcc_purge_interval = 5 + lpcc_purge_force_scan_interval = 1800 lpcc_purge_scan_threads = 1 lpcc_purge_log_level = None @@ -61,6 +62,7 @@ class LpccService: self.lpcc_purge_high_usage = lpcc_purge_obj.get('high_usage', 90) self.lpcc_purge_low_usage = lpcc_purge_obj.get('low_usage', 75) self.lpcc_purge_interval = lpcc_purge_obj.get('interval', 30) + self.lpcc_purge_force_scan_interval = lpcc_purge_obj.get('force_scan_interval', 1800) self.lpcc_purge_scan_threads = lpcc_purge_obj.get('scan_threads', 1) self.lpcc_purge_log_level = lpcc_purge_obj.get('log_level') @@ -130,6 +132,7 @@ class LpccService: '--high-usage', str(self.lpcc_purge_high_usage), \ '--low-usage', str(self.lpcc_purge_low_usage), \ '--interval', str(self.lpcc_purge_interval), \ + '--force-scan-interval', str(self.lpcc_purge_force_scan_interval), \ '--scan-threads', str(self.lpcc_purge_scan_threads), \ '--pidfile', pidfile, \ '--dump', statsfile] diff --git a/lipe/lpcc.conf b/lipe/lpcc.conf index 3a19f93..41e1297 100644 --- a/lipe/lpcc.conf +++ b/lipe/lpcc.conf @@ -7,6 +7,7 @@ # low_usage: 75 # scan_threads: 1 # interval: 30 +# force_scan_interval: 1800 # #- mount: /mnt/lfs # cache: /mnt/pcc2 diff --git a/lipe/man/lpcc.conf.5 b/lipe/man/lpcc.conf.5 index 60976dd..f0c500d 100644 --- a/lipe/man/lpcc.conf.5 +++ b/lipe/man/lpcc.conf.5 @@ -42,6 +42,11 @@ purging. It is 75 (means 75% disk/inode usage) by default. The interval for lpcc_purge to check cache device usage, in seconds. It is 5 seconds by default. .TP +.BR purge.force_scan_interval +The interval for lpcc_purge to scan cache device forcefully even if its usage +is not higher than \fBpurge.high_usage\fR to refresh some statistics data. +It is 1800 seconds by default. +.TP .BR purge.scan_threads How many threads are used to scan cache device in parallel. It is 1 thread by default. @@ -138,4 +143,4 @@ For example, "mtime>{10m}" means the file was modified more than 10 minutes ago; .BR lpcc(8) .BR lpcc-start(8) .BR lpcc-stop(8) -.BR lpcc-status(8) \ No newline at end of file +.BR lpcc-status(8) diff --git a/lipe/src/lpcc_purge.c b/lipe/src/lpcc_purge.c index 0738d21..2cfbaba 100644 --- a/lipe/src/lpcc_purge.c +++ b/lipe/src/lpcc_purge.c @@ -35,8 +35,9 @@ #define DEF_MAX_SCAN_SECS 30 #define MIN_MAX_SCAN_SECS 5 #define MAX_MAX_SCAN_SECS 300 - -#define MAX_ATIME_FID_LEN 128 +#define DEF_FORCE_SCAN_INTERVAL (30 * 60) +#define MIN_FORCE_SCAN_INTERVAL (60) +#define MAX_FORCE_SCAN_INTERVAL (24 * 60 * 60) #define OPT_DRY_RUN 1 #define OPT_CANDIDATE_NUM 2 @@ -45,6 +46,7 @@ #define OPT_MAX_SCAN_SECS 5 #define OPT_PIDFILE 6 #define OPT_VERSION 7 +#define OPT_FORCE_SCAN_INTERVAL 8 #define LPCC_PURGE_LOG(level, fmt, ...) \ llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ##__VA_ARGS__) @@ -66,6 +68,7 @@ struct lpcc_purge_options { char *o_pidfile; bool o_dry_run; bool o_clear_hashdir; + int o_force_scan_interval; }; static struct lpcc_purge_options opt = { .o_rwid = -1, @@ -77,6 +80,7 @@ static struct lpcc_purge_options opt = { .o_max_scan_secs = DEF_MAX_SCAN_SECS, .o_dry_run = false, .o_clear_hashdir = false, + .o_force_scan_interval = DEF_FORCE_SCAN_INTERVAL, }; struct lpcc_purge_stats { @@ -97,11 +101,34 @@ static struct lpcc_purge_stats stats = { .s_lock = PTHREAD_MUTEX_INITIALIZER, }; +struct lpcc_purge_extra_stats { + pthread_mutex_t xs_lock; + int64_t xs_last_scan_time; + uint64_t xs_files; + uint64_t xs_bytes; + uint64_t xs_min_size; + uint64_t xs_max_size; + int64_t xs_min_atime; + int64_t xs_max_atime; + int64_t xs_age0_time_ms; + int64_t xs_total_ages_ms; +}; + +struct lpcc_purge_extra_stats xstats[2] = { + { + .xs_lock = PTHREAD_MUTEX_INITIALIZER, + }, + { + .xs_lock = PTHREAD_MUTEX_INITIALIZER, + } +}; + static struct lipe_instance *instance = NULL; struct lpcc_purge_candidate { - uint64_t c_atime_ms; struct lu_fid c_fid; + uint64_t c_atime_ms; + uint64_t c_size; char c_path[0]; }; @@ -116,7 +143,7 @@ static struct lpcc_purge_candidate_set candidate_set; static double lpcc_purge_get_fs_usage(const char *fs); struct lpcc_purge_candidate *lpcc_purge_candidate_new( - uint64_t atime_ms, struct lu_fid fid, const char *path) + struct lu_fid fid, struct lipe_object_attrs * attr, const char *path) { struct lpcc_purge_candidate *candidate = (struct lpcc_purge_candidate *)malloc( @@ -128,8 +155,9 @@ struct lpcc_purge_candidate *lpcc_purge_candidate_new( exit(1); } - candidate->c_atime_ms = atime_ms; candidate->c_fid = fid; + candidate->c_size = attr->loa_size; + candidate->c_atime_ms = attr->loa_atime_ms; strcpy(candidate->c_path, path); return candidate; @@ -243,6 +271,7 @@ static void lpcc_purge_dump_config_stats(FILE *f) char buff[64]; time_t curr; double usage; + int64_t avg_age_ms; json_object *j_all = json_object_new_object(); @@ -253,10 +282,8 @@ static void lpcc_purge_dump_config_stats(FILE *f) /* time, fsname, and cache usage */ curr = time(NULL); - ctime_r(&curr, buff); - buff[strlen(buff) - 1] = '\0'; - json_object *j_curr_time = json_object_new_string(buff); - json_object_object_add(j_all, "curr_time", j_curr_time); + json_object *j_curr_time = json_object_new_int64(curr); + json_object_object_add(j_all, "snapshot_time_secs", j_curr_time); rc = llapi_get_fsname(opt.o_mount, buff, sizeof(buff)); if (rc) { llapi_error(LLAPI_MSG_FATAL, rc, "Cannot get fsname."); @@ -264,7 +291,7 @@ static void lpcc_purge_dump_config_stats(FILE *f) } json_object_object_add(j_all, "fsname", json_object_new_string(buff)); usage = lpcc_purge_get_fs_usage(opt.o_cache); - json_object_object_add(j_all, "cache_usage", json_object_new_double(usage)); + json_object_object_add(j_all, "cache_usage_pct", json_object_new_double(usage)); /* configuration */ json_object *j_config = json_object_new_object(); @@ -279,6 +306,8 @@ static void lpcc_purge_dump_config_stats(FILE *f) json_object_object_add(j_config, "clear_hashdir", json_object_new_boolean(opt.o_clear_hashdir)); json_object_object_add(j_config, "max_scan_secs", json_object_new_int64(opt.o_max_scan_secs)); json_object_object_add(j_config, "log_level", json_object_new_string(log_level_to_str(llapi_msg_get_level()))); + json_object_object_add(j_config, "force_scan_interval", + json_object_new_int64(opt.o_force_scan_interval)); json_object_object_add(j_all, "config", j_config); /* stats */ @@ -287,18 +316,63 @@ static void lpcc_purge_dump_config_stats(FILE *f) json_object_object_add(j_stats, "total_purged_objs", json_object_new_int64(stats.s_total_purged_objs)); json_object_object_add(j_stats, "total_failed_objs", json_object_new_int64(stats.s_total_failed_objs)); if (stats.s_start_time != 0) { + json_object_object_add(j_stats, "start_time_secs", json_object_new_int64(stats.s_start_time)); ctime_r(&stats.s_start_time, buff); + buff[strlen(buff) - 1] = '\0'; /* remove ending \n */ json_object_object_add(j_stats, "start_time", json_object_new_string(buff)); - json_object_object_add(j_stats, "start_usage", json_object_new_double(stats.s_start_usage)); + + json_object_object_add(j_stats, "start_usage_pct", json_object_new_double(stats.s_start_usage)); } if (stats.s_end_time != 0) { + json_object_object_add(j_stats, "end_time_secs", json_object_new_int64(stats.s_end_time)); ctime_r(&stats.s_end_time, buff); + buff[strlen(buff) - 1] = '\0'; /* remove ending \n */ json_object_object_add(j_stats, "end_time", json_object_new_string(buff)); } json_object_object_add(j_stats, "scanned_objs", json_object_new_int64(stats.s_scanned_objs)); json_object_object_add(j_stats, "purged_objs", json_object_new_int64(stats.s_purged_objs)); json_object_object_add(j_all, "stats", j_stats); + /* xstats */ + json_object *j_xstats = json_object_new_object(); + json_object_object_add(j_xstats, "last_scan_time_secs", + json_object_new_int64(xstats[0].xs_last_scan_time)); + ctime_r(&xstats[0].xs_last_scan_time, buff); + buff[strlen(buff) - 1] = '\0'; /* remove ending \n */ + json_object_object_add(j_xstats, "last_scan_time", + json_object_new_string(buff)); + json_object_object_add(j_xstats, "cached_files", + json_object_new_int64(xstats[0].xs_files)); + json_object_object_add(j_xstats, "cached_bytes", + json_object_new_int64(xstats[0].xs_bytes)); + json_object_object_add(j_xstats, "min_cached_file_size", + json_object_new_int64(xstats[0].xs_min_size)); + json_object_object_add(j_xstats, "max_cached_file_size", + json_object_new_int64(xstats[0].xs_max_size)); + + json_object_object_add(j_xstats, "min_cached_file_atime_secs", + json_object_new_int64(xstats[0].xs_min_atime)); + ctime_r(&xstats[0].xs_min_atime, buff); + buff[strlen(buff) - 1] = '\0'; /* remove ending \n */ + json_object_object_add(j_xstats, "min_cached_file_atime", + json_object_new_string(buff)); + + json_object_object_add(j_xstats, "max_cached_file_atime_secs", + json_object_new_int64(xstats[0].xs_max_atime)); + ctime_r(&xstats[0].xs_max_atime, buff); + buff[strlen(buff) - 1] = '\0'; /* remove ending \n */ + json_object_object_add(j_xstats, "max_cached_file_atime", + json_object_new_string(buff)); + + if (xstats[0].xs_files == 0) + avg_age_ms = 0; + else + avg_age_ms = xstats[0].xs_total_ages_ms / xstats[0].xs_files + + curr * 1000 - xstats[0].xs_age0_time_ms; + json_object_object_add(j_xstats, "average_age_secs", + json_object_new_int64(avg_age_ms / 1000)); + + json_object_object_add(j_all, "cache_stats", j_xstats); /* convert the JSON object to string and print to file */ const char *str = json_object_to_json_string_ext(j_all, JSON_C_TO_STRING_PRETTY | @@ -333,15 +407,16 @@ static void usage(void) "\t-b, --debug, enable debugging output\n" "\t --log-level={debug|info|normal|warn|error|fatal|off}, set log level (default: info)\n" "\t-f, --config-file=FILE\n" - "\t-C, --cache=DIR root directory of PCC\n" - "\t-M, --mount=DIR local Lustre client's mountpoint\n" + "\t-C, --cache=DIR, root directory of PCC\n" + "\t-M, --mount=DIR, local Lustre client's mountpoint\n" "\t-A, --rwid, --roid=NUM the roid/rwid of PCC\n" - "\t-H, --high-usage=NUM %% of space or inode to start purging (default: %u)\n" - "\t-L, --low-usage=NUM %% of space or inode to stop purging (default: %u)\n" + "\t-H, --high-usage=NUM, %% of space or inode to start purging (default: %u)\n" + "\t-L, --low-usage=NUM, %% of space or inode to stop purging (default: %u)\n" "\t-i, --interval=NUM, seconds to next check (default: %u)\n" - "\t-t, --scan-threads=NUM scanning threads (default: %u)\n" + "\t-t, --scan-threads=NUM, scanning threads (default: %u)\n" "\t --candidate-num=NUM, candidate number of approximate LRU (default: %d, min: %d, max: %d)\n" - "\t --max-scan-secs, max seconds to scan continously before purging (default: %d, min: %d, max: %d)\n" + "\t --max-scan-secs=NUM, max seconds to scan continously before purging (default: %d, min: %d, max: %d)\n" + "\t --force-scan-interval=NUM, seconds to next forceful scanning (default: %d, min: %d, max: %d)\n" "\t-w, --dump=FILE, dump stats to FILE when signal USR1 is recieved (default: /var/run/lpcc_purge-RWID.stats)\n" "\t --pidfile=FILE, the pidfile name (default: /var/run/lpcc_purge-RWID.pid)\n" "\t --clear-hashdir, clear empty hash dir after detaching file\n" @@ -355,7 +430,9 @@ static void usage(void) DEF_INTERVAL, DEF_SCAN_THREADS, DEF_CANDIDATE_NUM, MIN_CANDIDATE_NUM, MAX_CANDIDATE_NUM, - DEF_MAX_SCAN_SECS, MIN_MAX_SCAN_SECS, MAX_MAX_SCAN_SECS + DEF_MAX_SCAN_SECS, MIN_MAX_SCAN_SECS, MAX_MAX_SCAN_SECS, + DEF_FORCE_SCAN_INTERVAL, MIN_FORCE_SCAN_INTERVAL, + MAX_FORCE_SCAN_INTERVAL ); } @@ -377,6 +454,8 @@ static struct option long_options[] = { { "pidfile", required_argument, NULL, OPT_PIDFILE}, { "clear-hashdir", no_argument, NULL, OPT_CLEAR_HASHDIR}, { "max-scan-secs", required_argument, NULL, OPT_MAX_SCAN_SECS}, + { "force-scan-interval", required_argument, NULL, + OPT_FORCE_SCAN_INTERVAL}, { "version", no_argument, NULL, OPT_VERSION}, { "help", no_argument, NULL, 'h' }, { NULL } @@ -639,6 +718,17 @@ static void lpcc_purge_process_opt(int c, char *optarg) } opt.o_max_scan_secs = value; break; + case OPT_FORCE_SCAN_INTERVAL: + value = strtol(optarg, &endptr, 10); + if (*endptr != '\0' || value < MIN_FORCE_SCAN_INTERVAL || + value > MAX_FORCE_SCAN_INTERVAL) { + llapi_error(LLAPI_MSG_FATAL, -EINVAL, + "invalid force-scan-interval: '%s'\n", + optarg); + exit(1); + } + opt.o_force_scan_interval = value; + break; case OPT_VERSION: lipe_version(); exit(0); @@ -800,10 +890,14 @@ static double lpcc_purge_get_fs_usage(const char *fs) static void lpcc_purge_wait_for_scan(void) { while (1) { + time_t now = time(NULL); double usage = lpcc_purge_get_fs_usage(opt.o_cache); LPCC_PURGE_LOG(LLAPI_MSG_DEBUG, "usage: %.1f\n", usage); - if (usage >= opt.o_high_usage || opt.o_dry_run) { + if (usage >= opt.o_high_usage || opt.o_dry_run || + (now - xstats[1].xs_last_scan_time) > + opt.o_force_scan_interval) { stats.s_start_usage = usage; + xstats[1].xs_last_scan_time = now; break; } else { sleep(opt.o_interval); @@ -953,6 +1047,28 @@ static int lpcc_purge_detach_candidate(const char *mnt, return 0; } +static void do_xstats(struct lpcc_purge_candidate *candidate) +{ + pthread_mutex_lock(&xstats[1].xs_lock); + + xstats[1].xs_files++; + xstats[1].xs_bytes += candidate->c_size; + if (xstats[1].xs_min_size == 0 || + candidate->c_size < xstats[1].xs_min_size) + xstats[1].xs_min_size = candidate->c_size; + if (candidate->c_size > xstats[1].xs_max_size) + xstats[1].xs_max_size = candidate->c_size; + if (xstats[1].xs_min_atime == 0 || + candidate->c_atime_ms < xstats[1].xs_min_atime * 1000) + xstats[1].xs_min_atime = candidate->c_atime_ms / 1000; + if (candidate->c_atime_ms > xstats[1].xs_max_atime * 1000) + xstats[1].xs_max_atime = candidate->c_atime_ms / 1000; + xstats[1].xs_total_ages_ms += xstats[1].xs_age0_time_ms - + candidate->c_atime_ms; + + pthread_mutex_unlock(&xstats[1].xs_lock); +} + static int lpcc_purge_scan_callback(struct lipe_instance *instance, struct lipe_object *object, struct lipe_object_attrs *attrs) @@ -983,7 +1099,7 @@ static int lpcc_purge_scan_callback(struct lipe_instance *instance, goto out; } - candidate = lpcc_purge_candidate_new(attrs->loa_atime_ms, fid, object->u.lo_posix.lop_path); + candidate = lpcc_purge_candidate_new(fid, attrs, object->u.lo_posix.lop_path); if (candidate == NULL) { rc = -ENOMEM; goto out; @@ -1044,6 +1160,7 @@ static int lpcc_purge_scan_callback(struct lipe_instance *instance, (candidate_set.cs_count - n_detach - n_discard) * sizeof(candidate_set.cs_arr[0])); for (j = candidate_set.cs_count - n_discard; j < candidate_set.cs_count; j++) { + do_xstats(candidate_set.cs_arr[j]); lpcc_purge_candidate_destroy(candidate_set.cs_arr[j]); } candidate_set.cs_count = candidate_set.cs_count - n_detach - n_discard; @@ -1113,10 +1230,13 @@ static void lpcc_purge_free_space(void) for (i = 0; i < candidate_set.cs_count; i++) { double usage = lpcc_purge_get_fs_usage(opt.o_cache); - if (usage >= opt.o_low_usage || opt.o_dry_run) - lpcc_purge_detach_candidate(opt.o_mount, candidate_set.cs_arr[i]); + struct lpcc_purge_candidate *candidate = candidate_set.cs_arr[i]; + + if (stats.s_start_usage > opt.o_high_usage && + (usage >= opt.o_low_usage || opt.o_dry_run)) + lpcc_purge_detach_candidate(opt.o_mount, candidate); else - break; + do_xstats(candidate); } lpcc_purge_candidate_set_clear(); @@ -1150,11 +1270,21 @@ int main(int argc, char *argv[]) stats.s_end_time = 0; stats.s_start_time = time(NULL); + xstats[1].xs_files = 0; + xstats[1].xs_bytes = 0; + xstats[1].xs_min_size = 0; + xstats[1].xs_max_size = 0; + xstats[1].xs_min_atime = 0; + xstats[1].xs_max_atime = 0; + xstats[1].xs_age0_time_ms = stats.s_start_time * 1000; + xstats[1].xs_total_ages_ms = 0; + lpcc_purge_scan(); lpcc_purge_free_space(); stats.s_end_time = time(NULL); + xstats[0] = xstats[1]; LPCC_PURGE_LOG(LLAPI_MSG_INFO, "scanned %ld object, purged %ld\n", -- 1.8.3.1