Whamcloud - gitweb
EX-4408 lipe: collect some statistic data with lpcc_purge
authorLei Feng <flei@whamcloud.com>
Fri, 24 Dec 2021 06:10:47 +0000 (01:10 -0500)
committerAndreas Dilger <adilger@whamcloud.com>
Wed, 9 Mar 2022 17:42:28 +0000 (17:42 +0000)
Collect the number of cached files, min/max/avg file size,
min/max/avg age of files in LPCC. Scan cache device forcefully
even if it is not full enough to collect the statistics data
in time.

Change-Id: Id716d4689c83ecc5754e41734e44e7c051d36a8e
Signed-off-by: Lei Feng <flei@whamcloud.com>
Test-Parameters: trivial testlist=sanity-pcc
Reviewed-on: https://review.whamcloud.com/45937
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lipe/lpcc
lipe/lpcc.conf
lipe/man/lpcc.conf.5
lipe/src/lpcc_purge.c

index 960e1db..e3d0584 100755 (executable)
--- a/lipe/lpcc
+++ b/lipe/lpcc
@@ -46,6 +46,7 @@ class LpccService:
     lpcc_purge_high_usage = 90
     lpcc_purge_low_usage = 75
     lpcc_purge_interval = 5
+    lpcc_purge_force_scan_interval = 1800
     lpcc_purge_scan_threads = 1
     lpcc_purge_log_level = None
 
@@ -61,6 +62,7 @@ class LpccService:
             self.lpcc_purge_high_usage = lpcc_purge_obj.get('high_usage', 90)
             self.lpcc_purge_low_usage = lpcc_purge_obj.get('low_usage', 75)
             self.lpcc_purge_interval = lpcc_purge_obj.get('interval', 30)
+            self.lpcc_purge_force_scan_interval = lpcc_purge_obj.get('force_scan_interval', 1800)
             self.lpcc_purge_scan_threads = lpcc_purge_obj.get('scan_threads', 1)
             self.lpcc_purge_log_level = lpcc_purge_obj.get('log_level')
 
@@ -130,6 +132,7 @@ class LpccService:
                    '--high-usage', str(self.lpcc_purge_high_usage), \
                    '--low-usage', str(self.lpcc_purge_low_usage), \
                    '--interval', str(self.lpcc_purge_interval), \
+                   '--force-scan-interval', str(self.lpcc_purge_force_scan_interval), \
                    '--scan-threads', str(self.lpcc_purge_scan_threads), \
                    '--pidfile', pidfile, \
                    '--dump', statsfile]
index 3a19f93..41e1297 100644 (file)
@@ -7,6 +7,7 @@
 #    low_usage: 75
 #    scan_threads: 1
 #    interval: 30
+#    force_scan_interval: 1800
 #
 #- mount: /mnt/lfs
 #  cache: /mnt/pcc2
index 60976dd..f0c500d 100644 (file)
@@ -42,6 +42,11 @@ purging. It is 75 (means 75% disk/inode usage) by default.
 The interval for lpcc_purge to check cache device usage, in seconds. It is 5
 seconds by default.
 .TP
+.BR purge.force_scan_interval
+The interval for lpcc_purge to scan cache device forcefully even if its usage
+is not higher than \fBpurge.high_usage\fR to refresh some statistics data.
+It is 1800 seconds by default.
+.TP
 .BR purge.scan_threads
 How many threads are used to scan cache device in parallel. It is 1 thread by
 default.
@@ -138,4 +143,4 @@ For example, "mtime>{10m}" means the file was modified more than 10 minutes ago;
 .BR lpcc(8)
 .BR lpcc-start(8)
 .BR lpcc-stop(8)
-.BR lpcc-status(8)
\ No newline at end of file
+.BR lpcc-status(8)
index 0738d21..2cfbaba 100644 (file)
@@ -35,8 +35,9 @@
 #define DEF_MAX_SCAN_SECS              30
 #define MIN_MAX_SCAN_SECS              5
 #define MAX_MAX_SCAN_SECS              300
-
-#define MAX_ATIME_FID_LEN              128
+#define DEF_FORCE_SCAN_INTERVAL                (30 * 60)
+#define MIN_FORCE_SCAN_INTERVAL                (60)
+#define MAX_FORCE_SCAN_INTERVAL                (24 * 60 * 60)
 
 #define OPT_DRY_RUN                    1
 #define OPT_CANDIDATE_NUM              2
@@ -45,6 +46,7 @@
 #define OPT_MAX_SCAN_SECS              5
 #define OPT_PIDFILE                    6
 #define OPT_VERSION                    7
+#define OPT_FORCE_SCAN_INTERVAL                8
 
 #define LPCC_PURGE_LOG(level, fmt, ...) \
        llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ##__VA_ARGS__)
@@ -66,6 +68,7 @@ struct lpcc_purge_options {
        char *o_pidfile;
        bool o_dry_run;
        bool o_clear_hashdir;
+       int o_force_scan_interval;
 };
 static struct lpcc_purge_options opt = {
        .o_rwid                 = -1,
@@ -77,6 +80,7 @@ static struct lpcc_purge_options opt = {
        .o_max_scan_secs        = DEF_MAX_SCAN_SECS,
        .o_dry_run              = false,
        .o_clear_hashdir        = false,
+       .o_force_scan_interval  = DEF_FORCE_SCAN_INTERVAL,
 };
 
 struct lpcc_purge_stats {
@@ -97,11 +101,34 @@ static struct lpcc_purge_stats stats = {
        .s_lock         = PTHREAD_MUTEX_INITIALIZER,
 };
 
+struct lpcc_purge_extra_stats {
+       pthread_mutex_t xs_lock;
+       int64_t xs_last_scan_time;
+       uint64_t xs_files;
+       uint64_t xs_bytes;
+       uint64_t xs_min_size;
+       uint64_t xs_max_size;
+       int64_t xs_min_atime;
+       int64_t xs_max_atime;
+       int64_t xs_age0_time_ms;
+       int64_t xs_total_ages_ms;
+};
+
+struct lpcc_purge_extra_stats xstats[2] = {
+       {
+               .xs_lock = PTHREAD_MUTEX_INITIALIZER,
+       },
+       {
+               .xs_lock = PTHREAD_MUTEX_INITIALIZER,
+       }
+};
+
 static struct lipe_instance *instance = NULL;
 
 struct lpcc_purge_candidate {
-       uint64_t c_atime_ms;
        struct lu_fid c_fid;
+       uint64_t c_atime_ms;
+       uint64_t c_size;
        char c_path[0];
 };
 
@@ -116,7 +143,7 @@ static struct lpcc_purge_candidate_set candidate_set;
 static double lpcc_purge_get_fs_usage(const char *fs);
 
 struct lpcc_purge_candidate *lpcc_purge_candidate_new(
-       uint64_t atime_ms, struct lu_fid fid, const char *path)
+       struct lu_fid fid, struct lipe_object_attrs * attr, const char *path)
 {
        struct lpcc_purge_candidate *candidate =
                (struct lpcc_purge_candidate *)malloc(
@@ -128,8 +155,9 @@ struct lpcc_purge_candidate *lpcc_purge_candidate_new(
                exit(1);
        }
 
-       candidate->c_atime_ms = atime_ms;
        candidate->c_fid = fid;
+       candidate->c_size = attr->loa_size;
+       candidate->c_atime_ms = attr->loa_atime_ms;
        strcpy(candidate->c_path, path);
 
        return candidate;
@@ -243,6 +271,7 @@ static void lpcc_purge_dump_config_stats(FILE *f)
        char buff[64];
        time_t curr;
        double usage;
+       int64_t avg_age_ms;
 
        json_object *j_all = json_object_new_object();
 
@@ -253,10 +282,8 @@ static void lpcc_purge_dump_config_stats(FILE *f)
 
        /* time, fsname, and cache usage */
        curr = time(NULL);
-       ctime_r(&curr, buff);
-       buff[strlen(buff) - 1] = '\0';
-       json_object *j_curr_time = json_object_new_string(buff);
-       json_object_object_add(j_all, "curr_time", j_curr_time);
+       json_object *j_curr_time = json_object_new_int64(curr);
+       json_object_object_add(j_all, "snapshot_time_secs", j_curr_time);
        rc = llapi_get_fsname(opt.o_mount, buff, sizeof(buff));
        if (rc) {
                llapi_error(LLAPI_MSG_FATAL, rc, "Cannot get fsname.");
@@ -264,7 +291,7 @@ static void lpcc_purge_dump_config_stats(FILE *f)
        }
        json_object_object_add(j_all, "fsname", json_object_new_string(buff));
        usage = lpcc_purge_get_fs_usage(opt.o_cache);
-       json_object_object_add(j_all, "cache_usage", json_object_new_double(usage));
+       json_object_object_add(j_all, "cache_usage_pct", json_object_new_double(usage));
 
        /* configuration */
        json_object *j_config = json_object_new_object();
@@ -279,6 +306,8 @@ static void lpcc_purge_dump_config_stats(FILE *f)
        json_object_object_add(j_config, "clear_hashdir", json_object_new_boolean(opt.o_clear_hashdir));
        json_object_object_add(j_config, "max_scan_secs", json_object_new_int64(opt.o_max_scan_secs));
        json_object_object_add(j_config, "log_level", json_object_new_string(log_level_to_str(llapi_msg_get_level())));
+       json_object_object_add(j_config, "force_scan_interval",
+                              json_object_new_int64(opt.o_force_scan_interval));
        json_object_object_add(j_all, "config", j_config);
 
        /* stats */
@@ -287,18 +316,63 @@ static void lpcc_purge_dump_config_stats(FILE *f)
        json_object_object_add(j_stats, "total_purged_objs", json_object_new_int64(stats.s_total_purged_objs));
        json_object_object_add(j_stats, "total_failed_objs", json_object_new_int64(stats.s_total_failed_objs));
        if (stats.s_start_time != 0) {
+               json_object_object_add(j_stats, "start_time_secs", json_object_new_int64(stats.s_start_time));
                ctime_r(&stats.s_start_time, buff);
+               buff[strlen(buff) - 1] = '\0';          /* remove ending \n */
                json_object_object_add(j_stats, "start_time", json_object_new_string(buff));
-               json_object_object_add(j_stats, "start_usage", json_object_new_double(stats.s_start_usage));
+
+               json_object_object_add(j_stats, "start_usage_pct", json_object_new_double(stats.s_start_usage));
        }
        if (stats.s_end_time != 0) {
+               json_object_object_add(j_stats, "end_time_secs", json_object_new_int64(stats.s_end_time));
                ctime_r(&stats.s_end_time, buff);
+               buff[strlen(buff) - 1] = '\0';          /* remove ending \n */
                json_object_object_add(j_stats, "end_time", json_object_new_string(buff));
        }
        json_object_object_add(j_stats, "scanned_objs", json_object_new_int64(stats.s_scanned_objs));
        json_object_object_add(j_stats, "purged_objs", json_object_new_int64(stats.s_purged_objs));
        json_object_object_add(j_all, "stats", j_stats);
 
+       /* xstats */
+       json_object *j_xstats = json_object_new_object();
+       json_object_object_add(j_xstats, "last_scan_time_secs",
+                              json_object_new_int64(xstats[0].xs_last_scan_time));
+       ctime_r(&xstats[0].xs_last_scan_time, buff);
+       buff[strlen(buff) - 1] = '\0';          /* remove ending \n */
+       json_object_object_add(j_xstats, "last_scan_time",
+                              json_object_new_string(buff));
+       json_object_object_add(j_xstats, "cached_files",
+                              json_object_new_int64(xstats[0].xs_files));
+       json_object_object_add(j_xstats, "cached_bytes",
+                              json_object_new_int64(xstats[0].xs_bytes));
+       json_object_object_add(j_xstats, "min_cached_file_size",
+                              json_object_new_int64(xstats[0].xs_min_size));
+       json_object_object_add(j_xstats, "max_cached_file_size",
+                              json_object_new_int64(xstats[0].xs_max_size));
+
+       json_object_object_add(j_xstats, "min_cached_file_atime_secs",
+                             json_object_new_int64(xstats[0].xs_min_atime));
+       ctime_r(&xstats[0].xs_min_atime, buff);
+       buff[strlen(buff) - 1] = '\0';          /* remove ending \n */
+       json_object_object_add(j_xstats, "min_cached_file_atime",
+                              json_object_new_string(buff));
+
+       json_object_object_add(j_xstats, "max_cached_file_atime_secs",
+                              json_object_new_int64(xstats[0].xs_max_atime));
+       ctime_r(&xstats[0].xs_max_atime, buff);
+       buff[strlen(buff) - 1] = '\0';          /* remove ending \n */
+       json_object_object_add(j_xstats, "max_cached_file_atime",
+                              json_object_new_string(buff));
+
+       if (xstats[0].xs_files == 0)
+               avg_age_ms = 0;
+       else
+               avg_age_ms = xstats[0].xs_total_ages_ms / xstats[0].xs_files +
+                         curr * 1000 - xstats[0].xs_age0_time_ms;
+       json_object_object_add(j_xstats, "average_age_secs",
+                              json_object_new_int64(avg_age_ms / 1000));
+
+       json_object_object_add(j_all, "cache_stats", j_xstats);
        /* convert the JSON object to string and print to file */
        const char *str = json_object_to_json_string_ext(j_all,
                JSON_C_TO_STRING_PRETTY |
@@ -333,15 +407,16 @@ static void usage(void)
                "\t-b, --debug, enable debugging output\n"
                "\t    --log-level={debug|info|normal|warn|error|fatal|off}, set log level (default: info)\n"
                "\t-f, --config-file=FILE\n"
-               "\t-C, --cache=DIR root directory of PCC\n"
-               "\t-M, --mount=DIR local Lustre client's mountpoint\n"
+               "\t-C, --cache=DIR, root directory of PCC\n"
+               "\t-M, --mount=DIR, local Lustre client's mountpoint\n"
                "\t-A, --rwid, --roid=NUM the roid/rwid of PCC\n"
-               "\t-H, --high-usage=NUM %% of space or inode to start purging (default: %u)\n"
-               "\t-L, --low-usage=NUM %% of space or inode to stop purging (default: %u)\n"
+               "\t-H, --high-usage=NUM, %% of space or inode to start purging (default: %u)\n"
+               "\t-L, --low-usage=NUM, %% of space or inode to stop purging (default: %u)\n"
                "\t-i, --interval=NUM, seconds to next check (default: %u)\n"
-               "\t-t, --scan-threads=NUM scanning threads (default: %u)\n"
+               "\t-t, --scan-threads=NUM, scanning threads (default: %u)\n"
                "\t    --candidate-num=NUM, candidate number of approximate LRU (default: %d, min: %d, max: %d)\n"
-               "\t    --max-scan-secs, max seconds to scan continously before purging (default: %d, min: %d, max: %d)\n"
+               "\t    --max-scan-secs=NUM, max seconds to scan continously before purging (default: %d, min: %d, max: %d)\n"
+               "\t    --force-scan-interval=NUM, seconds to next forceful scanning (default: %d, min: %d, max: %d)\n"
                "\t-w, --dump=FILE, dump stats to FILE when signal USR1 is recieved (default: /var/run/lpcc_purge-RWID.stats)\n"
                "\t    --pidfile=FILE, the pidfile name (default: /var/run/lpcc_purge-RWID.pid)\n"
                "\t    --clear-hashdir, clear empty hash dir after detaching file\n"
@@ -355,7 +430,9 @@ static void usage(void)
                DEF_INTERVAL,
                DEF_SCAN_THREADS,
                DEF_CANDIDATE_NUM, MIN_CANDIDATE_NUM, MAX_CANDIDATE_NUM,
-               DEF_MAX_SCAN_SECS, MIN_MAX_SCAN_SECS, MAX_MAX_SCAN_SECS
+               DEF_MAX_SCAN_SECS, MIN_MAX_SCAN_SECS, MAX_MAX_SCAN_SECS,
+               DEF_FORCE_SCAN_INTERVAL, MIN_FORCE_SCAN_INTERVAL,
+               MAX_FORCE_SCAN_INTERVAL
        );
 }
 
@@ -377,6 +454,8 @@ static struct option long_options[] = {
        { "pidfile", required_argument, NULL, OPT_PIDFILE},
        { "clear-hashdir", no_argument, NULL, OPT_CLEAR_HASHDIR},
        { "max-scan-secs", required_argument, NULL, OPT_MAX_SCAN_SECS},
+       { "force-scan-interval", required_argument, NULL,
+         OPT_FORCE_SCAN_INTERVAL},
        { "version", no_argument, NULL, OPT_VERSION},
        { "help", no_argument, NULL, 'h' },
        { NULL }
@@ -639,6 +718,17 @@ static void lpcc_purge_process_opt(int c, char *optarg)
                }
                opt.o_max_scan_secs = value;
                break;
+       case OPT_FORCE_SCAN_INTERVAL:
+               value = strtol(optarg, &endptr, 10);
+               if (*endptr != '\0' || value < MIN_FORCE_SCAN_INTERVAL ||
+                   value > MAX_FORCE_SCAN_INTERVAL) {
+                       llapi_error(LLAPI_MSG_FATAL, -EINVAL,
+                                   "invalid force-scan-interval: '%s'\n",
+                                   optarg);
+                       exit(1);
+               }
+               opt.o_force_scan_interval = value;
+               break;
        case OPT_VERSION:
                lipe_version();
                exit(0);
@@ -800,10 +890,14 @@ static double lpcc_purge_get_fs_usage(const char *fs)
 static void lpcc_purge_wait_for_scan(void)
 {
        while (1) {
+               time_t now = time(NULL);
                double usage = lpcc_purge_get_fs_usage(opt.o_cache);
                LPCC_PURGE_LOG(LLAPI_MSG_DEBUG, "usage: %.1f\n", usage);
-               if (usage >= opt.o_high_usage || opt.o_dry_run) {
+               if (usage >= opt.o_high_usage || opt.o_dry_run ||
+                   (now - xstats[1].xs_last_scan_time) >
+                    opt.o_force_scan_interval) {
                        stats.s_start_usage = usage;
+                       xstats[1].xs_last_scan_time = now;
                        break;
                } else {
                        sleep(opt.o_interval);
@@ -953,6 +1047,28 @@ static int lpcc_purge_detach_candidate(const char *mnt,
        return 0;
 }
 
+static void do_xstats(struct lpcc_purge_candidate *candidate)
+{
+       pthread_mutex_lock(&xstats[1].xs_lock);
+
+       xstats[1].xs_files++;
+       xstats[1].xs_bytes += candidate->c_size;
+       if (xstats[1].xs_min_size == 0 ||
+           candidate->c_size < xstats[1].xs_min_size)
+               xstats[1].xs_min_size = candidate->c_size;
+       if (candidate->c_size > xstats[1].xs_max_size)
+               xstats[1].xs_max_size = candidate->c_size;
+       if (xstats[1].xs_min_atime == 0 ||
+           candidate->c_atime_ms < xstats[1].xs_min_atime * 1000)
+               xstats[1].xs_min_atime = candidate->c_atime_ms / 1000;
+       if (candidate->c_atime_ms > xstats[1].xs_max_atime * 1000)
+               xstats[1].xs_max_atime = candidate->c_atime_ms / 1000;
+       xstats[1].xs_total_ages_ms += xstats[1].xs_age0_time_ms -
+                                     candidate->c_atime_ms;
+
+       pthread_mutex_unlock(&xstats[1].xs_lock);
+}
+
 static int lpcc_purge_scan_callback(struct lipe_instance *instance,
                             struct lipe_object *object,
                             struct lipe_object_attrs *attrs)
@@ -983,7 +1099,7 @@ static int lpcc_purge_scan_callback(struct lipe_instance *instance,
                goto out;
        }
 
-       candidate = lpcc_purge_candidate_new(attrs->loa_atime_ms, fid, object->u.lo_posix.lop_path);
+       candidate = lpcc_purge_candidate_new(fid, attrs, object->u.lo_posix.lop_path);
        if (candidate == NULL) {
                rc = -ENOMEM;
                goto out;
@@ -1044,6 +1160,7 @@ static int lpcc_purge_scan_callback(struct lipe_instance *instance,
                (candidate_set.cs_count - n_detach - n_discard) * sizeof(candidate_set.cs_arr[0]));
 
        for (j = candidate_set.cs_count - n_discard; j < candidate_set.cs_count; j++) {
+               do_xstats(candidate_set.cs_arr[j]);
                lpcc_purge_candidate_destroy(candidate_set.cs_arr[j]);
        }
        candidate_set.cs_count = candidate_set.cs_count - n_detach - n_discard;
@@ -1113,10 +1230,13 @@ static void lpcc_purge_free_space(void)
 
        for (i = 0; i < candidate_set.cs_count; i++) {
                double usage = lpcc_purge_get_fs_usage(opt.o_cache);
-               if (usage >= opt.o_low_usage || opt.o_dry_run)
-                       lpcc_purge_detach_candidate(opt.o_mount, candidate_set.cs_arr[i]);
+               struct lpcc_purge_candidate *candidate = candidate_set.cs_arr[i];
+
+               if (stats.s_start_usage > opt.o_high_usage &&
+                   (usage >= opt.o_low_usage || opt.o_dry_run))
+                       lpcc_purge_detach_candidate(opt.o_mount, candidate);
                else
-                       break;
+                       do_xstats(candidate);
        }
 
        lpcc_purge_candidate_set_clear();
@@ -1150,11 +1270,21 @@ int main(int argc, char *argv[])
                stats.s_end_time = 0;
                stats.s_start_time = time(NULL);
 
+               xstats[1].xs_files = 0;
+               xstats[1].xs_bytes = 0;
+               xstats[1].xs_min_size = 0;
+               xstats[1].xs_max_size = 0;
+               xstats[1].xs_min_atime = 0;
+               xstats[1].xs_max_atime = 0;
+               xstats[1].xs_age0_time_ms = stats.s_start_time * 1000;
+               xstats[1].xs_total_ages_ms = 0;
+
                lpcc_purge_scan();
 
                lpcc_purge_free_space();
 
                stats.s_end_time = time(NULL);
+               xstats[0] = xstats[1];
 
                LPCC_PURGE_LOG(LLAPI_MSG_INFO,
                             "scanned %ld object, purged %ld\n",