3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License version 2 only,
7 * as published by the Free Software Foundation.
9 * This program is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License version 2 for more details (a copy is included
13 * in the LICENSE file that accompanied this code).
15 * You should have received a copy of the GNU General Public License
16 * version 2 along with this program; If not, see
17 * http://www.gnu.org/licenses/gpl-2.0.html
22 * Copyright (c) 2012, 2016, Intel Corporation.
23 * Use is subject to license terms.
25 * Author: Niu Yawei <niu@whamcloud.com>
28 * lustre/obdclass/lprocfs_jobstats.c
31 #define DEBUG_SUBSYSTEM S_CLASS
33 #include <obd_class.h>
34 #include <lprocfs_status.h>
39 * JobID formats & JobID environment variable names for supported
43 * JobID format: 32 bit integer.
44 * JobID env var: SLURM_JOB_ID.
46 * JobID format: Decimal integer range to 99999.
47 * JobID env var: JOB_ID.
49 * JobID format: 6 digit integer by default (up to 999999), can be
50 * increased to 10 digit (up to 2147483646).
51 * JobID env var: LSB_JOBID.
53 * JobID format: String of machine_name.cluster_id.process_id, for
54 * example: fr2n02.32.0
55 * JobID env var: LOADL_STEP_ID.
57 * JobID format: String of sequence_number[.server_name][@server].
58 * JobID env var: PBS_JOBID.
60 * JobID format: Same as PBS.
61 * JobID env var: Same as PBS.
65 struct hlist_node js_hash; /* hash struct for this jobid */
66 struct list_head js_list; /* on ojs_list, with ojs_lock */
67 struct kref js_refcount; /* num users of this struct */
68 char js_jobid[LUSTRE_JOBID_SIZE]; /* job name + NUL*/
69 ktime_t js_time_init; /* time of initial stat*/
70 ktime_t js_time_latest; /* time of most recent stat*/
71 struct lprocfs_stats *js_stats; /* per-job statistics */
72 struct obd_job_stats *js_jobstats; /* for accessing ojs_lock */
73 struct rcu_head js_rcu; /* RCU head for job_reclaim_rcu*/
/* Hash callback (.hs_hash): hash the NUL-terminated jobid string \a key. */
static unsigned int job_stat_hash(struct cfs_hash *hs, const void *key,
				  const unsigned int bits)
{
	const size_t key_len = strlen(key);

	return cfs_hash_djb2_hash(key, key_len, bits);
}
82 static void *job_stat_key(struct hlist_node *hnode)
85 job = hlist_entry(hnode, struct job_stat, js_hash);
89 static int job_stat_keycmp(const void *key, struct hlist_node *hnode)
92 job = hlist_entry(hnode, struct job_stat, js_hash);
93 return (strlen(job->js_jobid) == strlen(key)) &&
94 !strncmp(job->js_jobid, key, strlen(key));
97 static void *job_stat_object(struct hlist_node *hnode)
99 return hlist_entry(hnode, struct job_stat, js_hash);
102 static bool job_getref_try(struct job_stat *job)
104 return kref_get_unless_zero(&job->js_refcount);
107 static void job_stat_get(struct cfs_hash *hs, struct hlist_node *hnode)
109 struct job_stat *job;
110 job = hlist_entry(hnode, struct job_stat, js_hash);
111 kref_get(&job->js_refcount);
/* RCU callback, queued with call_rcu() from job_free(): recover the
 * job_stat that embeds \a head and release its counters with
 * lprocfs_stats_free().
 * NOTE(review): no statement releasing the job_stat itself is visible in
 * this excerpt; confirm against the full file that it is freed here.
 */
114 static void job_reclaim_rcu(struct rcu_head *head)
116 struct job_stat *job = container_of(head, typeof(*job), js_rcu);
118 lprocfs_stats_free(&job->js_stats);
122 static void job_free(struct kref *kref)
124 struct job_stat *job = container_of(kref, struct job_stat,
127 LASSERT(job->js_jobstats != NULL);
128 spin_lock(&job->js_jobstats->ojs_lock);
129 list_del_rcu(&job->js_list);
130 spin_unlock(&job->js_jobstats->ojs_lock);
132 call_rcu(&job->js_rcu, job_reclaim_rcu);
135 static void job_putref(struct job_stat *job)
137 LASSERT(kref_read(&job->js_refcount) > 0);
138 kref_put(&job->js_refcount, job_free);
141 static void job_stat_put_locked(struct cfs_hash *hs, struct hlist_node *hnode)
143 struct job_stat *job;
145 job = hlist_entry(hnode, struct job_stat, js_hash);
/*
 * Hash callback (.hs_exit): only logs an error, because reaching it means an
 * entry was still present when none was expected.
 */
static void job_stat_exit(struct cfs_hash *hs, struct hlist_node *hnode)
{
	CERROR("should not have any items\n");
}
154 static struct cfs_hash_ops job_stats_hash_ops = {
155 .hs_hash = job_stat_hash,
156 .hs_key = job_stat_key,
157 .hs_keycmp = job_stat_keycmp,
158 .hs_object = job_stat_object,
159 .hs_get = job_stat_get,
160 .hs_put_locked = job_stat_put_locked,
161 .hs_exit = job_stat_exit,
165 * Jobstats expiry iterator to clean up old jobids
167 * Called for each job_stat structure on this device, it should delete stats
168 * older than the specified \a oldest_time in seconds. If \a oldest_time is
169 * in the future then this will delete all statistics (e.g. during shutdown).
171 * \param[in] hs hash of all jobids on this device
172 * \param[in] bd hash bucket containing this jobid
173 * \param[in] hnode hash structure for this jobid
174 * \param[in] data pointer to stats expiry time in seconds
176 static int job_cleanup_iter_callback(struct cfs_hash *hs,
177 struct cfs_hash_bd *bd,
178 struct hlist_node *hnode, void *data)
180 ktime_t oldest_time = *((ktime_t *)data);
181 struct job_stat *job;
183 job = hlist_entry(hnode, struct job_stat, js_hash);
184 if (ktime_before(job->js_time_latest, oldest_time))
185 cfs_hash_bd_del_locked(hs, bd, hnode);
191 * Clean up jobstats that were last updated more than twice ojs_cleanup_interval ago.
193 * Since this function may be called frequently, do not scan all of the
194 * jobstats on each call, only twice per cleanup interval. That means stats
195 * may be on average around cleanup_interval / 4 older than the cleanup
196 * interval, but that is not considered harmful.
198 * The value stored in ojs_cleanup_interval is how often to perform a cleanup
199 * scan, and 1/2 of the maximum age of the individual statistics. This is
200 * done rather than dividing the interval by two each time, because it is
201 * much easier to do the division when the value is initially set (in seconds)
202 * rather than after it has been converted to ktime_t, and maybe a bit faster.
204 * If \a clear is true then this will force clean up all jobstats
205 * (e.g. at shutdown).
207 * If there is already another thread doing jobstats cleanup, don't try to
208 * do this again in the current thread unless this is a force cleanup.
210 * \param[in] stats structure tracking all job stats for this device
211 * \param[in] clear clear all job stats if true
/* Expire job stats for one device.
 *
 * The cutoff handed to job_cleanup_iter_callback() is "now" minus twice
 * ojs_cleanup_interval, or "now" plus that amount when \a clear is set so
 * that every entry qualifies.  ojs_cleaning marks a scan in progress and
 * ojs_cleanup_last records when the last scan finished; both are updated
 * under ojs_lock.
 * NOTE(review): several statements of this function (early returns, the
 * declaration of "oldest", the branch selecting the cutoff) are not visible
 * in this excerpt.
 */
213 static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool clear)
215 ktime_t cleanup_interval = stats->ojs_cleanup_interval;
216 ktime_t now = ktime_get_real();
/* the throttling checks below apply only when this is not a forced clear */
219 if (likely(!clear)) {
220 /* ojs_cleanup_interval of zero means never clean up stats */
221 if (ktime_to_ns(cleanup_interval) == 0)
/* has a full ojs_cleanup_interval passed since the previous scan? */
224 if (ktime_before(now, ktime_add(stats->ojs_cleanup_last,
/* unlocked peek at the flag; it is tested again under ojs_lock below */
228 if (stats->ojs_cleaning)
/* re-test and claim the ojs_cleaning flag under ojs_lock */
232 spin_lock(&stats->ojs_lock);
233 if (!clear && stats->ojs_cleaning) {
234 spin_unlock(&stats->ojs_lock);
238 stats->ojs_cleaning = true;
239 spin_unlock(&stats->ojs_lock);
241 /* Can't hold ojs_lock over hash iteration, since it is grabbed by
242 * job_cleanup_iter_callback()
243 * ->cfs_hash_bd_del_locked()
247 * Holding ojs_lock isn't necessary for safety of the hash iteration,
248 * since locking of the hash is handled internally, but there isn't
249 * any benefit to having multiple threads doing cleanup at one time.
251 * Subtract or add twice the cleanup_interval, since it is 1/2 the
252 * maximum age. When clearing all stats, push oldest into the future.
254 cleanup_interval = ktime_add(cleanup_interval, cleanup_interval);
256 oldest = ktime_sub(now, cleanup_interval);
258 oldest = ktime_add(now, cleanup_interval);
259 cfs_hash_for_each_safe(stats->ojs_hash, job_cleanup_iter_callback,
262 spin_lock(&stats->ojs_lock);
263 stats->ojs_cleaning = false;
264 stats->ojs_cleanup_last = ktime_get_real();
265 spin_unlock(&stats->ojs_lock);
/* Build a new, not yet hashed job_stat for \a jobid owned by \a jobs.
 *
 * The counters are allocated with lprocfs_stats_alloc() (ojs_cntr_num of
 * them) and set up through the owner's ojs_cntr_init_fn callback.  The hash
 * node and list head are initialised unlinked, and kref_init() leaves the
 * caller holding the initial reference.
 * NOTE(review): the allocation of the job_stat itself, the failure paths and
 * the final return are not visible in this excerpt.
 */
268 static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
270 struct job_stat *job;
276 job->js_stats = lprocfs_stats_alloc(jobs->ojs_cntr_num, 0);
277 if (job->js_stats == NULL) {
282 jobs->ojs_cntr_init_fn(job->js_stats, 0, 0);
/* NOTE(review): this copies sizeof(js_jobid) bytes whatever strlen(jobid)
 * is, so it reads past the terminating NUL of a shorter string; confirm
 * every caller passes a buffer of at least LUSTRE_JOBID_SIZE bytes.
 */
284 memcpy(job->js_jobid, jobid, sizeof(job->js_jobid));
/* a new entry's last-activity time starts at the counters' init time */
285 job->js_time_latest = job->js_stats->ls_init;
286 job->js_jobstats = jobs;
287 INIT_HLIST_NODE(&job->js_hash);
288 INIT_LIST_HEAD(&job->js_list);
289 kref_init(&job->js_refcount);
/* Account \a amount against counter \a event for job \a jobid on \a obd.
 *
 * The entry is looked up in the device's jobid hash.  If it has to be
 * created, old entries are first given a chance to expire through
 * lprocfs_job_cleanup(), a new job_stat is built with job_alloc() and
 * inserted with cfs_hash_findadd_unique(), and the entry is then linked onto
 * ojs_list under ojs_lock.  Finally js_time_latest is refreshed and the
 * counter is updated.
 *
 * \param[in] obd	device whose obt_jobstats is updated
 * \param[in] jobid	NUL-terminated job name, shorter than LUSTRE_JOBID_SIZE
 * \param[in] event	counter index, checked against ojs_cntr_num
 * \param[in] amount	value passed to lprocfs_counter_add()
 *
 * NOTE(review): the return statements and the branch handling a lost
 * insertion race are not visible in this excerpt.
 */
294 int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
295 int event, long amount)
297 struct obd_job_stats *stats = &obd2obt(obd)->obt_jobstats;
298 struct job_stat *job, *job2;
301 LASSERT(stats != NULL);
302 LASSERT(stats->ojs_hash != NULL);
/* reject counter indexes at or beyond the configured counter count */
304 if (event >= stats->ojs_cntr_num)
/* a missing or empty jobid cannot be accounted */
307 if (jobid == NULL || strlen(jobid) == 0)
310 /* unterminated jobid should be handled in lustre_msg_get_jobid() */
311 if (strlen(jobid) >= LUSTRE_JOBID_SIZE) {
312 CERROR("%s: invalid jobid size %lu, expect %d\n", obd->obd_name,
313 (unsigned long)strlen(jobid) + 1, LUSTRE_JOBID_SIZE);
317 job = cfs_hash_lookup(stats->ojs_hash, jobid);
/* non-forced cleanup: lprocfs_job_cleanup() throttles itself */
321 lprocfs_job_cleanup(stats, false);
323 job = job_alloc(jobid, stats);
/* insert keyed by the entry's own copy of the jobid */
327 job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid,
332 /* We cannot LASSERT(!list_empty(&job->js_list)) here,
333 * since we just lost the race for inserting "job" into the
334 * ojs_list, and some other thread is doing it _right_now_.
335 * Instead, be content the other thread is doing this, since
336 * "job2" was initialized in job_alloc() already. LU-2163 */
/* link the entry onto ojs_list under ojs_lock */
338 LASSERT(list_empty(&job->js_list));
339 spin_lock(&stats->ojs_lock);
340 list_add_tail_rcu(&job->js_list, &stats->ojs_list);
341 spin_unlock(&stats->ojs_lock);
/* refresh the last-activity time and record the sample */
345 LASSERT(stats == job->js_jobstats);
346 job->js_time_latest = ktime_get_real();
347 lprocfs_counter_add(job->js_stats, event, amount);
353 EXPORT_SYMBOL(lprocfs_job_stats_log);
355 void lprocfs_job_stats_fini(struct obd_device *obd)
357 struct obd_job_stats *stats = &obd2obt(obd)->obt_jobstats;
359 if (stats->ojs_hash == NULL)
362 lprocfs_job_cleanup(stats, true);
363 cfs_hash_putref(stats->ojs_hash);
364 stats->ojs_hash = NULL;
365 LASSERT(list_empty(&stats->ojs_list));
367 EXPORT_SYMBOL(lprocfs_job_stats_fini);
370 struct lprocfs_jobstats_data {
371 struct obd_job_stats *pjd_stats;
373 struct job_stat *pjd_last_job;
/* seq_file ->start(): choose the element at which a read resumes.
 *
 * Returns SEQ_START_TOKEN (the header), or the job saved by the previous
 * ->stop() when its recorded position equals the requested offset, or
 * otherwise walks ojs_list with the RCU list iterator.
 * NOTE(review): the declaration of "off", the condition guarding the
 * SEQ_START_TOKEN return, the loop body and any RCU read-side locking are
 * not visible in this excerpt.
 */
376 static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos)
378 struct lprocfs_jobstats_data *data = p->private;
379 struct obd_job_stats *stats = data->pjd_stats;
381 struct job_stat *job;
385 return SEQ_START_TOKEN;
387 /* if pos matches the offset of last saved job, start from saved job */
388 if (data->pjd_last_job && data->pjd_last_pos == off)
389 return data->pjd_last_job;
/* no usable saved job: scan the list from its head */
392 list_for_each_entry_rcu(job, &stats->ojs_list, js_list) {
/* seq_file ->stop(): remember where this read ended.
 *
 * Tries to take a reference on the current element \a v, drops the
 * reference held on the previously saved job, and stores a job in
 * pjd_last_job so the next ->start() can resume from it.
 * NOTE(review): the statements assigning "job" and any RCU read-side unlock
 * are not visible in this excerpt.
 */
399 static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v)
401 struct lprocfs_jobstats_data *data = p->private;
402 struct job_stat *job = NULL;
404 /* try to get a ref on current job (not deleted) */
405 if (v && v != SEQ_START_TOKEN && job_getref_try(v))
410 /* drop the ref on the old saved job */
411 if (data->pjd_last_job) {
412 job_putref(data->pjd_last_job);
413 data->pjd_last_job = NULL;
416 /* save the current job for the next read */
418 data->pjd_last_job = job;
/* seq_file ->next(): step to the entry after \a v on ojs_list.
 *
 * Records the position in pjd_last_pos.  After the header token the walk
 * starts from the list head.  The candidate is the entry after "cur", and
 * reaching the list head again is detected by comparing addresses.
 * NOTE(review): the update of *pos, the assignment of "cur" for a real job
 * and the return statements are not visible in this excerpt.
 */
421 static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos)
423 struct lprocfs_jobstats_data *data = p->private;
424 struct obd_job_stats *stats = data->pjd_stats;
425 struct job_stat *job;
426 struct list_head *cur;
429 data->pjd_last_pos = *pos;
430 if (v == SEQ_START_TOKEN) {
431 cur = &stats->ojs_list;
433 job = (struct job_stat *)v;
/* has the walk wrapped back around to the list head? */
437 job = list_entry_rcu(cur->next, struct job_stat, js_list);
438 if (&job->js_list == &stats->ojs_list)
445 * Example of output on MDT:
449 * snapshot_time: 1322494486.123456789
450 * start_time: 1322494476.012345678
451 * elapsed_time: 10.111111111
452 * open: { samples: 1, unit: reqs }
453 * close: { samples: 1, unit: reqs }
454 * mknod: { samples: 0, unit: reqs }
455 * link: { samples: 0, unit: reqs }
456 * unlink: { samples: 0, unit: reqs }
457 * mkdir: { samples: 0, unit: reqs }
458 * rmdir: { samples: 0, unit: reqs }
459 * rename: { samples: 0, unit: reqs }
460 * getattr: { samples: 1, unit: reqs }
461 * setattr: { samples: 0, unit: reqs }
462 * getxattr: { samples: 0, unit: reqs }
463 * setxattr: { samples: 0, unit: reqs }
464 * statfs: { samples: 0, unit: reqs }
465 * sync: { samples: 0, unit: reqs }
467 * Example of output on OST:
471 * snapshot_time: 1322494602.123456789
472 * start_time: 1322494592.987654321
473 * elapsed_time: 9.135802468
474 * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 }
475 * write: { samples: 1, unit: bytes, min: 4096, max: 4096, sum: 4096 }
476 * setattr: { samples: 0, unit: reqs }
477 * punch: { samples: 0, unit: reqs }
478 * sync: { samples: 0, unit: reqs }
/*
 * Blank padding consumed through "%.*s" when aligning counter names.
 * width() can request up to 15 characters, so the string must hold at
 * least that many blanks or the columns will not line up.
 */
static const char spaces[] = "                    ";

/*
 * Number of pad characters needed after \a str to fill a field of \a len
 * columns.  Names longer than 15 characters are counted as 15.
 *
 * \param[in] str	NUL-terminated counter name
 * \param[in] len	target field width
 *
 * \retval		len minus the (capped) name length
 */
static inline int width(const char *str, int len)
{
	int name_len = (int)strlen(str);

	if (name_len > 15)
		name_len = 15;

	return len - name_len;
}
/* seq_file ->show(): print the header token or one job's statistics.
 *
 * For SEQ_START_TOKEN only the "job_stats:" heading is written.  For a job,
 * the jobid is copied into "escaped" with non-standard characters replaced
 * by "\xHH" codes, then one line is printed per counter.  Min/max/sum and
 * sumsq are added according to the counter's lc_config flags, and histogram
 * buckets are printed from lc_hist.
 * NOTE(review): many statements (declarations of i/j/joblen, the quote
 * assignments, loop increments, returns) are not visible in this excerpt,
 * and the whitespace inside the format strings may have been collapsed.
 */
488 static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v)
490 struct job_stat *job = v;
491 struct lprocfs_stats *s;
492 struct lprocfs_counter ret;
493 struct lprocfs_counter_header *cntr_header;
494 char escaped[LUSTRE_JOBID_SIZE * 4] = "";
495 char *quote = "", *c, *end;
498 if (v == SEQ_START_TOKEN) {
499 seq_puts(p, "job_stats:\n");
/* NOTE(review): the snprintf() in the loop below writes at escaped + joblen
 * but is bounded by sizeof(escaped) rather than the space remaining after
 * joblen; confirm that it cannot overrun the buffer.
 */
503 /* Quote and escape jobid characters to escape hex codes "\xHH" if
504 * it contains any non-standard characters (space, newline, etc),
505 * so it will be confined to single line and not break parsing.
507 for (c = job->js_jobid, end = job->js_jobid + sizeof(job->js_jobid);
508 c < end && *c != '\0';
510 if (!isalnum(*c) && strchr(".@-_:/", *c) == NULL) {
512 snprintf(escaped + joblen, sizeof(escaped), "\\x%02X",
516 escaped[joblen] = *c;
517 /* if jobid has ':', it should be quoted too */
522 /* '@' is reserved in YAML, so it cannot start a bare string. */
523 if (escaped[0] == '@')
/* job header line, followed by the timestamp header */
526 seq_printf(p, "- %-16s %s%*s%s\n",
527 "job_id:", quote, joblen, escaped, quote);
528 lprocfs_stats_header(p, job->js_time_latest, job->js_stats->ls_init,
/* one output line per counter */
532 for (i = 0; i < s->ls_num; i++) {
533 struct obd_histogram *hist;
535 cntr_header = &s->ls_cnt_header[i];
536 lprocfs_stats_collect(s, i, &ret);
/* counter name padded with blanks from "spaces" via width() */
538 seq_printf(p, " %s:%.*s { samples: %11llu",
539 cntr_header->lc_name,
540 width(cntr_header->lc_name, 15), spaces,
542 if (cntr_header->lc_units[0] != '\0')
543 seq_printf(p, ", unit: %5s", cntr_header->lc_units);
/* min/max/sum print as 0 while the counter has no samples */
545 if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
546 seq_printf(p, ", min: %8llu, max: %8llu, sum: %16llu",
547 ret.lc_count ? ret.lc_min : 0,
548 ret.lc_count ? ret.lc_max : 0,
549 ret.lc_count ? ret.lc_sum : 0);
551 if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) {
552 seq_printf(p, ", sumsq: %18llu",
553 ret.lc_count ? ret.lc_sumsquare : 0);
556 /* show obd_histogram */
557 hist = s->ls_cnt_header[i].lc_hist;
562 seq_puts(p, ", hist: { ");
/* bucket labels are BIT(j), or BIT(j - 10/20/30) with a K/M/G suffix */
563 for (j = 0; j < ARRAY_SIZE(hist->oh_buckets); j++) {
564 unsigned long val = hist->oh_buckets[j];
574 seq_printf(p, "%lu: %lu", BIT(j), val);
576 seq_printf(p, "%luK: %lu", BIT(j - 10),
579 seq_printf(p, "%luM: %lu", BIT(j - 20),
582 seq_printf(p, "%luG: %lu", BIT(j - 30),
593 static const struct seq_operations lprocfs_jobstats_seq_sops = {
594 .start = lprocfs_jobstats_seq_start,
595 .stop = lprocfs_jobstats_seq_stop,
596 .next = lprocfs_jobstats_seq_next,
597 .show = lprocfs_jobstats_seq_show,
/* ->proc_open(): open the seq file and set up its per-open state.
 *
 * After seq_open() the lprocfs_jobstats_data is filled in with the device's
 * job stats (taken from pde_data()), no saved job and position 0.
 * NOTE(review): the handling of seq_open()'s result, the allocation of
 * "data", its attachment to the seq_file and the return are not visible in
 * this excerpt.  When checking the full file, confirm that the seq_file is
 * released if setting up "data" fails after seq_open() succeeded.
 */
600 static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file)
602 struct lprocfs_jobstats_data *data = NULL;
603 struct seq_file *seq;
606 rc = seq_open(file, &lprocfs_jobstats_seq_sops);
614 data->pjd_stats = pde_data(inode);
615 data->pjd_last_job = NULL;
616 data->pjd_last_pos = 0;
617 seq = file->private_data;
/* ->proc_write(): clear job stats on request from userspace.
 *
 * The written text is copied into a local buffer of 4 * LUSTRE_JOBID_SIZE
 * bytes.  A double-quoted value has its "\xHH" escapes decoded in place,
 * and the result is cut to fit LUSTRE_JOBID_SIZE.  The word "clear" forces
 * cleanup of every entry; any other non-empty value is looked up as a jobid
 * and removed from the hash with cfs_hash_del_key().
 * NOTE(review): the return statements, NUL termination after the copy, the
 * newline trim and the body of the decode loop are not visible in this
 * excerpt.
 */
622 static ssize_t lprocfs_jobstats_seq_write(struct file *file,
623 const char __user *buf,
624 size_t len, loff_t *off)
626 struct seq_file *seq = file->private_data;
627 struct lprocfs_jobstats_data *data = seq->private;
628 struct obd_job_stats *stats = data->pjd_stats;
629 char jobid[4 * LUSTRE_JOBID_SIZE]; /* all escaped chars, plus ""\n\0 */
630 char *p1, *p2, *last;
632 struct job_stat *job;
/* the input must be non-empty and leave room within the local buffer */
634 if (len == 0 || len >= 4 * LUSTRE_JOBID_SIZE)
/* job stats are not set up (or already torn down) for this device */
637 if (stats->ojs_hash == NULL)
640 if (copy_from_user(jobid, buf, len))
643 last = jobid + len - 1;
645 /* Trim '\n' if any */
649 /* decode escaped chars if jobid is a quoted string */
650 if (jobid[0] == '"' && *last == '"') {
/* p1 is the write cursor and p2 the read cursor, starting past the quote */
653 for (p1 = jobid, p2 = jobid + 1; p2 <= last; p1++, p2++) {
656 } else if (p2 + 3 <= last && *(p2 + 1) == 'x' &&
657 sscanf(p2 + 2, "%02X", &c) == 1) {
/* whatever was written, the key used below fits LUSTRE_JOBID_SIZE */
667 jobid[LUSTRE_JOBID_SIZE - 1] = 0;
669 if (strcmp(jobid, "clear") == 0) {
670 lprocfs_job_cleanup(stats, true);
675 if (strlen(jobid) == 0)
678 job = cfs_hash_lookup(stats->ojs_hash, jobid);
682 cfs_hash_del_key(stats->ojs_hash, jobid);
689 * Clean up the seq file state when the /proc file is closed.
691 * This also expires old job stats from the cache after they have been
692 * printed in case the system is idle and not generating new jobstats.
694 * \param[in] inode struct inode for seq file being closed
695 * \param[in] file struct file for seq file being closed
697 * \retval 0 on success
698 * \retval negative errno on failure
/* ->proc_release(): drop the reference on the job saved by the last
 * ->stop(), run a non-forced cleanup of old job stats, then finish through
 * lprocfs_seq_release().
 * NOTE(review): the release of the lprocfs_jobstats_data is not visible in
 * this excerpt.
 */
700 static int lprocfs_jobstats_seq_release(struct inode *inode, struct file *file)
702 struct seq_file *seq = file->private_data;
703 struct lprocfs_jobstats_data *data = seq->private;
705 /* drop the ref of last saved job */
706 if (data->pjd_last_job) {
707 job_putref(data->pjd_last_job);
708 data->pjd_last_pos = 0;
709 data->pjd_last_job = NULL;
/* non-forced: lprocfs_job_cleanup() decides whether a scan is due */
712 lprocfs_job_cleanup(data->pjd_stats, false);
715 return lprocfs_seq_release(inode, file);
718 static const struct proc_ops lprocfs_jobstats_seq_fops = {
719 PROC_OWNER(THIS_MODULE)
720 .proc_open = lprocfs_jobstats_seq_open,
721 .proc_read = seq_read,
722 .proc_write = lprocfs_jobstats_seq_write,
723 .proc_lseek = seq_lseek,
724 .proc_release = lprocfs_jobstats_seq_release,
/* Set up job stats for \a obd and register its "job_stats" proc entry.
 *
 * Only devices whose type name is LUSTRE_MDT_NAME or LUSTRE_OST_NAME are
 * accepted.  The jobid hash is created, the list, lock, counter count and
 * counter init callback are recorded, and the cleanup interval defaults to
 * 10 minutes (stored halved).  If registering the proc entry fails, the
 * state is torn down again with lprocfs_job_stats_fini().
 *
 * \param[in] obd	device to attach job stats to
 * \param[in] cntr_num	number of counters per job
 * \param[in] init_fn	callback that initialises a job's counters
 *
 * NOTE(review): the validation of cntr_num/init_fn, the remaining
 * cfs_hash_create() arguments, the check of "entry" and the return
 * statements are not visible in this excerpt.
 */
727 int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
728 cntr_init_callback init_fn)
730 struct proc_dir_entry *entry;
731 struct obd_job_stats *stats;
734 LASSERT(obd->obd_proc_entry != NULL);
735 LASSERT(obd->obd_type->typ_name);
743 /* Currently needs to be a target due to the use of obt_jobstats. */
744 if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) != 0 &&
745 strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) != 0) {
746 CERROR("%s: invalid device type %s for job stats: rc = %d\n",
747 obd->obd_name, obd->obd_type->typ_name, -EINVAL);
750 stats = &obd2obt(obd)->obt_jobstats;
/* initialising the same device twice would be a bug */
752 LASSERT(stats->ojs_hash == NULL);
753 stats->ojs_hash = cfs_hash_create("JOB_STATS",
754 HASH_JOB_STATS_CUR_BITS,
755 HASH_JOB_STATS_MAX_BITS,
756 HASH_JOB_STATS_BKT_BITS, 0,
761 if (stats->ojs_hash == NULL)
764 INIT_LIST_HEAD(&stats->ojs_list);
765 spin_lock_init(&stats->ojs_lock);
766 stats->ojs_cntr_num = cntr_num;
767 stats->ojs_cntr_init_fn = init_fn;
768 /* Store 1/2 the actual interval, since we use that the most, and
769 * it is easier to work with.
771 stats->ojs_cleanup_interval = ktime_set(600 / 2, 0); /* default 10 min*/
772 stats->ojs_cleanup_last = ktime_get_real();
/* "stats" is the entry's private data, read back via pde_data() on open */
774 entry = lprocfs_add_simple(obd->obd_proc_entry, "job_stats", stats,
775 &lprocfs_jobstats_seq_fops);
777 lprocfs_job_stats_fini(obd);
782 EXPORT_SYMBOL(lprocfs_job_stats_init);
783 #endif /* CONFIG_PROC_FS*/
/* sysfs show handler: print the job stats cleanup interval in seconds.
 *
 * ojs_cleanup_interval holds half of the configured interval, so the
 * seconds value is doubled before printing.
 * NOTE(review): the last parameter and the container_of() member argument
 * are not visible in this excerpt.
 */
785 ssize_t job_cleanup_interval_show(struct kobject *kobj, struct attribute *attr,
788 struct obd_device *obd = container_of(kobj, struct obd_device,
790 struct obd_job_stats *stats;
791 struct timespec64 ts;
793 stats = &obd2obt(obd)->obt_jobstats;
794 ts = ktime_to_timespec64(stats->ojs_cleanup_interval);
796 return scnprintf(buf, PAGE_SIZE, "%lld\n", (long long)ts.tv_sec * 2);
798 EXPORT_SYMBOL(job_cleanup_interval_show);
/* sysfs store handler: set the job stats cleanup interval from \a buffer.
 *
 * The value is parsed with kstrtouint() (base auto-detected) as seconds and
 * stored halved.  Because of the integer division, writing 1 stores 0, just
 * as writing 0 does, and lprocfs_job_cleanup() treats a zero interval as
 * "never clean up".  A non-forced cleanup is triggered afterwards.
 * NOTE(review): the declarations of "val"/"rc", the handling of the parse
 * result, the container_of() member argument and the return are not visible
 * in this excerpt.
 */
800 ssize_t job_cleanup_interval_store(struct kobject *kobj,
801 struct attribute *attr,
802 const char *buffer, size_t count)
804 struct obd_device *obd = container_of(kobj, struct obd_device,
806 struct obd_job_stats *stats;
810 stats = &obd2obt(obd)->obt_jobstats;
812 rc = kstrtouint(buffer, 0, &val);
816 stats->ojs_cleanup_interval = ktime_set(val / 2, 0);
817 lprocfs_job_cleanup(stats, false);
821 EXPORT_SYMBOL(job_cleanup_interval_store);