2 export blocking statistics in /proc/<pid>/task<tid>/sleep. Statistics
3 collection for given file is activated on the first read of
4 /proc/pid/sleep. When statistics collection is on, on each context switch
5 current back-trace is built (by walking the frame-pointer chain). For each
6 monitored process there is an LRU list of such back-traces. Useful when trying
7 to understand where elapsed time is spent.
9 Signed-off-by: Nikita Danilov <nikita@clusterfs.com>
12 arch/i386/Kconfig.debug | 9 +
13 arch/i386/kernel/entry.S | 4
14 arch/i386/kernel/process.c | 52 +++++
15 arch/i386/mm/fault.c | 7
17 fs/proc/proc_misc.c | 7
18 include/linux/sched.h | 9 -
19 include/linux/sleep_info.h | 48 +++++
24 kernel/sleep_info.c | 392 +++++++++++++++++++++++++++++++++++++++++++++
25 13 files changed, 560 insertions(+), 15 deletions(-)
28 ===================================================================
29 --- linux.orig/Makefile
31 @@ -490,6 +490,10 @@ ifndef CONFIG_FRAME_POINTER
32 CFLAGS += -fomit-frame-pointer
35 +ifdef CONFIG_FRAME_POINTER_FORCE
36 +CFLAGS += -fno-omit-frame-pointer
39 ifdef CONFIG_DEBUG_INFO
42 Index: linux/arch/i386/Kconfig.debug
43 ===================================================================
44 --- linux.orig/arch/i386/Kconfig.debug
45 +++ linux/arch/i386/Kconfig.debug
46 @@ -79,4 +79,13 @@ config X86_MPPARSE
48 source "arch/i386/Kconfig.kgdb"
51 + bool "Export sleep statistics"
52 + depends on DEBUG_KERNEL && FRAME_POINTER
55 + If you say Y here, new file /proc/pid/sleep will appear that contains
56 + call-traces where given process blocked most of the time. If unsure say
60 Index: linux/arch/x86_64/Kconfig.debug
61 ===================================================================
62 --- linux.orig/arch/x86_64/Kconfig.debug
63 +++ linux/arch/x86_64/Kconfig.debug
64 @@ -30,6 +30,21 @@ config SCHEDSTATS
65 application, you can say N to avoid the very slight overhead
69 + bool "Compile the kernel with frame pointers"
71 + Compile the kernel with frame pointers. This may help for some
72 + debugging with external debuggers. Note the standard oops backtracer
73 + doesn't make use of this and the x86-64 kernel doesn't ensure a
74 + consistent frame pointer through inline assembly (semaphores etc.)
75 + Normally you should say N.
77 +config FRAME_POINTER_FORCE
78 + bool "Force frame pointers (-fno-omit-frame-pointer)"
79 + depends on FRAME_POINTER
81 + Enforce passing -fno-omit-frame-pointer to the compiler.
84 depends on GART_IOMMU && DEBUG_KERNEL
85 bool "Enable IOMMU debugging"
86 @@ -66,4 +81,13 @@ config IOMMU_LEAK
87 #config X86_REMOTE_DEBUG
88 # bool "kgdb debugging stub"
91 + bool "Export sleep statistics"
92 + depends on DEBUG_KERNEL && FRAME_POINTER && FRAME_POINTER_FORCE
95 + If you say Y here, new file /proc/pid/sleep will appear that contains
96 + call-traces where given process blocked most of the time. If unsure say
100 Index: linux/fs/proc/base.c
101 ===================================================================
102 --- linux.orig/fs/proc/base.c
103 +++ linux/fs/proc/base.c
104 @@ -103,6 +103,9 @@ enum pid_directory_inos {
105 #ifdef CONFIG_AUDITSYSCALL
108 +#ifdef CONFIG_PROC_SLEEP
111 PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
114 @@ -170,6 +173,9 @@ static struct pid_entry tid_base_stuff[]
115 #ifdef CONFIG_AUDITSYSCALL
116 E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
118 +#ifdef CONFIG_PROC_SLEEP
119 + E(PROC_TID_SLEEP, "sleep", S_IFREG|S_IRUGO),
124 @@ -1173,6 +1179,10 @@ out:
128 +#ifdef CONFIG_PROC_SLEEP
129 +extern struct file_operations proc_sleep_operations;
133 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
135 @@ -1463,6 +1473,11 @@ static struct dentry *proc_pident_lookup
136 inode->i_fop = &proc_loginuid_operations;
139 +#ifdef CONFIG_PROC_SLEEP
140 + case PROC_TID_SLEEP:
141 + inode->i_fop = &proc_sleep_operations;
145 printk("procfs: impossible type (%d)",p->type);
147 Index: linux/fs/proc/proc_misc.c
148 ===================================================================
149 --- linux.orig/fs/proc/proc_misc.c
150 +++ linux/fs/proc/proc_misc.c
151 @@ -270,6 +270,11 @@ static struct file_operations proc_cpuin
152 .release = seq_release,
155 +#ifdef CONFIG_PROC_SLEEP
156 +extern struct file_operations proc_global_sleep_operations;
157 +extern struct file_operations proc_global_stack_operations;
160 extern struct seq_operations vmstat_op;
161 static int vmstat_open(struct inode *inode, struct file *file)
163 @@ -641,4 +646,8 @@ void __init proc_misc_init(void)
164 entry->proc_fops = &ppc_htab_operations;
167 +#ifdef CONFIG_PROC_SLEEP
168 + create_seq_entry("sleep", 0, &proc_global_sleep_operations);
169 + create_seq_entry("stacktrace", 0, &proc_global_stack_operations);
172 Index: linux/include/linux/sched.h
173 ===================================================================
174 --- linux.orig/include/linux/sched.h
175 +++ linux/include/linux/sched.h
177 #include <linux/completion.h>
178 #include <linux/pid.h>
179 #include <linux/percpu.h>
180 +#include <linux/sleep_info.h>
183 extern int exec_shield;
184 @@ -646,6 +647,8 @@ struct task_struct {
185 /* process tracking callback */
186 struct list_head ptrack_list;
188 + /* where this task blocked */
189 + struct sleep_info sinfo;
192 static inline pid_t process_group(struct task_struct *tsk)
193 Index: linux/include/linux/sleep_info.h
194 ===================================================================
195 --- linux.orig/include/linux/sleep_info.h
196 +++ linux/include/linux/sleep_info.h
198 +#ifndef _LINUX_SLEEP_INFO_H
199 +#define _LINUX_SLEEP_INFO_H
203 +#include <linux/config.h>
204 +#include <linux/list.h>
205 +#include <linux/stddef.h>
206 +#include <linux/spinlock.h>
208 +#ifdef CONFIG_PROC_SLEEP
210 +struct __sleep_info;
214 + struct __sleep_info *p;
215 + unsigned long long last_in;
218 +void init_sleep_info(struct task_struct *tsk);
219 +void free_sleep_info(struct sleep_info *sinfo);
221 +void sleep_in_hook(struct task_struct *tsk);
222 +void sleep_ex_hook(struct task_struct *tsk);
224 +void stacktrace_record(void);
226 +extern struct file_operations proc_sleep_operations;
228 +/* CONFIG_PROC_SLEEP */
231 +struct sleep_info {};
233 +#define init_sleep_info(tsk)
234 +#define free_sleep_info(sinfo)
236 +#define sleep_in_hook(tsk)
237 +#define sleep_ex_hook(tsk)
238 +#define stacktrace_record()
240 +/* CONFIG_PROC_SLEEP */
246 +/* _LINUX_SLEEP_INFO_H */
248 Index: linux/kernel/Makefile
249 ===================================================================
250 --- linux.orig/kernel/Makefile
251 +++ linux/kernel/Makefile
252 @@ -27,6 +27,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
253 obj-$(CONFIG_AUDITFILESYSTEM) += auditfs.o
254 obj-$(CONFIG_KPROBES) += kprobes.o
255 obj-$(CONFIG_PTRACK) += ptrack.o
256 +obj-$(CONFIG_PROC_SLEEP) += sleep_info.o
258 ifneq ($(CONFIG_IA64),y)
259 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
260 Index: linux/kernel/exit.c
261 ===================================================================
262 --- linux.orig/kernel/exit.c
263 +++ linux/kernel/exit.c
265 #include <linux/mount.h>
266 #include <linux/proc_fs.h>
267 #include <linux/mempolicy.h>
268 +#include <linux/sleep_info.h>
270 #include <asm/uaccess.h>
271 #include <asm/unistd.h>
272 @@ -96,6 +97,7 @@ repeat:
273 write_unlock_irq(&tasklist_lock);
274 spin_unlock(&p->proc_lock);
275 proc_pid_flush(proc_dentry);
276 + free_sleep_info(&p->sinfo);
280 Index: linux/kernel/fork.c
281 ===================================================================
282 --- linux.orig/kernel/fork.c
283 +++ linux/kernel/fork.c
285 #include <linux/audit.h>
286 #include <linux/profile.h>
287 #include <linux/rmap.h>
288 +#include <linux/sleep_info.h>
290 #include <asm/pgtable.h>
291 #include <asm/pgalloc.h>
292 @@ -1172,6 +1173,8 @@ static task_t *copy_process(unsigned lon
293 if (!current->signal->tty && p->signal->tty)
294 p->signal->tty = NULL;
296 + init_sleep_info(p);
299 write_unlock_irq(&tasklist_lock);
301 Index: linux/kernel/sched.c
302 ===================================================================
303 --- linux.orig/kernel/sched.c
304 +++ linux/kernel/sched.c
305 @@ -2656,6 +2656,8 @@ asmlinkage void __sched schedule(void)
309 + sleep_in_hook(current);
314 @@ -2797,6 +2799,8 @@ switch_tasks:
315 preempt_enable_no_resched();
316 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
319 + sleep_ex_hook(current);
322 EXPORT_SYMBOL(schedule);
323 Index: linux/kernel/sleep_info.c
324 ===================================================================
325 --- linux.orig/kernel/sleep_info.c
326 +++ linux/kernel/sleep_info.c
328 +#include <linux/config.h>
329 +#include <linux/sleep_info.h>
330 +#include <linux/seq_file.h>
331 +#include <linux/kallsyms.h>
332 +#include <linux/slab.h>
333 +#include <linux/sched.h>
334 +#include <linux/proc_fs.h>
335 +#include <linux/fs.h>
336 +#include <linux/module.h>
337 +#include <linux/hardirq.h>
339 +#include <asm/div64.h>
340 +#include <linux/errno.h>
342 +#ifdef CONFIG_PROC_SLEEP
344 +#define SLEEP_TRACES_DEF (32)
345 +#define GLOBAL_SLEEP_TRACES_DEF (512)
346 +#define SLEEP_TRACE_DEPTH (20)
347 +#define GLOBAL_STACK_TRACES_DEF (512)
349 +struct stack_trace {
351 + unsigned long long total;
352 + unsigned long long max;
353 + struct list_head lru;
354 + unsigned long hash;
356 + void *frame[SLEEP_TRACE_DEPTH];
359 +struct __sleep_info {
362 + struct list_head lru;
363 + struct stack_trace traces[0];
366 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
368 + return p > (void *)tinfo &&
369 + p < (void *)tinfo + THREAD_SIZE - 3;
372 +static void fill_trace(struct stack_trace *trace)
375 + struct thread_info *tinfo = current_thread_info();
376 + unsigned long fp; /* frame pointer */
378 + /* Grab fp right from our regs */
379 +#if defined(CONFIG_X86_64) && defined(CONFIG_FRAME_POINTER_FORCE)
380 + asm ("movq %%rbp, %0" : "=r" (fp) : );
381 +#define FP_RETADDR_OFFSET (8)
382 +#elif defined(CONFIG_X86)
383 + asm ("movl %%ebp, %0" : "=r" (fp) : );
384 +#define FP_RETADDR_OFFSET (4)
386 +#error Unsupported platform.
389 + for (i = -3; /* skip three innermost frames */
390 + i < SLEEP_TRACE_DEPTH && valid_stack_ptr(tinfo, (void *)fp);
391 + ++i, fp = *(unsigned long *)fp) {
393 + trace->frame[i] = *(void **)(fp + FP_RETADDR_OFFSET);
395 + trace->depth = max(i, 0);
398 +static int stack_trace_eq(struct stack_trace *t1, struct stack_trace *t2)
401 + t1->hash == t2->hash && t1->depth == t2->depth &&
402 + !memcmp(t1->frame, t2->frame, t1->depth * sizeof t1->frame[0]);
405 +static unsigned long stack_trace_hash(struct stack_trace *trace)
410 + for (i = 0, hash = 0; i < trace->depth; ++ i)
411 + hash += ((unsigned long)trace->frame[i]) >> 3;
415 +/* from sleepometer by Andrew Morton */
416 +static unsigned long long grab_time(void)
418 +#if defined(CONFIG_X86) && !defined(CONFIG_X86_64)
420 + * do_gettimeofday() goes backwards sometimes :(. Use the TSC
422 + unsigned long long ret;
423 + extern unsigned long cpu_khz;
426 + do_div(ret, cpu_khz / 1000);
429 + struct timeval now;
430 + unsigned long long ret;
432 + do_gettimeofday(&now);
435 + ret += now.tv_usec;
440 +static void zero_sleep_info(struct __sleep_info *info)
444 + INIT_LIST_HEAD(&info->lru);
445 + for (i = 0; i < info->nr_traces; ++ i) {
446 + struct stack_trace *trace;
448 + trace = &info->traces[i];
453 + list_add(&trace->lru, &info->lru);
454 + memset(trace->frame, 0, sizeof trace->frame);
458 +static int alloc_sleep_info(struct sleep_info *sinfo, int nr_traces)
460 + struct __sleep_info *result;
462 + result = kmalloc(sizeof *result + nr_traces * sizeof result->traces[0],
464 + if (result == NULL)
468 + sinfo->last_in = 0;
469 + result->nr_traces = nr_traces;
470 + spin_lock_init(&result->lock);
471 + zero_sleep_info(result);
475 +void init_sleep_info(struct task_struct *tsk)
477 + tsk->sinfo.p = NULL;
480 +void free_sleep_info(struct sleep_info *sinfo)
486 +void sleep_in_hook(struct task_struct *tsk)
488 + tsk->sinfo.last_in = grab_time();
491 +void update_sinfo(struct __sleep_info *sinfo, unsigned long long last_in)
493 + if (sinfo != NULL && last_in != 0) {
494 + unsigned long long delta;
495 + struct stack_trace trace;
496 + struct stack_trace *target;
499 + delta = grab_time() - last_in;
500 + fill_trace(&trace);
501 + target = NULL; /* to shut gcc up */
502 + trace.hash = stack_trace_hash(&trace);
503 + spin_lock(&sinfo->lock);
504 + for (i = 0; i < sinfo->nr_traces; ++ i) {
505 + target = &sinfo->traces[i];
506 + if (stack_trace_eq(&trace, target)) {
508 + target->total += delta;
509 + target->max = max(target->max, delta);
513 + if (i == sinfo->nr_traces) {
514 + target = container_of(sinfo->lru.prev,
515 + struct stack_trace, lru);
517 + target->total = target->max = delta;
518 + target->hash = trace.hash;
519 + target->depth = trace.depth;
520 + memcpy(target->frame, trace.frame, sizeof target->frame);
522 + list_move(&target->lru, &sinfo->lru);
523 + spin_unlock(&sinfo->lock);
527 +static struct sleep_info global_sinfo = {
531 +void sleep_ex_hook(struct task_struct *tsk)
533 + struct sleep_info *cur;
536 + update_sinfo(cur->p, cur->last_in);
537 + update_sinfo(global_sinfo.p, cur->last_in);
541 +static spinlock_t sleep_serializer = SPIN_LOCK_UNLOCKED;
543 +static void *sinfo_start(struct sleep_info *sinfo, int nr_traces, loff_t l)
545 + spin_lock(&sleep_serializer);
546 + if (sinfo->p == NULL)
547 + alloc_sleep_info(sinfo, nr_traces);
548 + spin_unlock(&sleep_serializer);
549 + if (sinfo->p == NULL)
552 + if (l >= sinfo->p->nr_traces)
555 + return &sinfo->p->traces[l];
558 +static void *sinfo_next(struct sleep_info *sinfo, void *v, loff_t *pos)
562 + if (*pos >= sinfo->p->nr_traces)
565 + return ((struct stack_trace *)v) + 1;
569 + * seq_file methods for /proc/pid/sleep. No locking is needed here, because we
570 + * are iterating over sinfo->traces[] array rather than over sinfo->lru
571 + * list. Actually spin locking is not allowed, because we can schedule between
572 + * sleep_start() and sleep_stop().
575 +static void *sleep_start(struct seq_file *m, loff_t *pos)
577 + struct task_struct *task;
580 + return sinfo_start(&task->sinfo, SLEEP_TRACES_DEF, *pos);
583 +static void sleep_stop(struct seq_file *m, void *v)
587 +static void *sleep_next(struct seq_file *m, void *v, loff_t *pos)
589 + struct task_struct *task;
592 + return sinfo_next(&task->sinfo, v, pos);
595 +static int show_sleep(struct seq_file *m, void *v)
597 + struct stack_trace *trace;
601 + if (trace->nr == 0)
604 + seq_printf(m, "\n%u %llu %llu",
605 + trace->nr, trace->total, trace->max);
606 + for (i = 0; i < trace->depth; ++ i) {
610 + unsigned long address;
611 + unsigned long offset;
612 + unsigned long size;
614 + address = (unsigned long) trace->frame[i];
615 + name = kallsyms_lookup(address, &size,
616 + &offset, &module, namebuf);
617 + seq_printf(m, "\n\t%i %#lx ", i, address);
619 + seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
621 + seq_printf(m, "\n");
625 +struct seq_operations proc_pid_sleep_op = {
626 + .start = sleep_start,
627 + .next = sleep_next,
628 + .stop = sleep_stop,
632 +static int sleep_open(struct inode *inode, struct file *file)
634 + struct task_struct *task = PROC_I(inode)->task;
635 + int ret = seq_open(file, &proc_pid_sleep_op);
637 + struct seq_file *m = file->private_data;
643 +static void reset_sleep_info(struct sleep_info *sinfo, int nr_traces)
645 + spin_lock(&sleep_serializer);
646 + if (sinfo->p == NULL)
647 + alloc_sleep_info(sinfo, nr_traces);
648 + if (sinfo->p != NULL)
649 + zero_sleep_info(sinfo->p);
650 + spin_unlock(&sleep_serializer);
653 +static ssize_t sleep_write(struct file *file, const char __user *buffer,
654 + size_t count, loff_t *ppos)
656 + struct task_struct *tsk = PROC_I(file->f_dentry->d_inode)->task;
658 + reset_sleep_info(&tsk->sinfo, SLEEP_TRACES_DEF);
662 +struct file_operations proc_sleep_operations = {
663 + .open = sleep_open,
665 + .llseek = seq_lseek,
666 + .release = seq_release,
667 + .write = sleep_write
670 +static void *global_sleep_start(struct seq_file *m, loff_t *pos)
672 + return sinfo_start(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF, *pos);
675 +static void *global_sleep_next(struct seq_file *m, void *v, loff_t *pos)
677 + return sinfo_next(&global_sinfo, v, pos);
681 +struct seq_operations global_sleep_op = {
682 + .start = global_sleep_start,
683 + .next = global_sleep_next,
684 + .stop = sleep_stop,
688 +static int global_sleep_open(struct inode *inode, struct file *file)
690 + return seq_open(file, &global_sleep_op);
693 +static ssize_t global_sleep_write(struct file *file, const char __user *buffer,
694 + size_t count, loff_t *ppos)
696 + reset_sleep_info(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF);
700 +struct file_operations proc_global_sleep_operations = {
701 + .open = global_sleep_open,
703 + .llseek = seq_lseek,
704 + .release = seq_release,
705 + .write = global_sleep_write
708 +static struct sleep_info stack_sinfo = {
712 +static void *global_stack_start(struct seq_file *m, loff_t *pos)
714 + return sinfo_start(&stack_sinfo, GLOBAL_STACK_TRACES_DEF, *pos);
717 +static void *global_stack_next(struct seq_file *m, void *v, loff_t *pos)
719 + return sinfo_next(&stack_sinfo, v, pos);
723 +struct seq_operations global_stack_op = {
724 + .start = global_stack_start,
725 + .next = global_stack_next,
726 + .stop = sleep_stop,
730 +static int global_stack_open(struct inode *inode, struct file *file)
732 + return seq_open(file, &global_stack_op);
735 +static ssize_t global_stack_write(struct file *file, const char __user *buffer,
736 + size_t count, loff_t *ppos)
738 + reset_sleep_info(&stack_sinfo, GLOBAL_STACK_TRACES_DEF);
742 +struct file_operations proc_global_stack_operations = {
743 + .open = global_stack_open,
745 + .llseek = seq_lseek,
746 + .release = seq_release,
747 + .write = global_stack_write
750 +void stacktrace_record(void)
752 + if (!in_interrupt())
753 + update_sinfo(stack_sinfo.p, 1);
755 +EXPORT_SYMBOL(stacktrace_record);
757 +/* CONFIG_PROC_SLEEP */