2 export blocking statistics in /proc/<pid>/task<tid>/sleep. Statistics
3 collection for given file is activated on the first read of
4 /proc/pid/sleep. When statistics collection is on on each context switch
5 current back-trace is built (through __builtin_return_address()). For each
6 monitored process there is a LRU list of such back-traces. Useful when trying
7 to understand where elapsed time is spent.
9 Signed-off-by: Nikita Danilov <nikita@clusterfs.com>
12 arch/i386/Kconfig.debug | 9 +
13 arch/i386/kernel/entry.S | 4
14 arch/i386/kernel/process.c | 52 +++++
15 arch/i386/mm/fault.c | 7
17 fs/proc/proc_misc.c | 7
18 include/linux/sched.h | 9 -
19 include/linux/sleep_info.h | 48 +++++
24 kernel/sleep_info.c | 392 +++++++++++++++++++++++++++++++++++++++++++++
25 13 files changed, 560 insertions(+), 15 deletions(-)
27 Index: linux-2.6.16.46-0.14/Makefile
28 ===================================================================
29 --- linux-2.6.16.46-0.14.orig/Makefile 2007-07-18 08:02:30.000000000 +0300
30 +++ linux-2.6.16.46-0.14/Makefile 2007-08-30 05:56:23.000000000 +0300
32 CFLAGS += -fomit-frame-pointer
35 +ifdef CONFIG_FRAME_POINTER_FORCE
36 +CFLAGS += -fno-omit-frame-pointer
39 ifdef CONFIG_DEBUG_INFO
42 Index: linux-2.6.16.46-0.14/arch/i386/Kconfig.debug
43 ===================================================================
44 --- linux-2.6.16.46-0.14.orig/arch/i386/Kconfig.debug 2007-07-18 08:02:28.000000000 +0300
45 +++ linux-2.6.16.46-0.14/arch/i386/Kconfig.debug 2007-08-30 05:56:23.000000000 +0300
47 say N then kdb can only be used from a PC (AT) keyboard or a serial
51 + bool "Export sleep statistics"
52 + depends on DEBUG_KERNEL && FRAME_POINTER
55 + If you say Y here, new file /proc/pid/stack will appear that contains
56 + call-traces where given process blocked most of the time. If unsure say
60 Index: linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug
61 ===================================================================
62 --- linux-2.6.16.46-0.14.orig/arch/x86_64/Kconfig.debug 2007-07-18 08:02:30.000000000 +0300
63 +++ linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug 2007-08-30 05:56:23.000000000 +0300
69 + bool "Compile the kernel with frame pointers"
71 + Compile the kernel with frame pointers. This may help for some
72 + debugging with external debuggers. Note the standard oops backtracer
73 + doesn't make use of this and the x86-64 kernel doesn't ensure a
74 + consistent frame pointer through inline assembly (semaphores etc.)
75 + Normally you should say N.
77 +config FRAME_POINTER_FORCE
78 + bool "Compile the kernel with frame pointers"
79 + depends on FRAME_POINTER
81 + Enforce passing -fno-omit-frame-pointer to the compiler.
84 depends on GART_IOMMU && DEBUG_KERNEL
85 bool "Enable IOMMU debugging"
87 #config X86_REMOTE_DEBUG
88 # bool "kgdb debugging stub"
91 + bool "Export sleep statistics"
92 + depends on DEBUG_KERNEL && FRAME_POINTER && FRAME_POINTER_FORCE
95 + If you say Y here, new file /proc/pid/stack will appear that contains
96 + call-traces where given process blocked most of the time. If unsure say
100 Index: linux-2.6.16.46-0.14/fs/proc/base.c
101 ===================================================================
102 --- linux-2.6.16.46-0.14.orig/fs/proc/base.c 2007-07-18 08:02:25.000000000 +0300
103 +++ linux-2.6.16.46-0.14/fs/proc/base.c 2007-08-30 06:15:24.000000000 +0300
109 +#ifdef CONFIG_PROC_SLEEP
112 /* Add new entries before this */
113 PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
116 #ifdef CONFIG_AUDITSYSCALL
117 E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
119 +#ifdef CONFIG_PROC_SLEEP
120 + E(PROC_TID_SLEEP, "sleep", S_IFREG|S_IRUGO),
125 @@ -1546,6 +1551,10 @@
129 +#ifdef CONFIG_PROC_SLEEP
130 +extern struct file_operations proc_sleep_operations;
134 static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
136 @@ -1885,6 +1894,11 @@
137 inode->i_fop = &proc_loginuid_operations;
140 +#ifdef CONFIG_PROC_SLEEP
141 + case PROC_TID_SLEEP:
142 + inode->i_fop = &proc_sleep_operations;
146 printk("procfs: impossible type (%d)",p->type);
148 Index: linux-2.6.16.46-0.14/fs/proc/proc_misc.c
149 ===================================================================
150 --- linux-2.6.16.46-0.14.orig/fs/proc/proc_misc.c 2007-07-18 08:02:25.000000000 +0300
151 +++ linux-2.6.16.46-0.14/fs/proc/proc_misc.c 2007-08-30 05:56:23.000000000 +0300
153 .release = seq_release,
156 +#ifdef CONFIG_PROC_SLEEP
157 +extern struct file_operations proc_global_sleep_operations;
158 +extern struct file_operations proc_global_stack_operations;
161 extern struct seq_operations vmstat_op;
162 static int vmstat_open(struct inode *inode, struct file *file)
166 entry->proc_fops = &proc_sysrq_trigger_operations;
168 +#ifdef CONFIG_PROC_SLEEP
169 + create_seq_entry("sleep", 0, &proc_global_sleep_operations);
170 + create_seq_entry("stacktrace", 0, &proc_global_stack_operations);
173 Index: linux-2.6.16.46-0.14/include/linux/sched.h
174 ===================================================================
175 --- linux-2.6.16.46-0.14.orig/include/linux/sched.h 2007-07-18 08:02:36.000000000 +0300
176 +++ linux-2.6.16.46-0.14/include/linux/sched.h 2007-08-30 06:02:43.000000000 +0300
178 #include <linux/topology.h>
179 #include <linux/seccomp.h>
180 #include <linux/rcupdate.h>
181 +#include <linux/sleep_info.h>
183 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
186 unsigned long ttwu_move_affine;
187 unsigned long ttwu_move_balance;
189 + /* where this task blocked */
190 + struct sleep_info sinfo;
193 extern void partition_sched_domains(cpumask_t *partition1,
194 Index: linux-2.6.16.46-0.14/include/linux/sleep_info.h
195 ===================================================================
196 --- linux-2.6.16.46-0.14.orig/include/linux/sleep_info.h 2006-06-16 16:07:58.000000000 +0300
197 +++ linux-2.6.16.46-0.14/include/linux/sleep_info.h 2007-08-30 05:56:23.000000000 +0300
199 +#ifndef _LINUX_SLEEP_INFO_H
200 +#define _LINUX_SLEEP_INFO_H
204 +#include <linux/config.h>
205 +#include <linux/list.h>
206 +#include <linux/stddef.h>
207 +#include <linux/spinlock.h>
209 +#ifdef CONFIG_PROC_SLEEP
211 +struct __sleep_info;
215 + struct __sleep_info *p;
216 + unsigned long long last_in;
219 +void init_sleep_info(struct task_struct *tsk);
220 +void free_sleep_info(struct sleep_info *sinfo);
222 +void sleep_in_hook(struct task_struct *tsk);
223 +void sleep_ex_hook(struct task_struct *tsk);
225 +void stacktrace_record(void);
227 +extern struct file_operations proc_sleep_operations;
229 +/* CONFIG_PROC_SLEEP */
232 +struct sleep_info {};
234 +#define init_sleep_info(tsk)
235 +#define free_sleep_info(sinfo)
237 +#define sleep_in_hook(tsk)
238 +#define sleep_ex_hook(tsk)
239 +#define stacktrace_record()
241 +/* CONFIG_PROC_SLEEP */
247 +/* _LINUX_SLEEP_INFO_H */
249 Index: linux-2.6.16.46-0.14/kernel/Makefile
250 ===================================================================
251 --- linux-2.6.16.46-0.14.orig/kernel/Makefile 2007-07-18 08:02:35.000000000 +0300
252 +++ linux-2.6.16.46-0.14/kernel/Makefile 2007-08-30 06:12:26.000000000 +0300
254 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
255 obj-$(CONFIG_PAGG) += pagg.o
256 obj-$(CONFIG_RELAY) += relay.o
257 +obj-$(CONFIG_PROC_SLEEP) += sleep_info.o
259 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
260 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
261 Index: linux-2.6.16.46-0.14/kernel/exit.c
262 ===================================================================
263 --- linux-2.6.16.46-0.14.orig/kernel/exit.c 2007-08-30 05:43:46.000000000 +0300
264 +++ linux-2.6.16.46-0.14/kernel/exit.c 2007-08-30 06:13:03.000000000 +0300
266 #include <linux/mutex.h>
267 #include <linux/pagg.h>
268 #include <linux/audit.h> /* for audit_free() */
269 +#include <linux/sleep_info.h>
271 #include <asm/uaccess.h>
272 #include <asm/unistd.h>
274 write_unlock_irq(&tasklist_lock);
275 spin_unlock(&p->proc_lock);
276 proc_pid_flush(proc_dentry);
277 + free_sleep_info(&p->sinfo);
281 Index: linux-2.6.16.46-0.14/kernel/fork.c
282 ===================================================================
283 --- linux-2.6.16.46-0.14.orig/kernel/fork.c 2007-07-18 08:02:35.000000000 +0300
284 +++ linux-2.6.16.46-0.14/kernel/fork.c 2007-08-30 06:13:36.000000000 +0300
286 #include <linux/pagg.h>
287 #include <linux/delayacct.h>
288 #include <linux/taskstats_kern.h>
289 +#include <linux/sleep_info.h>
291 #include <asm/pgtable.h>
292 #include <asm/pgalloc.h>
293 @@ -1222,6 +1223,8 @@
294 attach_pid(p, PIDTYPE_TGID, p->tgid);
295 attach_pid(p, PIDTYPE_PID, p->pid);
297 + init_sleep_info(p);
301 spin_unlock(¤t->sighand->siglock);
302 Index: linux-2.6.16.46-0.14/kernel/sched.c
303 ===================================================================
304 --- linux-2.6.16.46-0.14.orig/kernel/sched.c 2007-08-30 05:43:48.000000000 +0300
305 +++ linux-2.6.16.46-0.14/kernel/sched.c 2007-08-30 05:56:23.000000000 +0300
306 @@ -2934,6 +2934,8 @@
308 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
310 + sleep_in_hook(current);
315 @@ -3136,6 +3138,8 @@
317 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
320 + sleep_ex_hook(current);
323 EXPORT_SYMBOL(preempt_schedule);
324 Index: linux-2.6.16.46-0.14/kernel/sleep_info.c
325 ===================================================================
326 --- linux-2.6.16.46-0.14.orig/kernel/sleep_info.c 2006-06-16 16:07:58.000000000 +0300
327 +++ linux-2.6.16.46-0.14/kernel/sleep_info.c 2007-08-30 05:56:23.000000000 +0300
329 +#include <linux/config.h>
330 +#include <linux/sleep_info.h>
331 +#include <linux/seq_file.h>
332 +#include <linux/kallsyms.h>
333 +#include <linux/slab.h>
334 +#include <linux/sched.h>
335 +#include <linux/proc_fs.h>
336 +#include <linux/fs.h>
337 +#include <linux/module.h>
338 +#include <linux/hardirq.h>
340 +#include <asm/div64.h>
341 +#include <linux/errno.h>
343 +#ifdef CONFIG_PROC_SLEEP
345 +#define SLEEP_TRACES_DEF (32)
346 +#define GLOBAL_SLEEP_TRACES_DEF (512)
347 +#define SLEEP_TRACE_DEPTH (20)
348 +#define GLOBAL_STACK_TRACES_DEF (512)
350 +struct stack_trace {
352 + unsigned long long total;
353 + unsigned long long max;
354 + struct list_head lru;
355 + unsigned long hash;
357 + void *frame[SLEEP_TRACE_DEPTH];
360 +struct __sleep_info {
363 + struct list_head lru;
364 + struct stack_trace traces[0];
367 +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
369 + return p > (void *)tinfo &&
370 + p < (void *)tinfo + THREAD_SIZE - 3;
373 +static void fill_trace(struct stack_trace *trace)
376 + struct thread_info *tinfo = current_thread_info();
377 + unsigned long fp; /* frame pointer */
379 + /* Grab fp right from our regs */
380 +#if defined(CONFIG_X86_64) && defined(CONFIG_FRAME_POINTER_FORCE)
381 + asm ("movq %%rbp, %0" : "=r" (fp) : );
382 +#define FP_RETADDR_OFFSET (8)
383 +#elif defined(CONFIG_X86)
384 + asm ("movl %%ebp, %0" : "=r" (fp) : );
385 +#define FP_RETADDR_OFFSET (4)
387 +#error Unsupported platform.
390 + for (i = -3; /* skip three innermost frames */
391 + i < SLEEP_TRACE_DEPTH && valid_stack_ptr(tinfo, (void *)fp);
392 + ++i, fp = *(unsigned long *)fp) {
394 + trace->frame[i] = *(void **)(fp + FP_RETADDR_OFFSET);
396 + trace->depth = max(i, 0);
399 +static int stack_trace_eq(struct stack_trace *t1, struct stack_trace *t2)
402 + t1->hash == t2->hash && t1->depth == t2->depth &&
403 + !memcmp(t1->frame, t2->frame, t1->depth * sizeof t1->frame[0]);
406 +static unsigned long stack_trace_hash(struct stack_trace *trace)
411 + for (i = 0, hash = 0; i < trace->depth; ++ i)
412 + hash += ((unsigned long)trace->frame[i]) >> 3;
416 +/* from sleepometer by Andrew Morton */
417 +static unsigned long long grab_time(void)
419 +#if defined(CONFIG_X86) && !defined(CONFIG_X86_64)
421 + * do_gettimeofday() goes backwards sometimes :(. Usethe TSC
423 + unsigned long long ret;
424 + extern unsigned long cpu_khz;
427 + do_div(ret, cpu_khz / 1000);
430 + struct timeval now;
431 + unsigned long long ret;
433 + do_gettimeofday(&now);
436 + ret += now.tv_usec;
441 +static void zero_sleep_info(struct __sleep_info *info)
445 + INIT_LIST_HEAD(&info->lru);
446 + for (i = 0; i < info->nr_traces; ++ i) {
447 + struct stack_trace *trace;
449 + trace = &info->traces[i];
454 + list_add(&trace->lru, &info->lru);
455 + memset(trace->frame, 0, sizeof trace->frame);
459 +static int alloc_sleep_info(struct sleep_info *sinfo, int nr_traces)
461 + struct __sleep_info *result;
463 + result = kmalloc(sizeof *result + nr_traces * sizeof result->traces[0],
465 + if (result == NULL)
469 + sinfo->last_in = 0;
470 + result->nr_traces = nr_traces;
471 + spin_lock_init(&result->lock);
472 + zero_sleep_info(result);
476 +void init_sleep_info(struct task_struct *tsk)
478 + tsk->sinfo.p = NULL;
481 +void free_sleep_info(struct sleep_info *sinfo)
487 +void sleep_in_hook(struct task_struct *tsk)
489 + tsk->sinfo.last_in = grab_time();
492 +void update_sinfo(struct __sleep_info *sinfo, unsigned long long last_in)
494 + if (sinfo != NULL && last_in != 0) {
495 + unsigned long long delta;
496 + struct stack_trace trace;
497 + struct stack_trace *target;
500 + delta = grab_time() - last_in;
501 + fill_trace(&trace);
502 + target = NULL; /* to shut gcc up */
503 + trace.hash = stack_trace_hash(&trace);
504 + spin_lock(&sinfo->lock);
505 + for (i = 0; i < sinfo->nr_traces; ++ i) {
506 + target = &sinfo->traces[i];
507 + if (stack_trace_eq(&trace, target)) {
509 + target->total += delta;
510 + target->max = max(target->max, delta);
514 + if (i == sinfo->nr_traces) {
515 + target = container_of(sinfo->lru.prev,
516 + struct stack_trace, lru);
518 + target->total = target->max = delta;
519 + target->hash = trace.hash;
520 + target->depth = trace.depth;
521 + memcpy(target->frame, trace.frame, sizeof target->frame);
523 + list_move(&target->lru, &sinfo->lru);
524 + spin_unlock(&sinfo->lock);
528 +static struct sleep_info global_sinfo = {
532 +void sleep_ex_hook(struct task_struct *tsk)
534 + struct sleep_info *cur;
537 + update_sinfo(cur->p, cur->last_in);
538 + update_sinfo(global_sinfo.p, cur->last_in);
542 +static spinlock_t sleep_serializer = SPIN_LOCK_UNLOCKED;
544 +static void *sinfo_start(struct sleep_info *sinfo, int nr_traces, loff_t l)
546 + spin_lock(&sleep_serializer);
547 + if (sinfo->p == NULL)
548 + alloc_sleep_info(sinfo, nr_traces);
549 + spin_unlock(&sleep_serializer);
550 + if (sinfo->p == NULL)
553 + if (l >= sinfo->p->nr_traces)
556 + return &sinfo->p->traces[l];
559 +static void *sinfo_next(struct sleep_info *sinfo, void *v, loff_t *pos)
563 + if (*pos >= sinfo->p->nr_traces)
566 + return ((struct stack_trace *)v) + 1;
570 + * seq_file methods for /proc/pid/sleep. No locking is needed here, because we
571 + * are iterating over sinfo->traces[] array rather than over sinfo->lru
572 + * list. Actually spin locking is not allowed, because we can schedule between
573 + * sleep_start() and sleep_stop().
576 +static void *sleep_start(struct seq_file *m, loff_t *pos)
578 + struct task_struct *task;
581 + return sinfo_start(&task->sinfo, SLEEP_TRACES_DEF, *pos);
584 +static void sleep_stop(struct seq_file *m, void *v)
588 +static void *sleep_next(struct seq_file *m, void *v, loff_t *pos)
590 + struct task_struct *task;
593 + return sinfo_next(&task->sinfo, v, pos);
596 +static int show_sleep(struct seq_file *m, void *v)
598 + struct stack_trace *trace;
602 + if (trace->nr == 0)
605 + seq_printf(m, "\n%u %llu %llu",
606 + trace->nr, trace->total, trace->max);
607 + for (i = 0; i < trace->depth; ++ i) {
611 + unsigned long address;
612 + unsigned long offset;
613 + unsigned long size;
615 + address = (unsigned long) trace->frame[i];
616 + name = kallsyms_lookup(address, &size,
617 + &offset, &module, namebuf);
618 + seq_printf(m, "\n\t%i %#lx ", i, address);
620 + seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
622 + seq_printf(m, "\n");
626 +struct seq_operations proc_pid_sleep_op = {
627 + .start = sleep_start,
628 + .next = sleep_next,
629 + .stop = sleep_stop,
633 +static int sleep_open(struct inode *inode, struct file *file)
635 + struct task_struct *task = PROC_I(inode)->task;
636 + int ret = seq_open(file, &proc_pid_sleep_op);
638 + struct seq_file *m = file->private_data;
644 +static void reset_sleep_info(struct sleep_info *sinfo, int nr_traces)
646 + spin_lock(&sleep_serializer);
647 + if (sinfo->p == NULL)
648 + alloc_sleep_info(sinfo, nr_traces);
649 + if (sinfo->p != NULL)
650 + zero_sleep_info(sinfo->p);
651 + spin_unlock(&sleep_serializer);
654 +static ssize_t sleep_write(struct file *file, const char __user *buffer,
655 + size_t count, loff_t *ppos)
657 + struct task_struct *tsk = PROC_I(file->f_dentry->d_inode)->task;
659 + reset_sleep_info(&tsk->sinfo, SLEEP_TRACES_DEF);
663 +struct file_operations proc_sleep_operations = {
664 + .open = sleep_open,
666 + .llseek = seq_lseek,
667 + .release = seq_release,
668 + .write = sleep_write
671 +static void *global_sleep_start(struct seq_file *m, loff_t *pos)
673 + return sinfo_start(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF, *pos);
676 +static void *global_sleep_next(struct seq_file *m, void *v, loff_t *pos)
678 + return sinfo_next(&global_sinfo, v, pos);
682 +struct seq_operations global_sleep_op = {
683 + .start = global_sleep_start,
684 + .next = global_sleep_next,
685 + .stop = sleep_stop,
689 +static int global_sleep_open(struct inode *inode, struct file *file)
691 + return seq_open(file, &global_sleep_op);
694 +static ssize_t global_sleep_write(struct file *file, const char __user *buffer,
695 + size_t count, loff_t *ppos)
697 + reset_sleep_info(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF);
701 +struct file_operations proc_global_sleep_operations = {
702 + .open = global_sleep_open,
704 + .llseek = seq_lseek,
705 + .release = seq_release,
706 + .write = global_sleep_write
709 +static struct sleep_info stack_sinfo = {
713 +static void *global_stack_start(struct seq_file *m, loff_t *pos)
715 + return sinfo_start(&stack_sinfo, GLOBAL_STACK_TRACES_DEF, *pos);
718 +static void *global_stack_next(struct seq_file *m, void *v, loff_t *pos)
720 + return sinfo_next(&stack_sinfo, v, pos);
724 +struct seq_operations global_stack_op = {
725 + .start = global_stack_start,
726 + .next = global_stack_next,
727 + .stop = sleep_stop,
731 +static int global_stack_open(struct inode *inode, struct file *file)
733 + return seq_open(file, &global_stack_op);
736 +static ssize_t global_stack_write(struct file *file, const char __user *buffer,
737 + size_t count, loff_t *ppos)
739 + reset_sleep_info(&stack_sinfo, GLOBAL_STACK_TRACES_DEF);
743 +struct file_operations proc_global_stack_operations = {
744 + .open = global_stack_open,
746 + .llseek = seq_lseek,
747 + .release = seq_release,
748 + .write = global_stack_write
751 +void stacktrace_record(void)
753 + if (!in_interrupt())
754 + update_sinfo(stack_sinfo.p, 1);
756 +EXPORT_SYMBOL(stacktrace_record);
758 +/* CONFIG_PROC_SLEEP */