export blocking statistics in /proc//task/sleep. Statistics collection for given file is activated on the first read of /proc/pid/sleep. When statistics collection is on on each context switch current back-trace is built (through __builtin_return_address()). For each monitored process there is a LRU list of such back-traces. Useful when trying to understand where elapsed time is spent. Signed-off-by: Nikita Danilov arch/i386/Kconfig.debug | 9 + arch/i386/kernel/entry.S | 4 arch/i386/kernel/process.c | 52 +++++ arch/i386/mm/fault.c | 7 fs/proc/base.c | 15 + fs/proc/proc_misc.c | 7 include/linux/sched.h | 9 - include/linux/sleep_info.h | 48 +++++ kernel/Makefile | 1 kernel/exit.c | 10 - kernel/fork.c | 17 + kernel/sched.c | 4 kernel/sleep_info.c | 392 +++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 560 insertions(+), 15 deletions(-) Index: linux-2.6.16.46-0.14/Makefile =================================================================== --- linux-2.6.16.46-0.14.orig/Makefile 2007-07-18 08:02:30.000000000 +0300 +++ linux-2.6.16.46-0.14/Makefile 2007-08-30 05:56:23.000000000 +0300 @@ -493,6 +493,10 @@ CFLAGS += -fomit-frame-pointer endif +ifdef CONFIG_FRAME_POINTER_FORCE +CFLAGS += -fno-omit-frame-pointer +endif + ifdef CONFIG_DEBUG_INFO CFLAGS += -g endif Index: linux-2.6.16.46-0.14/arch/i386/Kconfig.debug =================================================================== --- linux-2.6.16.46-0.14.orig/arch/i386/Kconfig.debug 2007-07-18 08:02:28.000000000 +0300 +++ linux-2.6.16.46-0.14/arch/i386/Kconfig.debug 2007-08-30 05:56:23.000000000 +0300 @@ -190,4 +190,13 @@ say N then kdb can only be used from a PC (AT) keyboard or a serial console. +config PROC_SLEEP + bool "Export sleep statistics" + depends on DEBUG_KERNEL && FRAME_POINTER + default n + help + If you say Y here, new file /proc/pid/stack will appear that contains + call-traces where given process blocked most of the time. If unsure say + N. + endmenu Index: linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug =================================================================== --- linux-2.6.16.46-0.14.orig/arch/x86_64/Kconfig.debug 2007-07-18 08:02:30.000000000 +0300 +++ linux-2.6.16.46-0.14/arch/x86_64/Kconfig.debug 2007-08-30 05:56:23.000000000 +0300 @@ -122,6 +122,21 @@ setting to 2. +config FRAME_POINTER + bool "Compile the kernel with frame pointers" + help + Compile the kernel with frame pointers. This may help for some + debugging with external debuggers. Note the standard oops backtracer + doesn't make use of this and the x86-64 kernel doesn't ensure a + consistent frame pointer through inline assembly (semaphores etc.) + Normally you should say N. + +config FRAME_POINTER_FORCE + bool "Compile the kernel with frame pointers" + depends on FRAME_POINTER + help + Enforce passing -fno-omit-frame-pointer to the compiler. + config IOMMU_DEBUG depends on GART_IOMMU && DEBUG_KERNEL bool "Enable IOMMU debugging" @@ -148,4 +163,13 @@ #config X86_REMOTE_DEBUG # bool "kgdb debugging stub" +config PROC_SLEEP + bool "Export sleep statistics" + depends on DEBUG_KERNEL && FRAME_POINTER && FRAME_POINTER_FORCE + default n + help + If you say Y here, new file /proc/pid/stack will appear that contains + call-traces where given process blocked most of the time. If unsure say + N. + endmenu Index: linux-2.6.16.46-0.14/fs/proc/base.c =================================================================== --- linux-2.6.16.46-0.14.orig/fs/proc/base.c 2007-07-18 08:02:25.000000000 +0300 +++ linux-2.6.16.46-0.14/fs/proc/base.c 2007-08-30 06:15:24.000000000 +0300 @@ -167,7 +167,9 @@ #endif PROC_TID_OOM_SCORE, PROC_TID_OOM_ADJUST, - +#ifdef CONFIG_PROC_SLEEP + PROC_TID_SLEEP, +#endif /* Add new entries before this */ PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; @@ -267,6 +269,9 @@ #ifdef CONFIG_AUDITSYSCALL E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), #endif +#ifdef CONFIG_PROC_SLEEP + E(PROC_TID_SLEEP, "sleep", S_IFREG|S_IRUGO), +#endif {0,0,NULL,0} }; @@ -1546,6 +1551,10 @@ return ~0U; } +#ifdef CONFIG_PROC_SLEEP +extern struct file_operations proc_sleep_operations; +#endif + /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { @@ -1885,6 +1894,11 @@ inode->i_fop = &proc_loginuid_operations; break; #endif +#ifdef CONFIG_PROC_SLEEP + case PROC_TID_SLEEP: + inode->i_fop = &proc_sleep_operations; + break; +#endif default: printk("procfs: impossible type (%d)",p->type); iput(inode); Index: linux-2.6.16.46-0.14/fs/proc/proc_misc.c =================================================================== --- linux-2.6.16.46-0.14.orig/fs/proc/proc_misc.c 2007-07-18 08:02:25.000000000 +0300 +++ linux-2.6.16.46-0.14/fs/proc/proc_misc.c 2007-08-30 05:56:23.000000000 +0300 @@ -414,6 +414,11 @@ .release = seq_release, }; +#ifdef CONFIG_PROC_SLEEP +extern struct file_operations proc_global_sleep_operations; +extern struct file_operations proc_global_stack_operations; +#endif + extern struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { @@ -784,4 +789,8 @@ if (entry) entry->proc_fops = &proc_sysrq_trigger_operations; #endif +#ifdef CONFIG_PROC_SLEEP + create_seq_entry("sleep", 0, &proc_global_sleep_operations); + create_seq_entry("stacktrace", 0, &proc_global_stack_operations); +#endif } Index: linux-2.6.16.46-0.14/include/linux/sched.h =================================================================== --- linux-2.6.16.46-0.14.orig/include/linux/sched.h 2007-07-18 08:02:36.000000000 +0300 +++ linux-2.6.16.46-0.14/include/linux/sched.h 2007-08-30 06:02:43.000000000 +0300 @@ -35,6 +35,7 @@ #include #include #include +#include #include /* For AT_VECTOR_SIZE */ @@ -679,6 +680,8 @@ unsigned long ttwu_move_affine; unsigned long ttwu_move_balance; #endif + /* where this task blocked */ + struct sleep_info sinfo; }; extern void partition_sched_domains(cpumask_t *partition1, Index: linux-2.6.16.46-0.14/include/linux/sleep_info.h =================================================================== --- linux-2.6.16.46-0.14.orig/include/linux/sleep_info.h 2006-06-16 16:07:58.000000000 +0300 +++ linux-2.6.16.46-0.14/include/linux/sleep_info.h 2007-08-30 05:56:23.000000000 +0300 @@ -0,0 +1,50 @@ +#ifndef _LINUX_SLEEP_INFO_H +#define _LINUX_SLEEP_INFO_H + +#ifdef __KERNEL__ + +#include +#include +#include +#include + +#ifdef CONFIG_PROC_SLEEP + +struct __sleep_info; +struct task_struct; + +struct sleep_info { + struct __sleep_info *p; + unsigned long long last_in; +}; + +void init_sleep_info(struct task_struct *tsk); +void free_sleep_info(struct sleep_info *sinfo); + +void sleep_in_hook(struct task_struct *tsk); +void sleep_ex_hook(struct task_struct *tsk); + +void stacktrace_record(void); + +extern struct file_operations proc_sleep_operations; + +/* CONFIG_PROC_SLEEP */ +#else + +struct sleep_info {}; + +#define init_sleep_info(tsk) +#define free_sleep_info(sinfo) + +#define sleep_in_hook(tsk) +#define sleep_ex_hook(tsk) +#define stacktrace_record() + +/* CONFIG_PROC_SLEEP */ +#endif + +/* __KERNEL__ */ +#endif + +/* _LINUX_SLEEP_INFO_H */ +#endif Index: linux-2.6.16.46-0.14/kernel/Makefile =================================================================== --- linux-2.6.16.46-0.14.orig/kernel/Makefile 2007-07-18 08:02:35.000000000 +0300 +++ linux-2.6.16.46-0.14/kernel/Makefile 2007-08-30 06:12:26.000000000 +0300 @@ -38,6 +38,7 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_PAGG) += pagg.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_PROC_SLEEP) += sleep_info.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is Index: linux-2.6.16.46-0.14/kernel/exit.c =================================================================== --- linux-2.6.16.46-0.14.orig/kernel/exit.c 2007-08-30 05:43:46.000000000 +0300 +++ linux-2.6.16.46-0.14/kernel/exit.c 2007-08-30 06:13:03.000000000 +0300 @@ -39,6 +39,7 @@ #include #include #include /* for audit_free() */ +#include #include #include @@ -114,6 +115,7 @@ write_unlock_irq(&tasklist_lock); spin_unlock(&p->proc_lock); proc_pid_flush(proc_dentry); + free_sleep_info(&p->sinfo); release_thread(p); put_task_struct(p); Index: linux-2.6.16.46-0.14/kernel/fork.c =================================================================== --- linux-2.6.16.46-0.14.orig/kernel/fork.c 2007-07-18 08:02:35.000000000 +0300 +++ linux-2.6.16.46-0.14/kernel/fork.c 2007-08-30 06:13:36.000000000 +0300 @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -1222,6 +1223,8 @@ attach_pid(p, PIDTYPE_TGID, p->tgid); attach_pid(p, PIDTYPE_PID, p->pid); + init_sleep_info(p); + nr_threads++; total_forks++; spin_unlock(¤t->sighand->siglock); Index: linux-2.6.16.46-0.14/kernel/sched.c =================================================================== --- linux-2.6.16.46-0.14.orig/kernel/sched.c 2007-08-30 05:43:48.000000000 +0300 +++ linux-2.6.16.46-0.14/kernel/sched.c 2007-08-30 05:56:23.000000000 +0300 @@ -2934,6 +2934,8 @@ } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + sleep_in_hook(current); + need_resched: preempt_disable(); prev = current; @@ -3136,6 +3138,8 @@ barrier(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; + + sleep_ex_hook(current); } EXPORT_SYMBOL(preempt_schedule); Index: linux-2.6.16.46-0.14/kernel/sleep_info.c =================================================================== --- linux-2.6.16.46-0.14.orig/kernel/sleep_info.c 2006-06-16 16:07:58.000000000 +0300 +++ linux-2.6.16.46-0.14/kernel/sleep_info.c 2007-08-30 05:56:23.000000000 +0300 @@ -0,0 +1,431 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_PROC_SLEEP + +#define SLEEP_TRACES_DEF (32) +#define GLOBAL_SLEEP_TRACES_DEF (512) +#define SLEEP_TRACE_DEPTH (20) +#define GLOBAL_STACK_TRACES_DEF (512) + +struct stack_trace { + unsigned nr; + unsigned long long total; + unsigned long long max; + struct list_head lru; + unsigned long hash; + int depth; + void *frame[SLEEP_TRACE_DEPTH]; +}; + +struct __sleep_info { + spinlock_t lock; + int nr_traces; + struct list_head lru; + struct stack_trace traces[0]; +}; + +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + return p > (void *)tinfo && + p < (void *)tinfo + THREAD_SIZE - 3; +} + +static void fill_trace(struct stack_trace *trace) +{ + int i; + struct thread_info *tinfo = current_thread_info(); + unsigned long fp; /* frame pointer */ + + /* Grab fp right from our regs */ +#if defined(CONFIG_X86_64) && defined(CONFIG_FRAME_POINTER_FORCE) + asm ("movq %%rbp, %0" : "=r" (fp) : ); +#define FP_RETADDR_OFFSET (8) +#elif defined(CONFIG_X86) + asm ("movl %%ebp, %0" : "=r" (fp) : ); +#define FP_RETADDR_OFFSET (4) +#else +#error Unsupported platform. +#endif + + for (i = -3; /* skip three innermost frames */ + i < SLEEP_TRACE_DEPTH && valid_stack_ptr(tinfo, (void *)fp); + ++i, fp = *(unsigned long *)fp) { + if (i >= 0) + trace->frame[i] = *(void **)(fp + FP_RETADDR_OFFSET); + } + trace->depth = max(i, 0); +} + +static int stack_trace_eq(struct stack_trace *t1, struct stack_trace *t2) +{ + return + t1->hash == t2->hash && t1->depth == t2->depth && + !memcmp(t1->frame, t2->frame, t1->depth * sizeof t1->frame[0]); +} + +static unsigned long stack_trace_hash(struct stack_trace *trace) +{ + int i; + unsigned hash; + + for (i = 0, hash = 0; i < trace->depth; ++ i) + hash += ((unsigned long)trace->frame[i]) >> 3; + return hash; +} + +/* from sleepometer by Andrew Morton */ +static unsigned long long grab_time(void) +{ +#if defined(CONFIG_X86) && !defined(CONFIG_X86_64) + /* + * do_gettimeofday() goes backwards sometimes :(. Usethe TSC + */ + unsigned long long ret; + extern unsigned long cpu_khz; + + rdtscll(ret); + do_div(ret, cpu_khz / 1000); + return ret; +#else + struct timeval now; + unsigned long long ret; + + do_gettimeofday(&now); + ret = now.tv_sec; + ret *= 1000000; + ret += now.tv_usec; + return ret; +#endif +} + +static void zero_sleep_info(struct __sleep_info *info) +{ + int i; + + INIT_LIST_HEAD(&info->lru); + for (i = 0; i < info->nr_traces; ++ i) { + struct stack_trace *trace; + + trace = &info->traces[i]; + trace->nr = 0; + trace->total = 0; + trace->max = 0; + trace->hash = 0; + list_add(&trace->lru, &info->lru); + memset(trace->frame, 0, sizeof trace->frame); + } +} + +static int alloc_sleep_info(struct sleep_info *sinfo, int nr_traces) +{ + struct __sleep_info *result; + + result = kmalloc(sizeof *result + nr_traces * sizeof result->traces[0], + GFP_ATOMIC); + if (result == NULL) + return -ENOMEM; + + sinfo->p = result; + sinfo->last_in = 0; + result->nr_traces = nr_traces; + spin_lock_init(&result->lock); + zero_sleep_info(result); + return 0; +} + +void init_sleep_info(struct task_struct *tsk) +{ + tsk->sinfo.p = NULL; +} + +void free_sleep_info(struct sleep_info *sinfo) +{ + kfree(sinfo->p); + sinfo->p = NULL; +} + +void sleep_in_hook(struct task_struct *tsk) +{ + tsk->sinfo.last_in = grab_time(); +} + +void update_sinfo(struct __sleep_info *sinfo, unsigned long long last_in) +{ + if (sinfo != NULL && last_in != 0) { + unsigned long long delta; + struct stack_trace trace; + struct stack_trace *target; + int i; + + delta = grab_time() - last_in; + fill_trace(&trace); + target = NULL; /* to shut gcc up */ + trace.hash = stack_trace_hash(&trace); + spin_lock(&sinfo->lock); + for (i = 0; i < sinfo->nr_traces; ++ i) { + target = &sinfo->traces[i]; + if (stack_trace_eq(&trace, target)) { + ++ target->nr; + target->total += delta; + target->max = max(target->max, delta); + break; + } + } + if (i == sinfo->nr_traces) { + target = container_of(sinfo->lru.prev, + struct stack_trace, lru); + target->nr = 1; + target->total = target->max = delta; + target->hash = trace.hash; + target->depth = trace.depth; + memcpy(target->frame, trace.frame, sizeof target->frame); + } + list_move(&target->lru, &sinfo->lru); + spin_unlock(&sinfo->lock); + } +} + +static struct sleep_info global_sinfo = { + .p = NULL +}; + +void sleep_ex_hook(struct task_struct *tsk) +{ + struct sleep_info *cur; + + cur = &tsk->sinfo; + update_sinfo(cur->p, cur->last_in); + update_sinfo(global_sinfo.p, cur->last_in); + cur->last_in = 0; +} + +static spinlock_t sleep_serializer = SPIN_LOCK_UNLOCKED; + +static void *sinfo_start(struct sleep_info *sinfo, int nr_traces, loff_t l) +{ + spin_lock(&sleep_serializer); + if (sinfo->p == NULL) + alloc_sleep_info(sinfo, nr_traces); + spin_unlock(&sleep_serializer); + if (sinfo->p == NULL) + return NULL; + + if (l >= sinfo->p->nr_traces) + return NULL; + else + return &sinfo->p->traces[l]; +} + +static void *sinfo_next(struct sleep_info *sinfo, void *v, loff_t *pos) +{ + (*pos)++; + + if (*pos >= sinfo->p->nr_traces) + return NULL; + else + return ((struct stack_trace *)v) + 1; +} + +/* + * seq_file methods for /proc/pid/sleep. No locking is needed here, because we + * are iterating over sinfo->traces[] array rather than over sinfo->lru + * list. Actually spin locking is not allowed, because we can schedule between + * sleep_start() and sleep_stop(). + */ + +static void *sleep_start(struct seq_file *m, loff_t *pos) +{ + struct task_struct *task; + + task = m->private; + return sinfo_start(&task->sinfo, SLEEP_TRACES_DEF, *pos); +} + +static void sleep_stop(struct seq_file *m, void *v) +{ +} + +static void *sleep_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct task_struct *task; + + task = m->private; + return sinfo_next(&task->sinfo, v, pos); +} + +static int show_sleep(struct seq_file *m, void *v) +{ + struct stack_trace *trace; + int i; + + trace = v; + if (trace->nr == 0) + return 0; + + seq_printf(m, "\n%u %llu %llu", + trace->nr, trace->total, trace->max); + for (i = 0; i < trace->depth; ++ i) { + char *module; + const char *name; + char namebuf[128]; + unsigned long address; + unsigned long offset; + unsigned long size; + + address = (unsigned long) trace->frame[i]; + name = kallsyms_lookup(address, &size, + &offset, &module, namebuf); + seq_printf(m, "\n\t%i %#lx ", i, address); + if (name != NULL) + seq_printf(m, "%s+%#lx/%#lx", name, offset, size); + } + seq_printf(m, "\n"); + return 0; +} + +struct seq_operations proc_pid_sleep_op = { + .start = sleep_start, + .next = sleep_next, + .stop = sleep_stop, + .show = show_sleep +}; + +static int sleep_open(struct inode *inode, struct file *file) +{ + struct task_struct *task = PROC_I(inode)->task; + int ret = seq_open(file, &proc_pid_sleep_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = task; + } + return ret; +} + +static void reset_sleep_info(struct sleep_info *sinfo, int nr_traces) +{ + spin_lock(&sleep_serializer); + if (sinfo->p == NULL) + alloc_sleep_info(sinfo, nr_traces); + if (sinfo->p != NULL) + zero_sleep_info(sinfo->p); + spin_unlock(&sleep_serializer); +} + +static ssize_t sleep_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct task_struct *tsk = PROC_I(file->f_dentry->d_inode)->task; + + reset_sleep_info(&tsk->sinfo, SLEEP_TRACES_DEF); + return count; +} + +struct file_operations proc_sleep_operations = { + .open = sleep_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = sleep_write +}; + +static void *global_sleep_start(struct seq_file *m, loff_t *pos) +{ + return sinfo_start(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF, *pos); +} + +static void *global_sleep_next(struct seq_file *m, void *v, loff_t *pos) +{ + return sinfo_next(&global_sinfo, v, pos); +} + + +struct seq_operations global_sleep_op = { + .start = global_sleep_start, + .next = global_sleep_next, + .stop = sleep_stop, + .show = show_sleep +}; + +static int global_sleep_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &global_sleep_op); +} + +static ssize_t global_sleep_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + reset_sleep_info(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF); + return count; +} + +struct file_operations proc_global_sleep_operations = { + .open = global_sleep_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = global_sleep_write +}; + +static struct sleep_info stack_sinfo = { + .p = NULL +}; + +static void *global_stack_start(struct seq_file *m, loff_t *pos) +{ + return sinfo_start(&stack_sinfo, GLOBAL_STACK_TRACES_DEF, *pos); +} + +static void *global_stack_next(struct seq_file *m, void *v, loff_t *pos) +{ + return sinfo_next(&stack_sinfo, v, pos); +} + + +struct seq_operations global_stack_op = { + .start = global_stack_start, + .next = global_stack_next, + .stop = sleep_stop, + .show = show_sleep +}; + +static int global_stack_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &global_stack_op); +} + +static ssize_t global_stack_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + reset_sleep_info(&stack_sinfo, GLOBAL_STACK_TRACES_DEF); + return count; +} + +struct file_operations proc_global_stack_operations = { + .open = global_stack_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = global_stack_write +}; + +void stacktrace_record(void) +{ + if (!in_interrupt()) + update_sinfo(stack_sinfo.p, 1); +} +EXPORT_SYMBOL(stacktrace_record); + +/* CONFIG_PROC_SLEEP */ +#endif