--- /dev/null
+
+export blocking statistics in /proc/<pid>/task<tid>/sleep. Statistics
+collection for given file is activated on the first read of
+/proc/pid/sleep. When statistics collection is on on each context switch
+current back-trace is built (through __builtin_return_address()). For each
+monitored process there is a LRU list of such back-traces. Useful when trying
+to understand where elapsed time is spent.
+
+Signed-off-by: Nikita Danilov <nikita@clusterfs.com>
+
+
+ arch/i386/Kconfig.debug | 9 +
+ arch/i386/kernel/entry.S | 4
+ arch/i386/kernel/process.c | 52 +++++
+ arch/i386/mm/fault.c | 7
+ fs/proc/base.c | 15 +
+ fs/proc/proc_misc.c | 7
+ include/linux/sched.h | 9 -
+ include/linux/sleep_info.h | 48 +++++
+ kernel/Makefile | 1
+ kernel/exit.c | 10 -
+ kernel/fork.c | 17 +
+ kernel/sched.c | 4
+ kernel/sleep_info.c | 392 +++++++++++++++++++++++++++++++++++++++++++++
+ 13 files changed, 560 insertions(+), 15 deletions(-)
+
+Index: linux/arch/i386/Kconfig.debug
+===================================================================
+--- linux.orig/arch/i386/Kconfig.debug
++++ linux/arch/i386/Kconfig.debug
+@@ -79,4 +79,13 @@ config X86_MPPARSE
+
+ source "arch/i386/Kconfig.kgdb"
+
++config PROC_SLEEP
++ bool "Export sleep statistics"
++ depends on DEBUG_KERNEL && FRAME_POINTER
++ default n
++ help
++ If you say Y here, new file /proc/pid/stack will appear that contains
++ call-traces where given process blocked most of the time. If unsure say
++ N.
++
+ endmenu
+Index: linux/arch/i386/kernel/entry.S
+===================================================================
+--- linux.orig/arch/i386/kernel/entry.S
++++ linux/arch/i386/kernel/entry.S
+@@ -312,6 +312,8 @@ ENTRY(lcall27)
+ jmp do_lcall
+
+
++ENTRY(kernel_entries_start)
++
+ ENTRY(ret_from_fork)
+ pushl %eax
+ call schedule_tail
+@@ -548,6 +550,8 @@ syscall_badsys:
+ movl $-ENOSYS,EAX(%esp)
+ jmp resume_userspace
+
++ENTRY(kernel_entries_end)
++
+ /*
+ * Build the entry stubs and pointer table with
+ * some assembler magic.
+Index: linux/arch/i386/kernel/process.c
+===================================================================
+--- linux.orig/arch/i386/kernel/process.c
++++ linux/arch/i386/kernel/process.c
+@@ -137,6 +137,9 @@ static void poll_idle (void)
+ }
+ }
+
++void cpu_idle_before(void)
++{;}
++
+ /*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+@@ -206,6 +209,9 @@ void __init select_idle_routine(const st
+ }
+ }
+
++void cpu_idle_after(void)
++{;}
++
+ static int __init idle_setup (char *str)
+ {
+ if (!strncmp(str, "poll", 4)) {
+@@ -279,7 +285,7 @@ __asm__(".section .text\n"
+ /*
+ * Create a kernel thread
+ */
+-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
++static int arch_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+ {
+ struct pt_regs regs;
+
+@@ -299,6 +305,50 @@ int kernel_thread(int (*fn)(void *), voi
+ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL);
+ }
+
++#if defined(CONFIG_PROC_SLEEP)
++struct bag {
++ int (*fn)(void *);
++ void * arg;
++};
++
++void kernel_thread_trampoline_before(void)
++{;}
++
++static int kernel_thread_trampoline(void *__arg)
++{
++ struct bag *bag;
++ int (*fn)(void *);
++ void * arg;
++
++ bag = __arg;
++ fn = bag->fn;
++ arg = bag->arg;
++ kfree(bag);
++ return fn(arg);
++}
++
++void kernel_thread_trampoline_after(void)
++{;}
++
++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
++{
++ struct bag *bag;
++
++ bag = kmalloc(sizeof *bag, GFP_ATOMIC);
++ if (bag != NULL) {
++ bag->fn = fn;
++ bag->arg = arg;
++ return arch_kernel_thread(kernel_thread_trampoline, bag, flags);
++ } else
++ return -ENOMEM;
++}
++#else
++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
++{
++ return arch_kernel_thread(fn, arg, flags);
++}
++#endif
++
+ /*
+ * Free current thread data structures etc..
+ */
+Index: linux/arch/i386/mm/fault.c
+===================================================================
+--- linux.orig/arch/i386/mm/fault.c
++++ linux/arch/i386/mm/fault.c
+@@ -214,6 +214,9 @@ static inline int is_prefetch(struct pt_
+
+ asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
+
++void do_page_fault_before(void)
++{;}
++
+ /*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+@@ -574,3 +577,7 @@ vmalloc_fault:
+ return;
+ }
+ }
++
++void do_page_fault_after(void)
++{;}
++
+Index: linux/fs/proc/base.c
+===================================================================
+--- linux.orig/fs/proc/base.c
++++ linux/fs/proc/base.c
+@@ -103,6 +103,9 @@ enum pid_directory_inos {
+ #ifdef CONFIG_AUDITSYSCALL
+ PROC_TID_LOGINUID,
+ #endif
++#ifdef CONFIG_PROC_SLEEP
++ PROC_TID_SLEEP,
++#endif
+ PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
+ };
+
+@@ -170,6 +173,9 @@ static struct pid_entry tid_base_stuff[]
+ #ifdef CONFIG_AUDITSYSCALL
+ E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO),
+ #endif
++#ifdef CONFIG_PROC_SLEEP
++ E(PROC_TID_SLEEP, "sleep", S_IFREG|S_IRUGO),
++#endif
+ {0,0,NULL,0}
+ };
+
+@@ -1173,6 +1179,10 @@ out:
+ return ~0U;
+ }
+
++#ifdef CONFIG_PROC_SLEEP
++extern struct file_operations proc_sleep_operations;
++#endif
++
+ /* SMP-safe */
+ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
+ {
+@@ -1463,6 +1473,11 @@ static struct dentry *proc_pident_lookup
+ inode->i_fop = &proc_loginuid_operations;
+ break;
+ #endif
++#ifdef CONFIG_PROC_SLEEP
++ case PROC_TID_SLEEP:
++ inode->i_fop = &proc_sleep_operations;
++ break;
++#endif
+ default:
+ printk("procfs: impossible type (%d)",p->type);
+ iput(inode);
+Index: linux/fs/proc/proc_misc.c
+===================================================================
+--- linux.orig/fs/proc/proc_misc.c
++++ linux/fs/proc/proc_misc.c
+@@ -270,6 +270,10 @@ static struct file_operations proc_cpuin
+ .release = seq_release,
+ };
+
++#ifdef CONFIG_PROC_SLEEP
++extern struct file_operations proc_global_sleep_operations;
++#endif
++
+ extern struct seq_operations vmstat_op;
+ static int vmstat_open(struct inode *inode, struct file *file)
+ {
+@@ -641,4 +645,7 @@ void __init proc_misc_init(void)
+ entry->proc_fops = &ppc_htab_operations;
+ }
+ #endif
++#ifdef CONFIG_PROC_SLEEP
++ create_seq_entry("sleep", 0, &proc_global_sleep_operations);
++#endif
+ }
+Index: linux/include/linux/sched.h
+===================================================================
+--- linux.orig/include/linux/sched.h
++++ linux/include/linux/sched.h
+@@ -29,6 +29,7 @@
+ #include <linux/completion.h>
+ #include <linux/pid.h>
+ #include <linux/percpu.h>
++#include <linux/sleep_info.h>
+
+ struct exec_domain;
+ extern int exec_shield;
+@@ -646,6 +647,8 @@ struct task_struct {
+ /* process tracking callback */
+ struct list_head ptrack_list;
+ #endif
++ /* where this task blocked */
++ struct sleep_info sinfo;
+ };
+
+ static inline pid_t process_group(struct task_struct *tsk)
+Index: linux/include/linux/sleep_info.h
+===================================================================
+--- linux.orig/include/linux/sleep_info.h
++++ linux/include/linux/sleep_info.h
+@@ -0,0 +1,48 @@
++#ifndef _LINUX_SLEEP_INFO_H
++#define _LINUX_SLEEP_INFO_H
++
++#ifdef __KERNEL__
++
++#include <linux/config.h>
++#include <linux/list.h>
++#include <linux/stddef.h>
++#include <linux/spinlock.h>
++
++#ifdef CONFIG_PROC_SLEEP
++
++struct __sleep_info;
++
++struct sleep_info {
++ struct __sleep_info *p;
++ unsigned long long last_in;
++};
++
++extern void init_sleep_info(struct task_struct *tsk);
++extern void free_sleep_info(struct sleep_info *sinfo);
++
++struct task_struct;
++
++extern void sleep_in_hook(struct task_struct *tsk);
++extern void sleep_ex_hook(struct task_struct *tsk);
++
++extern struct file_operations proc_sleep_operations;
++
++/* CONFIG_PROC_SLEEP */
++#else
++
++struct sleep_info {};
++
++#define init_sleep_info(tsk)
++#define free_sleep_info(sinfo)
++
++#define sleep_in_hook(tsk)
++#define sleep_ex_hook(tsk)
++
++/* CONFIG_PROC_SLEEP */
++#endif
++
++/* __KERNEL__ */
++#endif
++
++/* _LINUX_SLEEP_INFO_H */
++#endif
+Index: linux/kernel/Makefile
+===================================================================
+--- linux.orig/kernel/Makefile
++++ linux/kernel/Makefile
+@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
+ obj-$(CONFIG_AUDITFILESYSTEM) += auditfs.o
+ obj-$(CONFIG_KPROBES) += kprobes.o
+ obj-$(CONFIG_PTRACK) += ptrack.o
++obj-$(CONFIG_PROC_SLEEP) += sleep_info.o
+
+ ifneq ($(CONFIG_IA64),y)
+ # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
+Index: linux/kernel/exit.c
+===================================================================
+--- linux.orig/kernel/exit.c
++++ linux/kernel/exit.c
+@@ -25,6 +25,7 @@
+ #include <linux/mount.h>
+ #include <linux/proc_fs.h>
+ #include <linux/mempolicy.h>
++#include <linux/sleep_info.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/unistd.h>
+@@ -96,6 +97,7 @@ repeat:
+ write_unlock_irq(&tasklist_lock);
+ spin_unlock(&p->proc_lock);
+ proc_pid_flush(proc_dentry);
++ free_sleep_info(&p->sinfo);
+ release_thread(p);
+ put_task_struct(p);
+
+Index: linux/kernel/fork.c
+===================================================================
+--- linux.orig/kernel/fork.c
++++ linux/kernel/fork.c
+@@ -40,6 +40,7 @@
+ #include <linux/audit.h>
+ #include <linux/profile.h>
+ #include <linux/rmap.h>
++#include <linux/sleep_info.h>
+
+ #include <asm/pgtable.h>
+ #include <asm/pgalloc.h>
+@@ -1172,6 +1173,8 @@ static task_t *copy_process(unsigned lon
+ if (!current->signal->tty && p->signal->tty)
+ p->signal->tty = NULL;
+
++ init_sleep_info(p);
++
+ nr_threads++;
+ write_unlock_irq(&tasklist_lock);
+ retval = 0;
+Index: linux/kernel/sched.c
+===================================================================
+--- linux.orig/kernel/sched.c
++++ linux/kernel/sched.c
+@@ -2656,6 +2656,8 @@ asmlinkage void __sched schedule(void)
+ }
+ }
+
++ sleep_in_hook(current);
++
+ need_resched:
+ preempt_disable();
+ prev = current;
+@@ -2797,6 +2799,8 @@ switch_tasks:
+ preempt_enable_no_resched();
+ if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+ goto need_resched;
++
++ sleep_ex_hook(current);
+ }
+
+ EXPORT_SYMBOL(schedule);
+Index: linux/kernel/sleep_info.c
+===================================================================
+--- linux.orig/kernel/sleep_info.c
++++ linux/kernel/sleep_info.c
+@@ -0,0 +1,392 @@
++#include <linux/config.h>
++#include <linux/sleep_info.h>
++#include <linux/seq_file.h>
++#include <linux/kallsyms.h>
++#include <linux/slab.h>
++#include <linux/sched.h>
++#include <linux/proc_fs.h>
++#include <linux/fs.h>
++
++#include <asm/div64.h>
++#include <linux/errno.h>
++
++#ifdef CONFIG_PROC_SLEEP
++
++#define SLEEP_TRACES_DEF (32)
++#define GLOBAL_SLEEP_TRACES_DEF (512)
++#define SLEEP_TRACE_DEPTH (20)
++
++struct stack_trace {
++ unsigned nr;
++ unsigned long long total;
++ unsigned long long max;
++ struct list_head lru;
++ unsigned long hash;
++ void *frame[SLEEP_TRACE_DEPTH];
++};
++
++struct __sleep_info {
++ spinlock_t lock;
++ int nr_traces;
++ struct list_head lru;
++ struct stack_trace traces[0];
++};
++
++extern struct exec_domain default_exec_domain;
++
++extern void kernel_thread_trampoline_before(void);
++extern void kernel_thread_trampoline_after(void);
++
++extern void cpu_idle_before(void);
++extern void cpu_idle_after(void);
++
++extern void do_page_fault_before(void);
++extern void do_page_fault_after(void);
++
++extern void kernel_entries_start(void);
++extern void kernel_entries_end(void);
++
++static int is_last_frame(void *addr)
++{
++ if (addr == NULL)
++ return 1;
++ else if ((void *)kernel_thread_trampoline_before < addr &&
++ addr < (void *)kernel_thread_trampoline_after)
++ return 1;
++ else if ((void *)cpu_idle_before < addr && addr < (void *)cpu_idle_after)
++ return 1;
++ else if ((void *)do_page_fault_before < addr &&
++ addr < (void *)do_page_fault_after)
++ return 1;
++ else if ((void *)kernel_entries_start < addr &&
++ addr < (void *)kernel_entries_end)
++ return 1;
++ else if (!kernel_text_address((unsigned long)addr))
++ return 1;
++ else
++ return 0;
++}
++
++static void fill_trace(struct stack_trace *trace)
++{
++ int i;
++ void *addr;
++
++#define FRAME(nr) \
++ case (nr): \
++ addr = __builtin_return_address((nr) + 2); \
++ break
++
++ memset(trace->frame, 0, sizeof trace->frame);
++ addr = NULL;
++ for (i = 0; i < SLEEP_TRACE_DEPTH; ++ i) {
++ switch(i) {
++ FRAME(0);
++ FRAME(1);
++ FRAME(2);
++ FRAME(3);
++ FRAME(4);
++ FRAME(5);
++ FRAME(6);
++ FRAME(7);
++ FRAME(8);
++ FRAME(9);
++ FRAME(10);
++ FRAME(11);
++ FRAME(12);
++ FRAME(13);
++ FRAME(14);
++ FRAME(15);
++ FRAME(16);
++ FRAME(17);
++ FRAME(18);
++ FRAME(19);
++ FRAME(20);
++ default:
++ BUG();
++ }
++ trace->frame[i] = addr;
++ if (is_last_frame(addr))
++ break;
++ }
++
++}
++
++static int stack_trace_eq(struct stack_trace *t1, struct stack_trace *t2)
++{
++ return
++ t1->hash == t2->hash &&
++ !memcmp(t1->frame, t2->frame, sizeof t1->frame);
++}
++
++static unsigned long stack_trace_hash(struct stack_trace *trace)
++{
++ int i;
++ unsigned hash;
++
++ for (i = 0, hash = 0; i < SLEEP_TRACE_DEPTH; ++ i)
++ hash += (unsigned long)trace->frame[i];
++ return hash;
++}
++
++/* from sleepometer by Andrew Morton */
++static unsigned long long grab_time(void)
++{
++#ifdef CONFIG_X86
++ /*
++ * do_gettimeofday() goes backwards sometimes :(. Usethe TSC
++ */
++ unsigned long long ret;
++ extern unsigned long cpu_khz;
++
++ rdtscll(ret);
++ do_div(ret, cpu_khz / 1000);
++ return ret;
++#else
++ struct timeval now;
++ unsigned long long ret;
++
++ do_gettimeofday(&now);
++ ret = now.tv_sec;
++ ret *= 1000000;
++ ret += now.tv_usec;
++ return ret;
++#endif
++}
++
++static int alloc_sleep_info(struct sleep_info *sinfo, int nr_traces)
++{
++ struct __sleep_info *result;
++ int i;
++
++ result = kmalloc(sizeof *result + nr_traces * sizeof result->traces[0],
++ GFP_ATOMIC);
++ if (result == NULL)
++ return -ENOMEM;
++
++ sinfo->p = result;
++ sinfo->last_in = 0;
++ result->nr_traces = nr_traces;
++ spin_lock_init(&result->lock);
++ INIT_LIST_HEAD(&result->lru);
++ for (i = 0; i < nr_traces; ++ i) {
++ struct stack_trace *trace;
++
++ trace = &result->traces[i];
++ trace->nr = 0;
++ trace->total = 0;
++ trace->max = 0;
++ trace->hash = 0;
++ list_add(&trace->lru, &result->lru);
++ memset(trace->frame, 0, sizeof trace->frame);
++ }
++ return 0;
++}
++
++void init_sleep_info(struct task_struct *tsk)
++{
++ tsk->sinfo.p = NULL;
++}
++
++void free_sleep_info(struct sleep_info *sinfo)
++{
++ kfree(sinfo->p);
++ sinfo->p = NULL;
++}
++
++void sleep_in_hook(struct task_struct *tsk)
++{
++ tsk->sinfo.last_in = grab_time();
++}
++
++void update_sinfo(struct __sleep_info *sinfo, unsigned long long last_in)
++{
++ if (sinfo != NULL && last_in != 0) {
++ unsigned long long delta;
++ struct stack_trace trace;
++ struct stack_trace *target;
++ int i;
++
++ delta = grab_time() - last_in;
++ fill_trace(&trace);
++ target = NULL; /* to shut gcc up */
++ trace.hash = stack_trace_hash(&trace);
++ spin_lock(&sinfo->lock);
++ for (i = 0; i < sinfo->nr_traces; ++ i) {
++ target = &sinfo->traces[i];
++ if (stack_trace_eq(&trace, target)) {
++ ++ target->nr;
++ target->total += delta;
++ target->max = max(target->max, delta);
++ break;
++ }
++ }
++ if (i == sinfo->nr_traces) {
++ target = container_of(sinfo->lru.prev,
++ struct stack_trace, lru);
++ target->nr = 1;
++ target->total = target->max = delta;
++ target->hash = trace.hash;
++ memcpy(target->frame, trace.frame, sizeof target->frame);
++ }
++ list_move(&target->lru, &sinfo->lru);
++ spin_unlock(&sinfo->lock);
++ }
++}
++
++static struct sleep_info global_sinfo = {
++ .p = NULL
++};
++
++void sleep_ex_hook(struct task_struct *tsk)
++{
++ struct sleep_info *cur;
++
++ cur = &tsk->sinfo;
++ update_sinfo(cur->p, cur->last_in);
++ update_sinfo(global_sinfo.p, cur->last_in);
++ cur->last_in = 0;
++}
++
++static spinlock_t sleep_serializer = SPIN_LOCK_UNLOCKED;
++
++static void *sinfo_start(struct sleep_info *sinfo, int nr_traces, loff_t l)
++{
++ spin_lock(&sleep_serializer);
++ if (sinfo->p == NULL)
++ alloc_sleep_info(sinfo, nr_traces);
++ spin_unlock(&sleep_serializer);
++ if (sinfo->p == NULL)
++ return NULL;
++
++ if (l >= sinfo->p->nr_traces)
++ return NULL;
++ else
++ return &sinfo->p->traces[l];
++}
++
++static void *sinfo_next(struct sleep_info *sinfo, void *v, loff_t *pos)
++{
++ (*pos)++;
++
++ if (*pos >= sinfo->p->nr_traces)
++ return NULL;
++ else
++ return ((struct stack_trace *)v) + 1;
++}
++
++/*
++ * seq_file methods for /proc/pid/sleep. No locking is needed here, because we
++ * are iterating over sinfo->traces[] array rather than over sinfo->lru
++ * list. Actually spin locking is not allowed, because we can schedule between
++ * sleep_start() and sleep_stop().
++ */
++
++static void *sleep_start(struct seq_file *m, loff_t *pos)
++{
++ struct task_struct *task;
++
++ task = m->private;
++ return sinfo_start(&task->sinfo, SLEEP_TRACES_DEF, *pos);
++}
++
++static void sleep_stop(struct seq_file *m, void *v)
++{
++}
++
++static void *sleep_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ struct task_struct *task;
++
++ task = m->private;
++ return sinfo_next(&task->sinfo, v, pos);
++}
++
++static int show_sleep(struct seq_file *m, void *v)
++{
++ struct stack_trace *trace;
++ int i;
++
++ trace = v;
++ if (trace->nr == 0)
++ return 0;
++
++ seq_printf(m, "\n%u %llu %llu",
++ trace->nr, trace->total, trace->max);
++ for (i = 0; i < SLEEP_TRACE_DEPTH; ++ i) {
++ char *module;
++ const char *name;
++ char namebuf[128];
++ unsigned long address;
++ unsigned long offset;
++ unsigned long size;
++
++ address = (unsigned long) trace->frame[i];
++ name = kallsyms_lookup(address, &size,
++ &offset, &module, namebuf);
++ seq_printf(m, "\n\t%i %#lx ", i, address);
++ if (name != NULL)
++ seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
++ }
++ seq_printf(m, "\n");
++ return 0;
++}
++
++struct seq_operations proc_pid_sleep_op = {
++ .start = sleep_start,
++ .next = sleep_next,
++ .stop = sleep_stop,
++ .show = show_sleep
++};
++
++static int sleep_open(struct inode *inode, struct file *file)
++{
++ struct task_struct *task = PROC_I(inode)->task;
++ int ret = seq_open(file, &proc_pid_sleep_op);
++ if (!ret) {
++ struct seq_file *m = file->private_data;
++ m->private = task;
++ }
++ return ret;
++}
++
++struct file_operations proc_sleep_operations = {
++ .open = sleep_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++static void *global_sleep_start(struct seq_file *m, loff_t *pos)
++{
++ return sinfo_start(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF, *pos);
++}
++
++static void *global_sleep_next(struct seq_file *m, void *v, loff_t *pos)
++{
++ return sinfo_next(&global_sinfo, v, pos);
++}
++
++
++struct seq_operations global_sleep_op = {
++ .start = global_sleep_start,
++ .next = global_sleep_next,
++ .stop = sleep_stop,
++ .show = show_sleep
++};
++
++static int global_sleep_open(struct inode *inode, struct file *file)
++{
++ return seq_open(file, &global_sleep_op);
++}
++
++struct file_operations proc_global_sleep_operations = {
++ .open = global_sleep_open,
++ .read = seq_read,
++ .llseek = seq_lseek,
++ .release = seq_release,
++};
++
++
++/* CONFIG_PROC_SLEEP */
++#endif