From: nikita Date: Tue, 17 Oct 2006 22:00:30 +0000 (+0000) Subject: patches: add sleeping statistics collection patch. See mail message to colibri@ for... X-Git-Tag: v1_8_0_110~486^2~470 X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=4ab791fb779d5b6166ce3125479943a5b18b4695;p=fs%2Flustre-release.git patches: add sleeping statistics collection patch. See mail message to colibri@ for more information. --- diff --git a/lustre/kernel_patches/patches/proc-sleep-2.6.9.patch b/lustre/kernel_patches/patches/proc-sleep-2.6.9.patch new file mode 100644 index 0000000..d04fddd --- /dev/null +++ b/lustre/kernel_patches/patches/proc-sleep-2.6.9.patch @@ -0,0 +1,788 @@ + +export blocking statistics in /proc/<pid>/task/<tid>/sleep. Statistics +collection for a given file is activated on the first read of +/proc/pid/sleep. When statistics collection is on, on each context switch +the current back-trace is built (through __builtin_return_address()). For each +monitored process there is an LRU list of such back-traces. Useful when trying +to understand where elapsed time is spent. 
+ +Signed-off-by: Nikita Danilov + + + arch/i386/Kconfig.debug | 9 + + arch/i386/kernel/entry.S | 4 + arch/i386/kernel/process.c | 52 +++++ + arch/i386/mm/fault.c | 7 + fs/proc/base.c | 15 + + fs/proc/proc_misc.c | 7 + include/linux/sched.h | 9 - + include/linux/sleep_info.h | 48 +++++ + kernel/Makefile | 1 + kernel/exit.c | 10 - + kernel/fork.c | 17 + + kernel/sched.c | 4 + kernel/sleep_info.c | 392 +++++++++++++++++++++++++++++++++++++++++++++ + 13 files changed, 560 insertions(+), 15 deletions(-) + +Index: linux/arch/i386/Kconfig.debug +=================================================================== +--- linux.orig/arch/i386/Kconfig.debug ++++ linux/arch/i386/Kconfig.debug +@@ -79,4 +79,13 @@ config X86_MPPARSE + + source "arch/i386/Kconfig.kgdb" + ++config PROC_SLEEP ++ bool "Export sleep statistics" ++ depends on DEBUG_KERNEL && FRAME_POINTER ++ default n ++ help ++ If you say Y here, new file /proc/pid/stack will appear that contains ++ call-traces where given process blocked most of the time. If unsure say ++ N. ++ + endmenu +Index: linux/arch/i386/kernel/entry.S +=================================================================== +--- linux.orig/arch/i386/kernel/entry.S ++++ linux/arch/i386/kernel/entry.S +@@ -312,6 +312,8 @@ ENTRY(lcall27) + jmp do_lcall + + ++ENTRY(kernel_entries_start) ++ + ENTRY(ret_from_fork) + pushl %eax + call schedule_tail +@@ -548,6 +550,8 @@ syscall_badsys: + movl $-ENOSYS,EAX(%esp) + jmp resume_userspace + ++ENTRY(kernel_entries_end) ++ + /* + * Build the entry stubs and pointer table with + * some assembler magic. +Index: linux/arch/i386/kernel/process.c +=================================================================== +--- linux.orig/arch/i386/kernel/process.c ++++ linux/arch/i386/kernel/process.c +@@ -137,6 +137,9 @@ static void poll_idle (void) + } + } + ++void cpu_idle_before(void) ++{;} ++ + /* + * The idle thread. 
There's no useful work to be + * done, so just try to conserve power and have a +@@ -206,6 +209,9 @@ void __init select_idle_routine(const st + } + } + ++void cpu_idle_after(void) ++{;} ++ + static int __init idle_setup (char *str) + { + if (!strncmp(str, "poll", 4)) { +@@ -279,7 +285,7 @@ __asm__(".section .text\n" + /* + * Create a kernel thread + */ +-int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++static int arch_kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) + { + struct pt_regs regs; + +@@ -299,6 +305,50 @@ int kernel_thread(int (*fn)(void *), voi + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); + } + ++#if defined(CONFIG_PROC_SLEEP) ++struct bag { ++ int (*fn)(void *); ++ void * arg; ++}; ++ ++void kernel_thread_trampoline_before(void) ++{;} ++ ++static int kernel_thread_trampoline(void *__arg) ++{ ++ struct bag *bag; ++ int (*fn)(void *); ++ void * arg; ++ ++ bag = __arg; ++ fn = bag->fn; ++ arg = bag->arg; ++ kfree(bag); ++ return fn(arg); ++} ++ ++void kernel_thread_trampoline_after(void) ++{;} ++ ++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++{ ++ struct bag *bag; ++ ++ bag = kmalloc(sizeof *bag, GFP_ATOMIC); ++ if (bag != NULL) { ++ bag->fn = fn; ++ bag->arg = arg; ++ return arch_kernel_thread(kernel_thread_trampoline, bag, flags); ++ } else ++ return -ENOMEM; ++} ++#else ++int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) ++{ ++ return arch_kernel_thread(fn, arg, flags); ++} ++#endif ++ + /* + * Free current thread data structures etc.. + */ +Index: linux/arch/i386/mm/fault.c +=================================================================== +--- linux.orig/arch/i386/mm/fault.c ++++ linux/arch/i386/mm/fault.c +@@ -214,6 +214,9 @@ static inline int is_prefetch(struct pt_ + + asmlinkage void do_invalid_op(struct pt_regs *, unsigned long); + ++void do_page_fault_before(void) ++{;} ++ + /* + * This routine handles page faults. 
It determines the address, + * and the problem, and then passes it off to one of the appropriate +@@ -574,3 +577,7 @@ vmalloc_fault: + return; + } + } ++ ++void do_page_fault_after(void) ++{;} ++ +Index: linux/fs/proc/base.c +=================================================================== +--- linux.orig/fs/proc/base.c ++++ linux/fs/proc/base.c +@@ -103,6 +103,9 @@ enum pid_directory_inos { + #ifdef CONFIG_AUDITSYSCALL + PROC_TID_LOGINUID, + #endif ++#ifdef CONFIG_PROC_SLEEP ++ PROC_TID_SLEEP, ++#endif + PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ + }; + +@@ -170,6 +173,9 @@ static struct pid_entry tid_base_stuff[] + #ifdef CONFIG_AUDITSYSCALL + E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), + #endif ++#ifdef CONFIG_PROC_SLEEP ++ E(PROC_TID_SLEEP, "sleep", S_IFREG|S_IRUGO), ++#endif + {0,0,NULL,0} + }; + +@@ -1173,6 +1179,10 @@ out: + return ~0U; + } + ++#ifdef CONFIG_PROC_SLEEP ++extern struct file_operations proc_sleep_operations; ++#endif ++ + /* SMP-safe */ + static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) + { +@@ -1463,6 +1473,11 @@ static struct dentry *proc_pident_lookup + inode->i_fop = &proc_loginuid_operations; + break; + #endif ++#ifdef CONFIG_PROC_SLEEP ++ case PROC_TID_SLEEP: ++ inode->i_fop = &proc_sleep_operations; ++ break; ++#endif + default: + printk("procfs: impossible type (%d)",p->type); + iput(inode); +Index: linux/fs/proc/proc_misc.c +=================================================================== +--- linux.orig/fs/proc/proc_misc.c ++++ linux/fs/proc/proc_misc.c +@@ -270,6 +270,10 @@ static struct file_operations proc_cpuin + .release = seq_release, + }; + ++#ifdef CONFIG_PROC_SLEEP ++extern struct file_operations proc_global_sleep_operations; ++#endif ++ + extern struct seq_operations vmstat_op; + static int vmstat_open(struct inode *inode, struct file *file) + { +@@ -641,4 +645,7 @@ void __init proc_misc_init(void) + entry->proc_fops = &ppc_htab_operations; + } + 
#endif ++#ifdef CONFIG_PROC_SLEEP ++ create_seq_entry("sleep", 0, &proc_global_sleep_operations); ++#endif + } +Index: linux/include/linux/sched.h +=================================================================== +--- linux.orig/include/linux/sched.h ++++ linux/include/linux/sched.h +@@ -29,6 +29,7 @@ + #include + #include + #include ++#include + + struct exec_domain; + extern int exec_shield; +@@ -646,6 +647,8 @@ struct task_struct { + /* process tracking callback */ + struct list_head ptrack_list; + #endif ++ /* where this task blocked */ ++ struct sleep_info sinfo; + }; + + static inline pid_t process_group(struct task_struct *tsk) +Index: linux/include/linux/sleep_info.h +=================================================================== +--- linux.orig/include/linux/sleep_info.h ++++ linux/include/linux/sleep_info.h +@@ -0,0 +1,48 @@ ++#ifndef _LINUX_SLEEP_INFO_H ++#define _LINUX_SLEEP_INFO_H ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PROC_SLEEP ++ ++struct __sleep_info; ++ ++struct sleep_info { ++ struct __sleep_info *p; ++ unsigned long long last_in; ++}; ++ ++extern void init_sleep_info(struct task_struct *tsk); ++extern void free_sleep_info(struct sleep_info *sinfo); ++ ++struct task_struct; ++ ++extern void sleep_in_hook(struct task_struct *tsk); ++extern void sleep_ex_hook(struct task_struct *tsk); ++ ++extern struct file_operations proc_sleep_operations; ++ ++/* CONFIG_PROC_SLEEP */ ++#else ++ ++struct sleep_info {}; ++ ++#define init_sleep_info(tsk) ++#define free_sleep_info(sinfo) ++ ++#define sleep_in_hook(tsk) ++#define sleep_ex_hook(tsk) ++ ++/* CONFIG_PROC_SLEEP */ ++#endif ++ ++/* __KERNEL__ */ ++#endif ++ ++/* _LINUX_SLEEP_INFO_H */ ++#endif +Index: linux/kernel/Makefile +=================================================================== +--- linux.orig/kernel/Makefile ++++ linux/kernel/Makefile +@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o + obj-$(CONFIG_AUDITFILESYSTEM) += 
auditfs.o + obj-$(CONFIG_KPROBES) += kprobes.o + obj-$(CONFIG_PTRACK) += ptrack.o ++obj-$(CONFIG_PROC_SLEEP) += sleep_info.o + + ifneq ($(CONFIG_IA64),y) + # According to Alan Modra , the -fno-omit-frame-pointer is +Index: linux/kernel/exit.c +=================================================================== +--- linux.orig/kernel/exit.c ++++ linux/kernel/exit.c +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -96,6 +97,7 @@ repeat: + write_unlock_irq(&tasklist_lock); + spin_unlock(&p->proc_lock); + proc_pid_flush(proc_dentry); ++ free_sleep_info(&p->sinfo); + release_thread(p); + put_task_struct(p); + +Index: linux/kernel/fork.c +=================================================================== +--- linux.orig/kernel/fork.c ++++ linux/kernel/fork.c +@@ -40,6 +40,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1172,6 +1173,8 @@ static task_t *copy_process(unsigned lon + if (!current->signal->tty && p->signal->tty) + p->signal->tty = NULL; + ++ init_sleep_info(p); ++ + nr_threads++; + write_unlock_irq(&tasklist_lock); + retval = 0; +Index: linux/kernel/sched.c +=================================================================== +--- linux.orig/kernel/sched.c ++++ linux/kernel/sched.c +@@ -2656,6 +2656,8 @@ asmlinkage void __sched schedule(void) + } + } + ++ sleep_in_hook(current); ++ + need_resched: + preempt_disable(); + prev = current; +@@ -2797,6 +2799,8 @@ switch_tasks: + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; ++ ++ sleep_ex_hook(current); + } + + EXPORT_SYMBOL(schedule); +Index: linux/kernel/sleep_info.c +=================================================================== +--- linux.orig/kernel/sleep_info.c ++++ linux/kernel/sleep_info.c +@@ -0,0 +1,392 @@ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#ifdef CONFIG_PROC_SLEEP ++ ++#define 
SLEEP_TRACES_DEF (32) ++#define GLOBAL_SLEEP_TRACES_DEF (512) ++#define SLEEP_TRACE_DEPTH (20) ++ ++struct stack_trace { ++ unsigned nr; ++ unsigned long long total; ++ unsigned long long max; ++ struct list_head lru; ++ unsigned long hash; ++ void *frame[SLEEP_TRACE_DEPTH]; ++}; ++ ++struct __sleep_info { ++ spinlock_t lock; ++ int nr_traces; ++ struct list_head lru; ++ struct stack_trace traces[0]; ++}; ++ ++extern struct exec_domain default_exec_domain; ++ ++extern void kernel_thread_trampoline_before(void); ++extern void kernel_thread_trampoline_after(void); ++ ++extern void cpu_idle_before(void); ++extern void cpu_idle_after(void); ++ ++extern void do_page_fault_before(void); ++extern void do_page_fault_after(void); ++ ++extern void kernel_entries_start(void); ++extern void kernel_entries_end(void); ++ ++static int is_last_frame(void *addr) ++{ ++ if (addr == NULL) ++ return 1; ++ else if ((void *)kernel_thread_trampoline_before < addr && ++ addr < (void *)kernel_thread_trampoline_after) ++ return 1; ++ else if ((void *)cpu_idle_before < addr && addr < (void *)cpu_idle_after) ++ return 1; ++ else if ((void *)do_page_fault_before < addr && ++ addr < (void *)do_page_fault_after) ++ return 1; ++ else if ((void *)kernel_entries_start < addr && ++ addr < (void *)kernel_entries_end) ++ return 1; ++ else if (!kernel_text_address((unsigned long)addr)) ++ return 1; ++ else ++ return 0; ++} ++ ++static void fill_trace(struct stack_trace *trace) ++{ ++ int i; ++ void *addr; ++ ++#define FRAME(nr) \ ++ case (nr): \ ++ addr = __builtin_return_address((nr) + 2); \ ++ break ++ ++ memset(trace->frame, 0, sizeof trace->frame); ++ addr = NULL; ++ for (i = 0; i < SLEEP_TRACE_DEPTH; ++ i) { ++ switch(i) { ++ FRAME(0); ++ FRAME(1); ++ FRAME(2); ++ FRAME(3); ++ FRAME(4); ++ FRAME(5); ++ FRAME(6); ++ FRAME(7); ++ FRAME(8); ++ FRAME(9); ++ FRAME(10); ++ FRAME(11); ++ FRAME(12); ++ FRAME(13); ++ FRAME(14); ++ FRAME(15); ++ FRAME(16); ++ FRAME(17); ++ FRAME(18); ++ FRAME(19); ++ 
FRAME(20); ++ default: ++ BUG(); ++ } ++ trace->frame[i] = addr; ++ if (is_last_frame(addr)) ++ break; ++ } ++ ++} ++ ++static int stack_trace_eq(struct stack_trace *t1, struct stack_trace *t2) ++{ ++ return ++ t1->hash == t2->hash && ++ !memcmp(t1->frame, t2->frame, sizeof t1->frame); ++} ++ ++static unsigned long stack_trace_hash(struct stack_trace *trace) ++{ ++ int i; ++ unsigned hash; ++ ++ for (i = 0, hash = 0; i < SLEEP_TRACE_DEPTH; ++ i) ++ hash += (unsigned long)trace->frame[i]; ++ return hash; ++} ++ ++/* from sleepometer by Andrew Morton */ ++static unsigned long long grab_time(void) ++{ ++#ifdef CONFIG_X86 ++ /* ++ * do_gettimeofday() goes backwards sometimes :(. Usethe TSC ++ */ ++ unsigned long long ret; ++ extern unsigned long cpu_khz; ++ ++ rdtscll(ret); ++ do_div(ret, cpu_khz / 1000); ++ return ret; ++#else ++ struct timeval now; ++ unsigned long long ret; ++ ++ do_gettimeofday(&now); ++ ret = now.tv_sec; ++ ret *= 1000000; ++ ret += now.tv_usec; ++ return ret; ++#endif ++} ++ ++static int alloc_sleep_info(struct sleep_info *sinfo, int nr_traces) ++{ ++ struct __sleep_info *result; ++ int i; ++ ++ result = kmalloc(sizeof *result + nr_traces * sizeof result->traces[0], ++ GFP_ATOMIC); ++ if (result == NULL) ++ return -ENOMEM; ++ ++ sinfo->p = result; ++ sinfo->last_in = 0; ++ result->nr_traces = nr_traces; ++ spin_lock_init(&result->lock); ++ INIT_LIST_HEAD(&result->lru); ++ for (i = 0; i < nr_traces; ++ i) { ++ struct stack_trace *trace; ++ ++ trace = &result->traces[i]; ++ trace->nr = 0; ++ trace->total = 0; ++ trace->max = 0; ++ trace->hash = 0; ++ list_add(&trace->lru, &result->lru); ++ memset(trace->frame, 0, sizeof trace->frame); ++ } ++ return 0; ++} ++ ++void init_sleep_info(struct task_struct *tsk) ++{ ++ tsk->sinfo.p = NULL; ++} ++ ++void free_sleep_info(struct sleep_info *sinfo) ++{ ++ kfree(sinfo->p); ++ sinfo->p = NULL; ++} ++ ++void sleep_in_hook(struct task_struct *tsk) ++{ ++ tsk->sinfo.last_in = grab_time(); ++} ++ ++void 
update_sinfo(struct __sleep_info *sinfo, unsigned long long last_in) ++{ ++ if (sinfo != NULL && last_in != 0) { ++ unsigned long long delta; ++ struct stack_trace trace; ++ struct stack_trace *target; ++ int i; ++ ++ delta = grab_time() - last_in; ++ fill_trace(&trace); ++ target = NULL; /* to shut gcc up */ ++ trace.hash = stack_trace_hash(&trace); ++ spin_lock(&sinfo->lock); ++ for (i = 0; i < sinfo->nr_traces; ++ i) { ++ target = &sinfo->traces[i]; ++ if (stack_trace_eq(&trace, target)) { ++ ++ target->nr; ++ target->total += delta; ++ target->max = max(target->max, delta); ++ break; ++ } ++ } ++ if (i == sinfo->nr_traces) { ++ target = container_of(sinfo->lru.prev, ++ struct stack_trace, lru); ++ target->nr = 1; ++ target->total = target->max = delta; ++ target->hash = trace.hash; ++ memcpy(target->frame, trace.frame, sizeof target->frame); ++ } ++ list_move(&target->lru, &sinfo->lru); ++ spin_unlock(&sinfo->lock); ++ } ++} ++ ++static struct sleep_info global_sinfo = { ++ .p = NULL ++}; ++ ++void sleep_ex_hook(struct task_struct *tsk) ++{ ++ struct sleep_info *cur; ++ ++ cur = &tsk->sinfo; ++ update_sinfo(cur->p, cur->last_in); ++ update_sinfo(global_sinfo.p, cur->last_in); ++ cur->last_in = 0; ++} ++ ++static spinlock_t sleep_serializer = SPIN_LOCK_UNLOCKED; ++ ++static void *sinfo_start(struct sleep_info *sinfo, int nr_traces, loff_t l) ++{ ++ spin_lock(&sleep_serializer); ++ if (sinfo->p == NULL) ++ alloc_sleep_info(sinfo, nr_traces); ++ spin_unlock(&sleep_serializer); ++ if (sinfo->p == NULL) ++ return NULL; ++ ++ if (l >= sinfo->p->nr_traces) ++ return NULL; ++ else ++ return &sinfo->p->traces[l]; ++} ++ ++static void *sinfo_next(struct sleep_info *sinfo, void *v, loff_t *pos) ++{ ++ (*pos)++; ++ ++ if (*pos >= sinfo->p->nr_traces) ++ return NULL; ++ else ++ return ((struct stack_trace *)v) + 1; ++} ++ ++/* ++ * seq_file methods for /proc/pid/sleep. 
No locking is needed here, because we ++ * are iterating over sinfo->traces[] array rather than over sinfo->lru ++ * list. Actually spin locking is not allowed, because we can schedule between ++ * sleep_start() and sleep_stop(). ++ */ ++ ++static void *sleep_start(struct seq_file *m, loff_t *pos) ++{ ++ struct task_struct *task; ++ ++ task = m->private; ++ return sinfo_start(&task->sinfo, SLEEP_TRACES_DEF, *pos); ++} ++ ++static void sleep_stop(struct seq_file *m, void *v) ++{ ++} ++ ++static void *sleep_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct task_struct *task; ++ ++ task = m->private; ++ return sinfo_next(&task->sinfo, v, pos); ++} ++ ++static int show_sleep(struct seq_file *m, void *v) ++{ ++ struct stack_trace *trace; ++ int i; ++ ++ trace = v; ++ if (trace->nr == 0) ++ return 0; ++ ++ seq_printf(m, "\n%u %llu %llu", ++ trace->nr, trace->total, trace->max); ++ for (i = 0; i < SLEEP_TRACE_DEPTH; ++ i) { ++ char *module; ++ const char *name; ++ char namebuf[128]; ++ unsigned long address; ++ unsigned long offset; ++ unsigned long size; ++ ++ address = (unsigned long) trace->frame[i]; ++ name = kallsyms_lookup(address, &size, ++ &offset, &module, namebuf); ++ seq_printf(m, "\n\t%i %#lx ", i, address); ++ if (name != NULL) ++ seq_printf(m, "%s+%#lx/%#lx", name, offset, size); ++ } ++ seq_printf(m, "\n"); ++ return 0; ++} ++ ++struct seq_operations proc_pid_sleep_op = { ++ .start = sleep_start, ++ .next = sleep_next, ++ .stop = sleep_stop, ++ .show = show_sleep ++}; ++ ++static int sleep_open(struct inode *inode, struct file *file) ++{ ++ struct task_struct *task = PROC_I(inode)->task; ++ int ret = seq_open(file, &proc_pid_sleep_op); ++ if (!ret) { ++ struct seq_file *m = file->private_data; ++ m->private = task; ++ } ++ return ret; ++} ++ ++struct file_operations proc_sleep_operations = { ++ .open = sleep_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++static void *global_sleep_start(struct seq_file *m, 
loff_t *pos) ++{ ++ return sinfo_start(&global_sinfo, GLOBAL_SLEEP_TRACES_DEF, *pos); ++} ++ ++static void *global_sleep_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ return sinfo_next(&global_sinfo, v, pos); ++} ++ ++ ++struct seq_operations global_sleep_op = { ++ .start = global_sleep_start, ++ .next = global_sleep_next, ++ .stop = sleep_stop, ++ .show = show_sleep ++}; ++ ++static int global_sleep_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &global_sleep_op); ++} ++ ++struct file_operations proc_global_sleep_operations = { ++ .open = global_sleep_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++ ++/* CONFIG_PROC_SLEEP */ ++#endif diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index 8040b68..709df80 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -29,3 +29,4 @@ raid5-merge-ios.patch raid5-serialize-ovelapping-reqs.patch jbd-stats-2.6.9.patch bitops_ext2_find_next_le_bit-2.6.patch +proc-sleep-2.6.9.patch