From 3a88442dfffc451b0c598a95d49896e6ba271304 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Sun, 13 Dec 2020 04:57:23 -0700 Subject: [PATCH] LU-13676 tools: find unique stacks from sysrq-t Enhance the crash-find-unique-traces.awk script to be more useful: - allow parsing output from sysrq-t - skip leading fields from input (e.g. timestamps, syslog header) - sort messages to avoid duplicates - handle some variance in kernel function names - print thread names as well as PIDs for all matching threads - print number of threads matching a particular stack trace - warn if parts of the stack traces were lost due to buffer overflow Test-Parameters: trivial Signed-off-by: Andreas Dilger Change-Id: Icfd70fbf255f413ca1aab0b94c62754e4c00628d Reviewed-on: https://review.whamcloud.com/40956 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Yang Sheng Reviewed-by: Oleg Drokin --- contrib/debug_tools/crash-find-unique-traces.awk | 340 +++++++++++++++++------ 1 file changed, 255 insertions(+), 85 deletions(-) diff --git a/contrib/debug_tools/crash-find-unique-traces.awk b/contrib/debug_tools/crash-find-unique-traces.awk index c77a597..b958b5c 100755 --- a/contrib/debug_tools/crash-find-unique-traces.awk +++ b/contrib/debug_tools/crash-find-unique-traces.awk @@ -1,129 +1,299 @@ #!/usr/bin/awk -f - +# parse stack traces and identify interesting threads, avoiding the common +# threads that are just sleeping and not doing anything interesting. +# +# works with stack traces from "crash foreach bt", and kernel stacks from +# "sysrq-t" or "echo t > /proc/sysrq-trigger" with timestamps/kernel: +# stripped via "sed -e 's/.*kernel: //'" +BEGIN { + unique_nr=0 +} function add_unique_chain(chain) { + if (chain=="") + return for (i = 0; i < unique_nr; i++) { if (unique[i] == chain) { + #print "appending["i"]: "command":"current_pid + unique_num[i]++ + unique_pids[i] = unique_pids[i] command":"current_pid" " return 0 } } - unique_pids[unique_nr] = unique_pids[unique_nr] current_pid " " + #print "adding["unique_nr"]: "command":"current_pid":: "chain + unique_num[unique_nr]++ + unique_pids[unique_nr] = unique_pids[unique_nr] command":"current_pid" " unique[unique_nr++] = chain } -function dump_chain() +function add_to_chain(tmp) { + # not currently processing a stack + if (collect_chain==0) + return + + # skip duplicate/common parts of every stack + if (tmp=="?") + return + if (index(tmp, "__cond_resched")) + return + if (index(tmp, "_cond_resched")) + tmp="schedule" + if (index(tmp, "do_syscall_64")) + return + if (index(tmp, "entry_SYSCALL")) + return + if (index(tmp, "ret_from_fork")) + return + if (index(tmp, "__schedule")) + return + if (index(tmp, "schedule_timeout")) + return + if (index(tmp, "schedule_hrtimeout_range")) + return + if (index(tmp, "system_call_fastpath")) + return + if (index(tmp, "SyS_select")) + return + if (index(tmp, "SyS_pselect")) + return + if (index(tmp, "sys_select")) + return + + # strip off function offset/length + sub("[+]0x[0-9a-f]+[/].*", "", tmp) + if (index(tmp, "kthread")) + return + + if (chain=="") + chain=tmp + else + chain=chain","tmp +} + +function dump_chain(why) { if (collect_chain) { - #print pl_cmdname,pl_pid + #printf "dumping(%s): %s\n", why, command_name add_unique_chain(chain) chain="" collect_chain=0 + skip_unknown=0 + } +} + +/Missed [0-9]* kernel messages/ { + if (collect_chain) { + incomplete=1 + dump_chain("incomplete") } } -/^PID:/ { - traces++ - dump_chain() +# > crash -s +# PID: 0 TASK: ffffffff82013480 CPU: 0 COMMAND: "swapper/0" +# #0 [ffffffff82003e28] __schedule at ffffffff81610f2e +# #1 [ffffffff82003ec8] schedule_idle at ffffffff8161181e +/PID: .*TASK: / { + dump_chain("PID") collect_chain=1 - current_pid=$2 + for (i = 1; i < 12; i++) { + if ($i == "PID:") { + start_offset=i-1 + break + } + } + current_pid=$(2+start_offset) + command=$(8+start_offset) + #print $0 + #printf "P: offset=%u, pid=%u, command=%s\n", + # start_offset, current_pid, command } +# #2 [ffffffff82003ed0] do_idle at ffffffff810cddaf +# #3 [ffffffff82003ef0] cpu_startup_entry at ffffffff810cdfef /#[0-9]* / { - tmp=$3 - #if (start_to_analyze==0) - # next - if (collect_chain==0) - next - if (index(tmp, "do_syscall_64")) - next - if (index(tmp, "ret_from_fork")) - next - if (index(tmp, "entry_SYSCALL")) - next - #sub("[+]0x[0-9a-f]+[/].*", "", tmp) - if (chain=="") - chain=tmp - else - chain=chain","tmp + add_to_chain($(3+start_offset)) +} + + +# SysRq : Show State +# task PC stack pid father +# mdt04_084 R running task 0 141145 2 0x00000080 +# Call Trace: +# [] __cond_resched+0x26/0x30 +# [] _cond_resched+0x3a/0x50 +# [] kmem_cache_alloc+0x35/0x1f0 +# [] LNetMDBind+0x7c/0x5e0 [lnet] +/ R running task / { + dump_chain("running") + collect_chain=1 + #skip_unknown=1 + for (i = 1; i < 12; i++) { + if ($i == "R") { + start_offset=i-2 + break + } + } + current_pid=$(6+start_offset) + command=$(1+start_offset) + #print $0 + #printf "%s: offset=%u, pid=%u, command=%s\n", $(2+start_offset), + # start_offset, current_pid, command +} + +# SysRq : Show State +# task PC stack pid father +# bash S ffff8e3295fdb150 0 227559 227404 0x00000080 +# worker D ffff8abbbfb1ac80 0 4090 1 0x00000000 +# Call Trace: +# [] schedule+0x29/0x70 +# [] do_wait+0x1f6/0x260 +# this regexp is x86_64-specific +/ [SD] ffff[0-9a-f]* / { + dump_chain("sleeping") + collect_chain=1 + for (i = 1; i < 12; i++) { + if ($i == "S" || $i == "D") { + start_offset=i-2 + break + } + } + current_pid=$(5+start_offset) + command=$(1+start_offset) + #print $0 + #printf "%s: offset=%u, pid=%s, command=%s\n", $(2+start_offset), + # start_offset, current_pid, command +} + +# mdt01_001: page allocation failure: order:4, mode:0x10c050 +# CPU: 1 PID: 9374 Comm: mdt01_001 Kdump: loaded Tainted: G +# Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 +# Call Trace: +# [] dump_stack+0x19/0x1b +# [] warn_alloc_failed+0x110/0x180 +/CPU: [0-9]* PID: [0-9]* Comm: / { + dump_chain("dump") + collect_chain=1 + for (i = 1; i < 8; i++) { + if ($i == "CPU:") + start_offset=i-1 + } + current_pid=$(4+start_offset) + command=$(6+start_offset) +} + +# [] ? ep_scan_ready_list.isra.7+0x1b9/0x1f0 +# [] schedule_hrtimeout_range+0x13/0x20 +# this regexp is x86_64-specific +/ \[\] / { + this_offset=2 + if (skip_unknown==1) { + if ($(start_offset+this_offset)=="?") + this_offset++ + #print "adding @"this_offset": "$(start_offset+this_offset) + } + add_to_chain($(start_offset+this_offset)) } /ret_from_fork/ { - dump_chain() + dump_chain("ret") } /entry_SYSCALL/ { - dump_chain() + dump_chain("syscall") } END { - not_interesting[0]="schedule,ldlm_bl_thread_main,kthread" - not_interesting[1]="schedule,ptlrpcd,kthread" - not_interesting[2]="schedule,mgc_requeue_thread,kthread" - not_interesting[3]="schedule,schedule_timeout,ksocknal_connd,kthread" - not_interesting[4]="schedule,ptlrpc_hr_main,kthread" - not_interesting[5]="schedule,ksocknal_scheduler" - not_interesting[6]="schedule,osp_precreate_thread" - not_interesting[7]="schedule,mgs_ir_notify,kthread" - not_interesting[8]="schedule,qmt_reba_thread,kthread" - not_interesting[9]="schedule,expired_lock_main,kthread" - not_interesting[10]="schedule,schedule_timeout,lnet_peer_discovery,kthread" - not_interesting[11]="schedule,ping_evictor_main,kthread" - not_interesting[12]="schedule,schedule_timeout,qsd_upd_thread,kthread" - not_interesting[13]="schedule,ptlrpc_wait_event,ptlrpc_main,kthread" - not_interesting[14]="schedule,lnet_sock_accept,lnet_acceptor,kthread" - not_interesting[15]="schedule,schedule_timeout,lnet_monitor_thread,kthread" - not_interesting[16]="schedule,ofd_inconsistency_verification_main,kthread" - not_interesting[17]="schedule,schedule_timeout,ksocknal_reaper,kthread" - not_interesting[18]="schedule,kjournald2,kthread" - not_interesting[19]="schedule,rescuer_thread,kthread" - not_interesting[20]="schedule,oom_reaper,kthread" - not_interesting[21]="schedule,worker_thread,kthread" - not_interesting[22]="schedule,kthread_worker_fn,kthread" - not_interesting[23]="schedule,osp_sync_process_queues,llog_process_thread,llog_process_or_fork,llog_cat_process_cb,llog_process_thread,llog_process_or_fork,llog_cat_process_or_fork,llog_cat_process,osp_sync_thread,kthread" - not_interesting[24]="schedule,smpboot_thread_fn,kthread" - not_interesting[25]="schedule,schedule_timeout,rcu_gp_kthread,kthread" - not_interesting[26]="schedule,rcu_gp_kthread,kthread" - not_interesting[27]="schedule,devtmpfsd,kthread" - not_interesting[28]="schedule,schedule_timeout,watchdog,kthread" - not_interesting[29]="schedule,oom_reaper" - not_interesting[30]="schedule,kcompactd,kthread" - not_interesting[31]="schedule,schedule_timeout,khugepaged,kthread" - not_interesting[32]="schedule,kswapd,kthread" - not_interesting[33]="schedule,schedule_hrtimeout_range_clock,do_sys_poll,__se_sys_poll" - not_interesting[34]="schedule,schedule_hrtimeout_range_clock,do_select,core_sys_select,kern_select,__x64_sys_select" - not_interesting[35]="schedule,schedule_hrtimeout_range_clock,__se_sys_rt_sigtimedwait" - not_interesting[36]="schedule,do_wait,kernel_wait4,__se_sys_wait4" - not_interesting[37]="schedule,schedule_timeout,wait_for_common,lnet_monitor_thread,kthread" - not_interesting[38]="schedule,schedule_hrtimeout_range_clock,ep_poll,do_epoll_wait,__x64_sys_epoll_wait" - not_interesting[39]="schedule,kthreadd" - not_interesting[40]="schedule,khvcd,kthread" - not_interesting[41]="schedule,schedule_timeout,ptlrpcd,kthread" - not_interesting[42]="schedule,schedule_timeout,mdt_coordinator,kthread" - not_interesting[43]="schedule,distribute_txn_commit_thread,kthread" - not_interesting[44]="schedule,lnet_acceptor,kthread" - not_interesting[45]="schedule,osp_send_update_thread,kthread" - not_interesting[46]="schedule,pipe_wait,pipe_read,__vfs_read,vfs_read,ksys_read" - not_interesting[47]="default_idle,do_idle,cpu_startup_entry,start_kernel,secondary_startup_64" - not_interesting[48]="default_idle,do_idle,cpu_startup_entry,secondary_startup_64" - not_interesting[49]="do_select,core_sys_select,kern_select,__x64_sys_select" - -#not_interesting[]="" -#not_interesting[]="" -#schedule,kthreadd + if (incomplete) + print "**** messages lost, stack traces may be incomplete ****" + i=0 + # this list should be kept sorted to avoid duplicates + # "!}sort" on next line in Vim + not_interesting[i++]="default_idle,do_idle,cpu_startup_entry,secondary_startup_64" + not_interesting[i++]="default_idle,do_idle,cpu_startup_entry,start_kernel,secondary_startup_64" + not_interesting[i++]="sched_show_task,show_state_filter,sysrq_handle_showstate,__handle_sysrq,write_sysrq_trigger,proc_reg_write,vfs_write,SyS_write" + not_interesting[i++]="schedule" + not_interesting[i++]="schedule,__se_sys_rt_sigtimedwait" + not_interesting[i++]="schedule,cfs_wi_scheduler" + not_interesting[i++]="schedule,devtmpfsd" + not_interesting[i++]="schedule,distribute_txn_commit_thread" + not_interesting[i++]="schedule,do_nanosleep,hrtimer_nanosleep,SyS_nanosleep" + not_interesting[i++]="schedule,do_select" + not_interesting[i++]="schedule,do_select,kern_select" + not_interesting[i++]="schedule,do_sys_poll,SyS_poll" + not_interesting[i++]="schedule,do_sys_poll,SyS_ppoll" + not_interesting[i++]="schedule,do_sys_poll,__se_sys_poll" + not_interesting[i++]="schedule,do_wait,SyS_wait4" + not_interesting[i++]="schedule,do_wait,kernel_wait4,__se_sys_wait4" + not_interesting[i++]="schedule,ep_poll,SyS_epoll_wait" + not_interesting[i++]="schedule,ep_poll,SyS_epoll_wait,SyS_epoll_pwait" + not_interesting[i++]="schedule,ep_poll,do_epoll_wait,__x64_sys_epoll_wait" + not_interesting[i++]="schedule,expired_lock_main" + not_interesting[i++]="schedule,futex_wait_queue_me,futex_wait,do_futex,SyS_futex" + not_interesting[i++]="schedule,ib_fmr_cleanup_thread" + not_interesting[i++]="schedule,ipmi_thread" + not_interesting[i++]="schedule,kauditd_thread" + not_interesting[i++]="schedule,kcompactd" + not_interesting[i++]="schedule,khugepaged" + not_interesting[i++]="schedule,khvcd" + not_interesting[i++]="schedule,kiblnd_connd" + not_interesting[i++]="schedule,kiblnd_scheduler" + not_interesting[i++]="schedule,kjournald2" + not_interesting[i++]="schedule,ksm_scan_thread" + not_interesting[i++]="schedule,ksocknal_connd" + not_interesting[i++]="schedule,ksocknal_reaper" + not_interesting[i++]="schedule,ksocknal_scheduler" + not_interesting[i++]="schedule,kswapd" + not_interesting[i++]="schedule,kthread_worker_fn" + not_interesting[i++]="schedule,kthreadd" + not_interesting[i++]="schedule,lcw_dispatch_main" + not_interesting[i++]="schedule,ldlm_bl_thread_main" + not_interesting[i++]="schedule,ll_agl_thread" + not_interesting[i++]="schedule,ll_statahead_thread" + not_interesting[i++]="schedule,lnet_acceptor" + not_interesting[i++]="schedule,lnet_monitor_thread" + not_interesting[i++]="schedule,lnet_peer_discovery" + not_interesting[i++]="schedule,lnet_sock_accept,lnet_acceptor" + not_interesting[i++]="schedule,mdt_coordinator" + not_interesting[i++]="schedule,mgc_requeue_thread" + not_interesting[i++]="schedule,mgs_ir_notify" + not_interesting[i++]="schedule,n_tty_read,tty_read,vfs_read,SyS_read" + not_interesting[i++]="schedule,ofd_inconsistency_verification_main" + not_interesting[i++]="schedule,oom_reaper" + not_interesting[i++]="schedule,osp_precreate_thread" + not_interesting[i++]="schedule,osp_send_update_thread" + not_interesting[i++]="schedule,osp_sync_process_queues,llog_process_thread,llog_process_or_fork,llog_cat_process_cb,llog_process_thread,llog_process_or_fork,llog_cat_process_or_fork,llog_cat_process,osp_sync_thread" + not_interesting[i++]="schedule,ping_evictor_main" + not_interesting[i++]="schedule,pipe_wait,pipe_read,__vfs_read,vfs_read,ksys_read" + not_interesting[i++]="schedule,poll_do_select" + not_interesting[i++]="schedule,ptlrpc_hr_main" + not_interesting[i++]="schedule,ptlrpc_wait_event,ptlrpc_main" + not_interesting[i++]="schedule,ptlrpcd" + not_interesting[i++]="schedule,qmt_reba_thread" + not_interesting[i++]="schedule,qsd_upd_thread" + not_interesting[i++]="schedule,rcu_gp_kthread" + not_interesting[i++]="schedule,rescuer_thread" + not_interesting[i++]="schedule,sched_show_task,show_state_filter,sysrq_handle_showstate,__handle_sysrq,write_sysrq_trigger,proc_reg_write,vfs_write,SyS_write" + not_interesting[i++]="schedule,scsi_error_handler" + not_interesting[i++]="schedule,smpboot_thread_fn" + not_interesting[i++]="schedule,sys_pause" + not_interesting[i++]="schedule,wait_for_common,lnet_monitor_thread" + not_interesting[i++]="schedule,watchdog" + not_interesting[i++]="schedule,worker_thread" for (i = 0; i < unique_nr; i++) { dump=1 for (j in not_interesting) { - if (index(unique[i], not_interesting[j])) { - dump=0 - break - } +# if (index(unique[i], not_interesting[j])) { +# dump=0 +# break +# } if (unique[i]==not_interesting[j]) { dump=0 break } } if (dump) - print unique[i],"PIDs:",unique_pids[i] + printf("%s\n\tPIDs(%d): %s\n\n", + unique[i],unique_num[i],unique_pids[i]) } } -- 1.8.3.1