Whamcloud - gitweb
LU-13676 tools: find unique stacks from sysrq-t 56/40956/5
authorAndreas Dilger <adilger@whamcloud.com>
Sun, 13 Dec 2020 11:57:23 +0000 (04:57 -0700)
committerOleg Drokin <green@whamcloud.com>
Mon, 18 Jan 2021 13:54:03 +0000 (13:54 +0000)
Enhance the crash-find-unique-traces.awk script to be more useful:
- allow parsing output from sysrq-t
- skip leading fields from input (e.g. timestamps, syslog header)
- sort messages to avoid duplicates
- handle some variance in kernel function names
- print thread names as well as PIDs for all matching threads
- print number of threads matching a particular stack trace
- warn if parts of the stack traces were lost due to buffer overflow

Test-Parameters: trivial
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: Icfd70fbf255f413ca1aab0b94c62754e4c00628d
Reviewed-on: https://review.whamcloud.com/40956
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Reviewed-by: Yang Sheng <ys@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
contrib/debug_tools/crash-find-unique-traces.awk

index c77a597..b958b5c 100755 (executable)
 #!/usr/bin/awk -f
 #!/usr/bin/awk -f
-
+# parse stack traces and identify interesting threads, avoiding the common
+# threads that are just sleeping and not doing anything interesting.
+#
+# works with stack traces from "crash foreach bt", and kernel stacks from
+# "sysrq-t" or "echo t > /proc/sysrq-trigger" with timestamps/kernel:
+# stripped via "sed -e 's/.*kernel: //'"
+BEGIN {
+       unique_nr=0
+}
 function add_unique_chain(chain) {
 function add_unique_chain(chain) {
+       if (chain=="")
+               return
        for (i = 0; i < unique_nr; i++) {
                if (unique[i] == chain) {
        for (i = 0; i < unique_nr; i++) {
                if (unique[i] == chain) {
+                       #print "appending["i"]: "command":"current_pid
+                       unique_num[i]++
+                       unique_pids[i] = unique_pids[i] command":"current_pid" "
                        return 0
                }
        }
                        return 0
                }
        }
-       unique_pids[unique_nr] = unique_pids[unique_nr] current_pid " "
+       #print "adding["unique_nr"]: "command":"current_pid":: "chain
+       unique_num[unique_nr]++
+       unique_pids[unique_nr] = unique_pids[unique_nr] command":"current_pid" "
        unique[unique_nr++] = chain
 }
 
        unique[unique_nr++] = chain
 }
 
-function dump_chain()
+function add_to_chain(tmp) {
+       # not currently processing a stack
+       if (collect_chain==0)
+               return
+
+       # skip duplicate/common parts of every stack
+       if (tmp=="?")
+               return
+       if (index(tmp, "__cond_resched"))
+               return
+       if (index(tmp, "_cond_resched"))
+               tmp="schedule"
+       if (index(tmp, "do_syscall_64"))
+               return
+       if (index(tmp, "entry_SYSCALL"))
+               return
+       if (index(tmp, "ret_from_fork"))
+               return
+       if (index(tmp, "__schedule"))
+               return
+       if (index(tmp, "schedule_timeout"))
+               return
+       if (index(tmp, "schedule_hrtimeout_range"))
+               return
+       if (index(tmp, "system_call_fastpath"))
+               return
+       if (index(tmp, "SyS_select"))
+               return
+       if (index(tmp, "SyS_pselect"))
+               return
+       if (index(tmp, "sys_select"))
+               return
+
+       # strip off function offset/length
+       sub("[+]0x[0-9a-f]+[/].*", "", tmp)
+       if (index(tmp, "kthread"))
+               return
+
+       if (chain=="")
+               chain=tmp
+       else
+               chain=chain","tmp
+}
+
+function dump_chain(why)
 {
        if (collect_chain) {
 {
        if (collect_chain) {
-               #print pl_cmdname,pl_pid
+               #printf "dumping(%s): %s\n", why, command_name
                add_unique_chain(chain)
                chain=""
                collect_chain=0
                add_unique_chain(chain)
                chain=""
                collect_chain=0
+               skip_unknown=0
+       }
+}
+
+/Missed [0-9]* kernel messages/ {
+       if (collect_chain) {
+               incomplete=1
+               dump_chain("incomplete")
        }
 }
 
        }
 }
 
-/^PID:/ {
-       traces++
-       dump_chain()
+# > crash -s
+# PID: 0      TASK: ffffffff82013480  CPU: 0   COMMAND: "swapper/0"
+#  #0 [ffffffff82003e28] __schedule at ffffffff81610f2e
+#  #1 [ffffffff82003ec8] schedule_idle at ffffffff8161181e
+/PID: .*TASK: / {
+       dump_chain("PID")
        collect_chain=1
        collect_chain=1
-       current_pid=$2
+       for (i = 1; i < 12; i++) {
+               if ($i == "PID:") {
+                       start_offset=i-1
+                       break
+               }
+       }
+       current_pid=$(2+start_offset)
+       command=$(8+start_offset)
+       #print $0
+       #printf "P: offset=%u, pid=%u, command=%s\n",
+       #       start_offset, current_pid, command
 }
 
 }
 
+#  #2 [ffffffff82003ed0] do_idle at ffffffff810cddaf
+#  #3 [ffffffff82003ef0] cpu_startup_entry at ffffffff810cdfef
 /#[0-9]* / {
 /#[0-9]* / {
-       tmp=$3
-       #if (start_to_analyze==0)
-       #       next
-       if (collect_chain==0)
-               next
-       if (index(tmp, "do_syscall_64"))
-               next
-       if (index(tmp, "ret_from_fork"))
-               next
-       if (index(tmp, "entry_SYSCALL"))
-               next
-       #sub("[+]0x[0-9a-f]+[/].*", "", tmp)
-       if (chain=="")
-               chain=tmp
-       else
-               chain=chain","tmp
+       add_to_chain($(3+start_offset))
+}
+
+
+# SysRq : Show State
+#   task                        PC stack   pid father
+# mdt04_084       R  running task        0 141145      2 0x00000080
+# Call Trace:
+#  [<ffffffffbc0d66a6>] __cond_resched+0x26/0x30
+#  [<ffffffffbc77f4ca>] _cond_resched+0x3a/0x50
+#  [<ffffffffbc223495>] kmem_cache_alloc+0x35/0x1f0
+#  [<ffffffffc0dc363c>] LNetMDBind+0x7c/0x5e0 [lnet]
+/ R  running task / {
+       dump_chain("running")
+       collect_chain=1
+       #skip_unknown=1
+       for (i = 1; i < 12; i++) {
+               if ($i == "R") {
+                       start_offset=i-2
+                       break
+               }
+       }
+       current_pid=$(6+start_offset)
+       command=$(1+start_offset)
+       #print $0
+       #printf "%s: offset=%u, pid=%u, command=%s\n", $(2+start_offset),
+       #       start_offset, current_pid, command
+}
+
+# SysRq : Show State
+#   task                        PC stack   pid father
+# bash            S ffff8e3295fdb150     0 227559 227404 0x00000080
+# worker          D ffff8abbbfb1ac80     0  4090      1 0x00000000
+# Call Trace:
+#  [<ffffffffbc77f229>] schedule+0x29/0x70
+#  [<ffffffffbc0a07a6>] do_wait+0x1f6/0x260
+# this regexp is x86_64-specific
+/ [SD] ffff[0-9a-f]* / {
+       dump_chain("sleeping")
+       collect_chain=1
+       for (i = 1; i < 12; i++) {
+               if ($i == "S" || $i == "D") {
+                       start_offset=i-2
+                       break
+               }
+       }
+       current_pid=$(5+start_offset)
+       command=$(1+start_offset)
+       #print $0
+       #printf "%s: offset=%u, pid=%s, command=%s\n", $(2+start_offset),
+       #       start_offset, current_pid, command
+}
+
+# mdt01_001: page allocation failure: order:4, mode:0x10c050
+# CPU: 1 PID: 9374 Comm: mdt01_001 Kdump: loaded Tainted: G
+# Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+# Call Trace:
+#  [<ffffffffbc563021>] dump_stack+0x19/0x1b
+#  [<ffffffffbbfbcbf0>] warn_alloc_failed+0x110/0x180
+/CPU: [0-9]* PID: [0-9]* Comm: / {
+       dump_chain("dump")
+       collect_chain=1
+       for (i = 1; i < 8; i++) {
+               if ($i == "CPU:")
+                       start_offset=i-1
+       }
+       current_pid=$(4+start_offset)
+       command=$(6+start_offset)
+}
+
+#  [<ffffffffbc296ba9>] ? ep_scan_ready_list.isra.7+0x1b9/0x1f0
+#  [<ffffffffbc77e363>] schedule_hrtimeout_range+0x13/0x20
+# this regexp is x86_64-specific
+/ \[<ffff[0-9a-f]*>\] / {
+       this_offset=2
+       if (skip_unknown==1) {
+               if ($(start_offset+this_offset)=="?")
+                       this_offset++
+               #print "adding @"this_offset": "$(start_offset+this_offset)
+       }
+       add_to_chain($(start_offset+this_offset))
 }
 
 /ret_from_fork/ {
 }
 
 /ret_from_fork/ {
-       dump_chain()
+       dump_chain("ret")
 }
 
 /entry_SYSCALL/ {
 }
 
 /entry_SYSCALL/ {
-       dump_chain()
+       dump_chain("syscall")
 }
 
 END {
 }
 
 END {
-       not_interesting[0]="schedule,ldlm_bl_thread_main,kthread"
-       not_interesting[1]="schedule,ptlrpcd,kthread"
-       not_interesting[2]="schedule,mgc_requeue_thread,kthread"
-       not_interesting[3]="schedule,schedule_timeout,ksocknal_connd,kthread"
-       not_interesting[4]="schedule,ptlrpc_hr_main,kthread"
-       not_interesting[5]="schedule,ksocknal_scheduler"
-       not_interesting[6]="schedule,osp_precreate_thread"
-       not_interesting[7]="schedule,mgs_ir_notify,kthread"
-       not_interesting[8]="schedule,qmt_reba_thread,kthread"
-       not_interesting[9]="schedule,expired_lock_main,kthread"
-       not_interesting[10]="schedule,schedule_timeout,lnet_peer_discovery,kthread"
-       not_interesting[11]="schedule,ping_evictor_main,kthread"
-       not_interesting[12]="schedule,schedule_timeout,qsd_upd_thread,kthread"
-       not_interesting[13]="schedule,ptlrpc_wait_event,ptlrpc_main,kthread"
-       not_interesting[14]="schedule,lnet_sock_accept,lnet_acceptor,kthread"
-       not_interesting[15]="schedule,schedule_timeout,lnet_monitor_thread,kthread"
-       not_interesting[16]="schedule,ofd_inconsistency_verification_main,kthread"
-       not_interesting[17]="schedule,schedule_timeout,ksocknal_reaper,kthread"
-       not_interesting[18]="schedule,kjournald2,kthread"
-       not_interesting[19]="schedule,rescuer_thread,kthread"
-       not_interesting[20]="schedule,oom_reaper,kthread"
-       not_interesting[21]="schedule,worker_thread,kthread"
-       not_interesting[22]="schedule,kthread_worker_fn,kthread"
-       not_interesting[23]="schedule,osp_sync_process_queues,llog_process_thread,llog_process_or_fork,llog_cat_process_cb,llog_process_thread,llog_process_or_fork,llog_cat_process_or_fork,llog_cat_process,osp_sync_thread,kthread"
-       not_interesting[24]="schedule,smpboot_thread_fn,kthread"
-       not_interesting[25]="schedule,schedule_timeout,rcu_gp_kthread,kthread"
-       not_interesting[26]="schedule,rcu_gp_kthread,kthread"
-       not_interesting[27]="schedule,devtmpfsd,kthread"
-       not_interesting[28]="schedule,schedule_timeout,watchdog,kthread"
-       not_interesting[29]="schedule,oom_reaper"
-       not_interesting[30]="schedule,kcompactd,kthread"
-       not_interesting[31]="schedule,schedule_timeout,khugepaged,kthread"
-       not_interesting[32]="schedule,kswapd,kthread"
-       not_interesting[33]="schedule,schedule_hrtimeout_range_clock,do_sys_poll,__se_sys_poll"
-       not_interesting[34]="schedule,schedule_hrtimeout_range_clock,do_select,core_sys_select,kern_select,__x64_sys_select"
-       not_interesting[35]="schedule,schedule_hrtimeout_range_clock,__se_sys_rt_sigtimedwait"
-       not_interesting[36]="schedule,do_wait,kernel_wait4,__se_sys_wait4"
-       not_interesting[37]="schedule,schedule_timeout,wait_for_common,lnet_monitor_thread,kthread"
-       not_interesting[38]="schedule,schedule_hrtimeout_range_clock,ep_poll,do_epoll_wait,__x64_sys_epoll_wait"
-       not_interesting[39]="schedule,kthreadd"
-       not_interesting[40]="schedule,khvcd,kthread"
-       not_interesting[41]="schedule,schedule_timeout,ptlrpcd,kthread"
-       not_interesting[42]="schedule,schedule_timeout,mdt_coordinator,kthread"
-       not_interesting[43]="schedule,distribute_txn_commit_thread,kthread"
-       not_interesting[44]="schedule,lnet_acceptor,kthread"
-       not_interesting[45]="schedule,osp_send_update_thread,kthread"
-       not_interesting[46]="schedule,pipe_wait,pipe_read,__vfs_read,vfs_read,ksys_read"
-       not_interesting[47]="default_idle,do_idle,cpu_startup_entry,start_kernel,secondary_startup_64"
-       not_interesting[48]="default_idle,do_idle,cpu_startup_entry,secondary_startup_64"
-       not_interesting[49]="do_select,core_sys_select,kern_select,__x64_sys_select"
-
-#not_interesting[]=""
-#not_interesting[]=""
-#schedule,kthreadd
+       if (incomplete)
+               print "**** messages lost, stack traces may be incomplete ****"
+       i=0
+       # this list should be kept sorted to avoid duplicates
+       # "!}sort" on next line in Vim
+       not_interesting[i++]="default_idle,do_idle,cpu_startup_entry,secondary_startup_64"
+       not_interesting[i++]="default_idle,do_idle,cpu_startup_entry,start_kernel,secondary_startup_64"
+       not_interesting[i++]="sched_show_task,show_state_filter,sysrq_handle_showstate,__handle_sysrq,write_sysrq_trigger,proc_reg_write,vfs_write,SyS_write"
+       not_interesting[i++]="schedule"
+       not_interesting[i++]="schedule,__se_sys_rt_sigtimedwait"
+       not_interesting[i++]="schedule,cfs_wi_scheduler"
+       not_interesting[i++]="schedule,devtmpfsd"
+       not_interesting[i++]="schedule,distribute_txn_commit_thread"
+       not_interesting[i++]="schedule,do_nanosleep,hrtimer_nanosleep,SyS_nanosleep"
+       not_interesting[i++]="schedule,do_select"
+       not_interesting[i++]="schedule,do_select,kern_select"
+       not_interesting[i++]="schedule,do_sys_poll,SyS_poll"
+       not_interesting[i++]="schedule,do_sys_poll,SyS_ppoll"
+       not_interesting[i++]="schedule,do_sys_poll,__se_sys_poll"
+       not_interesting[i++]="schedule,do_wait,SyS_wait4"
+       not_interesting[i++]="schedule,do_wait,kernel_wait4,__se_sys_wait4"
+       not_interesting[i++]="schedule,ep_poll,SyS_epoll_wait"
+       not_interesting[i++]="schedule,ep_poll,SyS_epoll_wait,SyS_epoll_pwait"
+       not_interesting[i++]="schedule,ep_poll,do_epoll_wait,__x64_sys_epoll_wait"
+       not_interesting[i++]="schedule,expired_lock_main"
+       not_interesting[i++]="schedule,futex_wait_queue_me,futex_wait,do_futex,SyS_futex"
+       not_interesting[i++]="schedule,ib_fmr_cleanup_thread"
+       not_interesting[i++]="schedule,ipmi_thread"
+       not_interesting[i++]="schedule,kauditd_thread"
+       not_interesting[i++]="schedule,kcompactd"
+       not_interesting[i++]="schedule,khugepaged"
+       not_interesting[i++]="schedule,khvcd"
+       not_interesting[i++]="schedule,kiblnd_connd"
+       not_interesting[i++]="schedule,kiblnd_scheduler"
+       not_interesting[i++]="schedule,kjournald2"
+       not_interesting[i++]="schedule,ksm_scan_thread"
+       not_interesting[i++]="schedule,ksocknal_connd"
+       not_interesting[i++]="schedule,ksocknal_reaper"
+       not_interesting[i++]="schedule,ksocknal_scheduler"
+       not_interesting[i++]="schedule,kswapd"
+       not_interesting[i++]="schedule,kthread_worker_fn"
+       not_interesting[i++]="schedule,kthreadd"
+       not_interesting[i++]="schedule,lcw_dispatch_main"
+       not_interesting[i++]="schedule,ldlm_bl_thread_main"
+       not_interesting[i++]="schedule,ll_agl_thread"
+       not_interesting[i++]="schedule,ll_statahead_thread"
+       not_interesting[i++]="schedule,lnet_acceptor"
+       not_interesting[i++]="schedule,lnet_monitor_thread"
+       not_interesting[i++]="schedule,lnet_peer_discovery"
+       not_interesting[i++]="schedule,lnet_sock_accept,lnet_acceptor"
+       not_interesting[i++]="schedule,mdt_coordinator"
+       not_interesting[i++]="schedule,mgc_requeue_thread"
+       not_interesting[i++]="schedule,mgs_ir_notify"
+       not_interesting[i++]="schedule,n_tty_read,tty_read,vfs_read,SyS_read"
+       not_interesting[i++]="schedule,ofd_inconsistency_verification_main"
+       not_interesting[i++]="schedule,oom_reaper"
+       not_interesting[i++]="schedule,osp_precreate_thread"
+       not_interesting[i++]="schedule,osp_send_update_thread"
+       not_interesting[i++]="schedule,osp_sync_process_queues,llog_process_thread,llog_process_or_fork,llog_cat_process_cb,llog_process_thread,llog_process_or_fork,llog_cat_process_or_fork,llog_cat_process,osp_sync_thread"
+       not_interesting[i++]="schedule,ping_evictor_main"
+       not_interesting[i++]="schedule,pipe_wait,pipe_read,__vfs_read,vfs_read,ksys_read"
+       not_interesting[i++]="schedule,poll_do_select"
+       not_interesting[i++]="schedule,ptlrpc_hr_main"
+       not_interesting[i++]="schedule,ptlrpc_wait_event,ptlrpc_main"
+       not_interesting[i++]="schedule,ptlrpcd"
+       not_interesting[i++]="schedule,qmt_reba_thread"
+       not_interesting[i++]="schedule,qsd_upd_thread"
+       not_interesting[i++]="schedule,rcu_gp_kthread"
+       not_interesting[i++]="schedule,rescuer_thread"
+       not_interesting[i++]="schedule,sched_show_task,show_state_filter,sysrq_handle_showstate,__handle_sysrq,write_sysrq_trigger,proc_reg_write,vfs_write,SyS_write"
+       not_interesting[i++]="schedule,scsi_error_handler"
+       not_interesting[i++]="schedule,smpboot_thread_fn"
+       not_interesting[i++]="schedule,sys_pause"
+       not_interesting[i++]="schedule,wait_for_common,lnet_monitor_thread"
+       not_interesting[i++]="schedule,watchdog"
+       not_interesting[i++]="schedule,worker_thread"
 
        for (i = 0; i < unique_nr; i++) {
                dump=1
                for (j in not_interesting) {
 
        for (i = 0; i < unique_nr; i++) {
                dump=1
                for (j in not_interesting) {
-                       if (index(unique[i], not_interesting[j])) {
-                               dump=0
-                               break
-                       }
+#                      if (index(unique[i], not_interesting[j])) {
+#                              dump=0
+#                              break
+#                      }
                        if (unique[i]==not_interesting[j]) {
                                dump=0
                                break
                        }
                }
                if (dump)
                        if (unique[i]==not_interesting[j]) {
                                dump=0
                                break
                        }
                }
                if (dump)
-                       print unique[i],"PIDs:",unique_pids[i]
+                       printf("%s\n\tPIDs(%d): %s\n\n",
+                              unique[i],unique_num[i],unique_pids[i])
        }
 
 }
        }
 
 }