From 744f5f8e6d8b902953431b0a39542b86c09275b1 Mon Sep 17 00:00:00 2001 From: Vitaly Fertman Date: Thu, 11 Feb 2010 12:39:25 -0800 Subject: [PATCH] b=15977 analyse only consistent part of the log due to debug log buffer overflow, old messages are lost; as each CPU has its own buffers which overflows at different time, it may happen that FREE is lost for still existing ALLOC; make leak_finder to analyze the log since the oldest overflow time. i=rread i=green --- libcfs/include/libcfs/libcfs_debug.h | 13 +++++++++- libcfs/libcfs/darwin/darwin-tracefile.c | 3 ++- libcfs/libcfs/linux/linux-tracefile.c | 3 ++- libcfs/libcfs/linux/linux-tracefile.h | 10 -------- libcfs/libcfs/tracefile.c | 13 +++------- libcfs/libcfs/winnt/winnt-tracefile.c | 1 + lnet/utils/debug.c | 9 ++++--- lustre/tests/conf-sanity.sh | 2 +- lustre/tests/leak_finder.pl | 43 +++++++++++++++++++++++++++++++-- 9 files changed, 69 insertions(+), 28 deletions(-) diff --git a/libcfs/include/libcfs/libcfs_debug.h b/libcfs/include/libcfs/libcfs_debug.h index 1cc8844..5380c61 100644 --- a/libcfs/include/libcfs/libcfs_debug.h +++ b/libcfs/include/libcfs/libcfs_debug.h @@ -65,6 +65,16 @@ extern unsigned int libcfs_catastrophe; extern unsigned int libcfs_panic_on_lbug; /** + * three types of trace_data in linux + */ +typedef enum { + CFS_TCD_TYPE_PROC = 0, + CFS_TCD_TYPE_SOFTIRQ, + CFS_TCD_TYPE_IRQ, + CFS_TCD_TYPE_MAX +} cfs_trace_buf_type_t; + +/** * Format for debug message headers */ struct ptldebug_header { @@ -72,7 +82,8 @@ struct ptldebug_header { __u32 ph_flags; __u32 ph_subsys; __u32 ph_mask; - __u32 ph_cpu_id; + __u16 ph_cpu_id; + __u16 ph_type; __u32 ph_sec; __u64 ph_usec; __u32 ph_stack; diff --git a/libcfs/libcfs/darwin/darwin-tracefile.c b/libcfs/libcfs/darwin/darwin-tracefile.c index 74eb1e3..7ab943f 100644 --- a/libcfs/libcfs/darwin/darwin-tracefile.c +++ b/libcfs/libcfs/darwin/darwin-tracefile.c @@ -173,7 +173,8 @@ set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, do_gettimeofday(&tv); header->ph_subsys = subsys; header->ph_mask = mask; - header->ph_cpu_id = smp_processor_id(); + header->ph_cpu_id = cfs_smp_processor_id(); + header->ph_type = 0; header->ph_sec = (__u32)tv.tv_sec; header->ph_usec = tv.tv_usec; header->ph_stack = stack; diff --git a/libcfs/libcfs/linux/linux-tracefile.c b/libcfs/libcfs/linux/linux-tracefile.c index fd6e6b9..3028269 100644 --- a/libcfs/libcfs/linux/linux-tracefile.c +++ b/libcfs/libcfs/linux/linux-tracefile.c @@ -190,7 +190,8 @@ cfs_set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, header->ph_subsys = subsys; header->ph_mask = mask; - header->ph_cpu_id = smp_processor_id(); + header->ph_cpu_id = cfs_smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); header->ph_sec = (__u32)tv.tv_sec; header->ph_usec = tv.tv_usec; header->ph_stack = stack; diff --git a/libcfs/libcfs/linux/linux-tracefile.h b/libcfs/libcfs/linux/linux-tracefile.h index b910074..8ad632c 100644 --- a/libcfs/libcfs/linux/linux-tracefile.h +++ b/libcfs/libcfs/linux/linux-tracefile.h @@ -37,14 +37,4 @@ #ifndef __LIBCFS_LINUX_TRACEFILE_H__ #define __LIBCFS_LINUX_TRACEFILE_H__ -/** - * three types of trace_data in linux - */ -typedef enum { - CFS_TCD_TYPE_PROC = 0, - CFS_TCD_TYPE_SOFTIRQ, - CFS_TCD_TYPE_IRQ, - CFS_TCD_TYPE_MAX -} cfs_trace_buf_type_t; - #endif diff --git a/libcfs/libcfs/tracefile.c b/libcfs/libcfs/tracefile.c index 5a29dfa..63a23ef 100644 --- a/libcfs/libcfs/tracefile.c +++ b/libcfs/libcfs/tracefile.c @@ -245,7 +245,7 @@ int libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, int subsys, int mask, const char *format2, ...) { struct cfs_trace_cpu_data *tcd = NULL; - struct ptldebug_header header; + struct ptldebug_header header = {0}; struct cfs_trace_page *tage; /* string_buf is used only if tcd != NULL, and is always set then */ char *string_buf = NULL; @@ -268,6 +268,9 @@ int libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, int subsys, int mask, if (tcd == NULL) /* arch may not log in IRQ context */ goto console; + if (tcd->tcd_cur_pages == 0) + header.ph_flags |= PH_FLAG_FIRST_RECORD; + if (tcd->tcd_shutting_down) { cfs_trace_put_tcd(tcd); tcd = NULL; @@ -961,7 +964,6 @@ static int tracefiled(void *arg) struct tracefiled_ctl *tctl = arg; struct cfs_trace_page *tage; struct cfs_trace_page *tmp; - struct ptldebug_header *hdr; cfs_file_t *filp; int last_loop = 0; int rc; @@ -1002,13 +1004,6 @@ static int tracefiled(void *arg) CFS_MMSPACE_OPEN; - /* mark the first header, so we can sort in chunks */ - tage = cfs_tage_from_list(pc.pc_pages.next); - __LASSERT_TAGE_INVARIANT(tage); - - hdr = cfs_page_address(tage->page); - hdr->ph_flags |= PH_FLAG_FIRST_RECORD; - cfs_list_for_each_entry_safe_typed(tage, tmp, &pc.pc_pages, struct cfs_trace_page, linkage) { diff --git a/libcfs/libcfs/winnt/winnt-tracefile.c b/libcfs/libcfs/winnt/winnt-tracefile.c index 6e63f49..6a99d7b 100644 --- a/libcfs/libcfs/winnt/winnt-tracefile.c +++ b/libcfs/libcfs/winnt/winnt-tracefile.c @@ -176,6 +176,7 @@ cfs_set_ptldebug_header(struct ptldebug_header *header, int subsys, int mask, header->ph_subsys = subsys; header->ph_mask = mask; header->ph_cpu_id = cfs_smp_processor_id(); + header->ph_type = cfs_trace_buf_idx_get(); header->ph_sec = (__u32)tv.tv_sec; header->ph_usec = tv.tv_usec; header->ph_stack = stack; diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 91200c6..ca17a3d 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -367,8 +367,10 @@ static void print_rec(struct dbg_line ***linevp, int used, int fdout) int bytes; ssize_t bytes_written; - bytes = sprintf(out, "%08x:%08x:%u:%u.%06llu:%u:%u:%u:(%s:%u:%s()) %s", - hdr->ph_subsys, hdr->ph_mask, hdr->ph_cpu_id, + bytes = sprintf(out, "%08x:%08x:%u.%u%s:%u.%06llu:%u:%u:%u:(%s:%u:%s()) %s", + hdr->ph_subsys, hdr->ph_mask, + hdr->ph_cpu_id, hdr->ph_type, + hdr->ph_flags & PH_FLAG_FIRST_RECORD ? "F" : "", hdr->ph_sec, (unsigned long long)hdr->ph_usec, hdr->ph_stack, hdr->ph_pid, hdr->ph_extern_pid, line->file, hdr->ph_line_num, line->fn, line->text); @@ -415,6 +417,7 @@ static void dump_hdr(unsigned long long offset, struct ptldebug_header *hdr) fprintf(stderr, " subsystem = %x\n", hdr->ph_subsys); fprintf(stderr, " mask = %x\n", hdr->ph_mask); fprintf(stderr, " cpu_id = %u\n", hdr->ph_cpu_id); + fprintf(stderr, " type = %u\n", hdr->ph_type); fprintf(stderr, " seconds = %u\n", hdr->ph_sec); fprintf(stderr, " microseconds = %lu\n", (long)hdr->ph_usec); fprintf(stderr, " stack = %u\n", hdr->ph_stack); @@ -454,7 +457,7 @@ static int parse_buffer(int fdin, int fdout) goto readhdr; if (hdr->ph_len > 4094 || /* is this header bogus? */ - hdr->ph_cpu_id > 65536 || + hdr->ph_type >= CFS_TCD_TYPE_MAX || hdr->ph_stack > 65536 || hdr->ph_sec < (1 << 30) || hdr->ph_usec > 1000000000 || diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index a4badd6..199c73c9 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -13,7 +13,7 @@ ONLY=${ONLY:-"$*"} # bug number for skipped test: # 15977 -ALWAYS_EXCEPT=" 39 $CONF_SANITY_EXCEPT" +ALWAYS_EXCEPT="$CONF_SANITY_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! SRCDIR=`dirname $0` diff --git a/lustre/tests/leak_finder.pl b/lustre/tests/leak_finder.pl index 45d2211..a7c7abb 100644 --- a/lustre/tests/leak_finder.pl +++ b/lustre/tests/leak_finder.pl @@ -11,10 +11,48 @@ my $debug_line = 0; my $total = 0; my $max = 0; -while ($line = <>) { +my @parsed; +my %cpu; +my $start_time = 0; + +if (!defined($ARGV[0])) { + print "No log file specified\n"; + exit +} + +open(INFILE, $ARGV[0]); +while ($line = ) { + if ($line =~ m/^(.*)\((.*):(\d+):(.*)\(\)\)/) { + @parsed = split(":", $1); + if (substr ($parsed[2], -1, 1) eq "F") { + chop $parsed[2]; + $cpu{$parsed[2]} = 0; + } else { + if (!defined($cpu{$parsed[2]})) { + $cpu{$parsed[2]} = $parsed[3]; + } + } + } +} + +foreach $time (values %cpu) { + if ($start_time < $time) { + $start_time = $time; + } +} + +print "Starting analysis since $start_time\n"; + +seek(INFILE, 0, 0); +while ($line = ) { $debug_line++; my ($file, $func, $lno, $name, $size, $addr, $type); - if ($line =~ m/^.*(\.).*\((.*):(\d+):(.*)\(\)\) (k|v|slab-)(.*) '(.*)': (\d+) at ([\da-f]+)/){ + if ($line =~ m/^(.*)\((.*):(\d+):(.*)\(\)\) (k|v|slab-)(.*) '(.*)': (\d+) at ([\da-f]+)/){ + @parsed = split(":", $1); + if ($parsed[3] <= $start_time) { + next; + } + $file = $2; $lno = $3; $func = $4; @@ -69,6 +107,7 @@ while ($line = <>) { $total -= $size; } } +close(INFILE); # Sort leak output by allocation time my @sorted = sort { -- 1.8.3.1