From 848738a85d82bb71cd91304c60a6a298d854157d Mon Sep 17 00:00:00 2001
From: Mr NeilBrown
Date: Thu, 11 Feb 2021 09:49:54 +1100
Subject: [PATCH] LU-14428 libcfs: separate daemon_list from cfs_trace_data

cfs_trace_data provides a fifo for trace messages. To minimize locking,
there is a separate fifo for each CPU, and even for different interrupt
levels per-cpu.

When a page is removed from the fifo to be written to a file, the page
is added to a "daemon_list". Trace messages on the daemon_list have
already been logged to a file, but can be easily dumped to the console
when a bug occurs.

The daemon_list is always accessed from a single thread at a time, so
the per-CPU facilities of cfs_trace_data are not needed. However,
daemon_list is currently managed per-cpu as part of cfs_trace_data.

This patch moves the daemon_list of pages out to a separate structure -
a simple linked list, protected by cfs_tracefile_sem. Rather than using
a 'cfs_trace_page' to hold linkage information and content size, we use
page->lru for linkage and page->private for the size of the content in
each page.

This is a step towards replacing cfs_trace_data with the Linux
ring_buffer, which provides similar functionality with even less
locking.

In the current code, if the daemon which writes trace data to a file
cannot keep up with the load, excess pages are moved to the daemon_list
temporarily before being discarded. With this patch, these pages are
simply discarded immediately. If the daemon thread cannot keep up, that
is a configuration problem, and temporarily preserving a few pages is
unlikely to really help.

Signed-off-by: Mr NeilBrown
Change-Id: Ie894f7751cadacb515215f18182163ea5d26e969
Reviewed-on: https://review.whamcloud.com/41493
Reviewed-by: James Simmons
Reviewed-by: Andreas Dilger
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Oleg Drokin
---
 contrib/debug_tools/epython_scripts/dk.py |   1 -
 libcfs/libcfs/tracefile.c                 | 231 ++++++++++++++++--------------
 libcfs/libcfs/tracefile.h                 |  23 +--
 3 files changed, 124 insertions(+), 131 deletions(-)

diff --git a/contrib/debug_tools/epython_scripts/dk.py b/contrib/debug_tools/epython_scripts/dk.py
index 2d9c135..1806e9a 100644
--- a/contrib/debug_tools/epython_scripts/dk.py
+++ b/contrib/debug_tools/epython_scripts/dk.py
@@ -88,7 +88,6 @@ def walk_array(options):
 
     for i in xrange(sys_info.CPUS):
         u = cfstd_array[i]
         walk_pages(tmpfd, options, u.tcd.tcd_pages, trace_page_struct)
-        walk_pages(tmpfd, options, u.tcd.tcd_daemon_pages, trace_page_struct)
         walk_pages(tmpfd, options, u.tcd.tcd_stock_pages, trace_page_struct)
     tmpfd.close()

diff --git a/libcfs/libcfs/tracefile.c b/libcfs/libcfs/tracefile.c
index 5db7394..c117d1a 100644
--- a/libcfs/libcfs/tracefile.c
+++ b/libcfs/libcfs/tracefile.c
@@ -57,6 +57,13 @@ enum cfs_trace_buf_type {
 union cfs_trace_data_union (*cfs_trace_data[CFS_TCD_TYPE_CNT])[NR_CPUS]
 	__cacheline_aligned;
 
+/* Pages containing records already processed by daemon.
+ * Link via ->lru, use size in ->private
+ */
+static LIST_HEAD(daemon_pages);
+static long daemon_pages_count;
+static long daemon_pages_max;
+
 char cfs_tracefile[TRACEFILE_NAME_SIZE];
 long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;
 
@@ -65,9 +72,6 @@ struct task_struct *tctl_task;
 
 static atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
 static DECLARE_RWSEM(cfs_tracefile_sem);
-static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
-					 struct cfs_trace_cpu_data *tcd);
-
 /* trace file lock routines */
 /* The walking argument indicates the locking comes from all tcd types
  * iterator and we must lock it and dissable local irqs to avoid deadlocks
@@ -265,10 +269,10 @@ static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
 		if (pgcount-- == 0)
 			break;
 
-		list_move_tail(&tage->linkage, &pc.pc_pages);
+		list_del(&tage->linkage);
+		cfs_tage_free(tage);
 		tcd->tcd_cur_pages--;
 	}
-	put_pages_on_tcd_daemon_list(&pc, tcd);
 }
 
 /* return a page that has 'len' bytes left at the end */
@@ -643,12 +647,6 @@ panic_collect_pages(struct page_collection *pc)
 
 	cfs_tcd_for_each(tcd, i, j) {
 		list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
 		tcd->tcd_cur_pages = 0;
-
-		if (pc->pc_want_daemon_pages) {
-			list_splice_init(&tcd->tcd_daemon_pages,
-					 &pc->pc_pages);
-			tcd->tcd_cur_daemon_pages = 0;
-		}
 	}
 }
 
@@ -661,11 +659,6 @@ static void collect_pages_on_all_cpus(struct page_collection *pc)
 		cfs_tcd_for_each_type_lock(tcd, i, cpu) {
 			list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
 			tcd->tcd_cur_pages = 0;
-			if (pc->pc_want_daemon_pages) {
-				list_splice_init(&tcd->tcd_daemon_pages,
-						 &pc->pc_pages);
-				tcd->tcd_cur_daemon_pages = 0;
-			}
 		}
 	}
 }
@@ -713,63 +706,17 @@ static void put_pages_back(struct page_collection *pc)
 		put_pages_back_on_all_cpus(pc);
 }
 
-/* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that
- * we have a good amount of data at all times for dumping during an LBUG, even
- * if we have been steadily writing (and otherwise discarding) pages via the
- * debug daemon.
- */
-static void put_pages_on_tcd_daemon_list(struct page_collection *pc,
-					 struct cfs_trace_cpu_data *tcd)
-{
-	struct cfs_trace_page *tage;
-	struct cfs_trace_page *tmp;
-
-	list_for_each_entry_safe(tage, tmp, &pc->pc_pages, linkage) {
-		__LASSERT_TAGE_INVARIANT(tage);
-
-		if (tage->cpu != tcd->tcd_cpu || tage->type != tcd->tcd_type)
-			continue;
-
-		cfs_tage_to_tail(tage, &tcd->tcd_daemon_pages);
-		tcd->tcd_cur_daemon_pages++;
-
-		if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
-			struct cfs_trace_page *victim;
-
-			__LASSERT(!list_empty(&tcd->tcd_daemon_pages));
-			victim = cfs_tage_from_list(tcd->tcd_daemon_pages.next);
-
-			__LASSERT_TAGE_INVARIANT(victim);
-
-			list_del(&victim->linkage);
-			cfs_tage_free(victim);
-			tcd->tcd_cur_daemon_pages--;
-		}
-	}
-}
-
-static void put_pages_on_daemon_list(struct page_collection *pc)
-{
-	struct cfs_trace_cpu_data *tcd;
-	int i, cpu;
-
-	for_each_possible_cpu(cpu) {
-		cfs_tcd_for_each_type_lock(tcd, i, cpu)
-			put_pages_on_tcd_daemon_list(pc, tcd);
-	}
-}
-
 #ifdef LNET_DUMP_ON_PANIC
 void cfs_trace_debug_print(void)
 {
 	struct page_collection pc;
 	struct cfs_trace_page *tage;
 	struct cfs_trace_page *tmp;
+	struct page *page;
 
-	pc.pc_want_daemon_pages = 1;
 	collect_pages(&pc);
 	list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
 		char *p, *file, *fn;
-		struct page *page;
 
 		__LASSERT_TAGE_INVARIANT(tage);
 
@@ -795,16 +742,45 @@ void cfs_trace_debug_print(void)
 		list_del(&tage->linkage);
 		cfs_tage_free(tage);
 	}
+	down_write(&cfs_tracefile_sem);
+	while ((page = list_first_entry_or_null(&daemon_pages,
+						struct page, lru)) != NULL) {
+		char *p, *file, *fn;
+
+		p = page_address(page);
+		while (p < ((char *)page_address(page) + page->private)) {
+			struct ptldebug_header *hdr;
+			int len;
+
+			hdr = (void *)p;
+			p += sizeof(*hdr);
+			file = p;
+			p += strlen(file) + 1;
+			fn = p;
+			p += strlen(fn) + 1;
+			len = hdr->ph_len - (int)(p - (char *)hdr);
+
+			cfs_print_to_console(hdr, D_EMERG, file, fn,
+					     "%.*s", len, p);
+
+			p += len;
+		}
+		list_del_init(&page->lru);
+		daemon_pages_count -= 1;
+		put_page(page);
+	}
+	up_write(&cfs_tracefile_sem);
 }
 #endif /* LNET_DUMP_ON_PANIC */
 
 int cfs_tracefile_dump_all_pages(char *filename)
 {
-	struct page_collection	pc;
-	struct file		*filp;
-	struct cfs_trace_page	*tage;
-	struct cfs_trace_page	*tmp;
-	char			*buf;
+	struct page_collection pc;
+	struct file *filp;
+	struct cfs_trace_page *tage;
+	struct cfs_trace_page *tmp;
+	char *buf;
+	struct page *page;
 	int rc;
 
 	down_write(&cfs_tracefile_sem);
@@ -814,16 +790,15 @@ int cfs_tracefile_dump_all_pages(char *filename)
 		rc = PTR_ERR(filp);
 		filp = NULL;
 		pr_err("LustreError: can't open %s for dump: rc = %d\n",
-			filename, rc);
+		       filename, rc);
 		goto out;
 	}
 
-	pc.pc_want_daemon_pages = 1;
-	collect_pages(&pc);
+	collect_pages(&pc);
 	if (list_empty(&pc.pc_pages)) {
-		rc = 0;
-		goto close;
-	}
+		rc = 0;
+		goto close;
+	}
 
 	/* ok, for now, just write the pages.  in the future we'll be building
	 * iobufs with the pages and calling generic_direct_IO */
@@ -842,9 +817,21 @@ int cfs_tracefile_dump_all_pages(char *filename)
 			break;
 		}
 		list_del(&tage->linkage);
-		cfs_tage_free(tage);
-	}
-
+		cfs_tage_free(tage);
+	}
+	while ((page = list_first_entry_or_null(&daemon_pages,
+						struct page, lru)) != NULL) {
+		buf = page_address(page);
+		rc = cfs_kernel_write(filp, buf, page->private, &filp->f_pos);
+		if (rc != (int)page->private) {
+			pr_warn("Lustre: wanted to write %u but wrote %d\n",
+				(int)page->private, rc);
+			break;
+		}
+		list_del(&page->lru);
+		daemon_pages_count -= 1;
+		put_page(page);
+	}
 	rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1);
 	if (rc)
 		pr_err("LustreError: sync returns: rc = %d\n", rc);
@@ -859,8 +846,8 @@ void cfs_trace_flush_pages(void)
 {
 	struct page_collection pc;
 	struct cfs_trace_page *tage;
+	struct page *page;
 
-	pc.pc_want_daemon_pages = 1;
 	collect_pages(&pc);
 	while (!list_empty(&pc.pc_pages)) {
 		tage = list_first_entry(&pc.pc_pages,
@@ -870,6 +857,15 @@ void cfs_trace_flush_pages(void)
 		list_del(&tage->linkage);
 		cfs_tage_free(tage);
 	}
+
+	down_write(&cfs_tracefile_sem);
+	while ((page = list_first_entry_or_null(&daemon_pages,
+						struct page, lru)) != NULL) {
+		list_del(&page->lru);
+		daemon_pages_count -= 1;
+		put_page(page);
+	}
+	up_write(&cfs_tracefile_sem);
 }
 
 int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
@@ -1000,6 +996,7 @@ int cfs_trace_set_debug_mb(int mb)
 	cfs_tcd_for_each(tcd, i, j)
 		tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
 
+	daemon_pages_max = pages;
 	up_write(&cfs_tracefile_sem);
 
 	return mb;
@@ -1032,9 +1029,9 @@ static int tracefiled(void *arg)
 	int last_loop = 0;
 	int rc;
 
-	pc.pc_want_daemon_pages = 0;
-
 	while (!last_loop) {
+		LIST_HEAD(for_daemon_pages);
+		int for_daemon_pages_count = 0;
 		schedule_timeout_interruptible(cfs_time_seconds(1));
 		if (kthread_should_stop())
 			last_loop = 1;
@@ -1056,37 +1053,55 @@ static int tracefiled(void *arg)
 			}
 		}
 		up_read(&cfs_tracefile_sem);
-		if (filp == NULL) {
-			put_pages_on_daemon_list(&pc);
-			__LASSERT(list_empty(&pc.pc_pages));
-			continue;
-		}
 
 		list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
-			struct dentry *de = file_dentry(filp);
-			static loff_t f_pos;
-
 			__LASSERT_TAGE_INVARIANT(tage);
 
-			if (f_pos >= (off_t)cfs_tracefile_size)
-				f_pos = 0;
-			else if (f_pos > i_size_read(de->d_inode))
-				f_pos = i_size_read(de->d_inode);
-
-			buf = kmap(tage->page);
-			rc = cfs_kernel_write(filp, buf, tage->used, &f_pos);
-			kunmap(tage->page);
-			if (rc != (int)tage->used) {
-				pr_warn("Lustre: wanted to write %u but wrote %d\n",
-					tage->used, rc);
-				put_pages_back(&pc);
-				__LASSERT(list_empty(&pc.pc_pages));
-				break;
+			if (filp) {
+				struct dentry *de = file_dentry(filp);
+				static loff_t f_pos;
+
+				if (f_pos >= (off_t)cfs_tracefile_size)
+					f_pos = 0;
+				else if (f_pos > i_size_read(de->d_inode))
+					f_pos = i_size_read(de->d_inode);
+
+				buf = kmap(tage->page);
+				rc = cfs_kernel_write(filp, buf, tage->used,
+						      &f_pos);
+				kunmap(tage->page);
+				if (rc != (int)tage->used) {
+					pr_warn("Lustre: wanted to write %u but wrote %d\n",
+						tage->used, rc);
+					put_pages_back(&pc);
+					__LASSERT(list_empty(&pc.pc_pages));
+					break;
+				}
 			}
+			list_del_init(&tage->linkage);
+			list_add_tail(&tage->page->lru, &for_daemon_pages);
+			for_daemon_pages_count += 1;
+
+			tage->page->private = (int)tage->used;
+			kfree(tage);
+			atomic_dec(&cfs_tage_allocated);
 		}
 
-		filp_close(filp, NULL);
-		put_pages_on_daemon_list(&pc);
+		if (filp)
+			filp_close(filp, NULL);
+
+		down_write(&cfs_tracefile_sem);
+		list_splice_tail(&for_daemon_pages, &daemon_pages);
+		daemon_pages_count += for_daemon_pages_count;
+		while (daemon_pages_count > daemon_pages_max) {
+			struct page *p = list_first_entry(&daemon_pages,
+							  struct page, lru);
+			list_del(&p->lru);
+			put_page(p);
+			daemon_pages_count -= 1;
+		}
+		up_write(&cfs_tracefile_sem);
+
 		if (!list_empty(&pc.pc_pages)) {
 			int i;
@@ -1178,14 +1193,13 @@ int cfs_tracefile_init(int max_pages)
 
 		INIT_LIST_HEAD(&tcd->tcd_pages);
 		INIT_LIST_HEAD(&tcd->tcd_stock_pages);
-		INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
 		tcd->tcd_cur_pages = 0;
 		tcd->tcd_cur_stock_pages = 0;
-		tcd->tcd_cur_daemon_pages = 0;
 		tcd->tcd_max_pages = (max_pages * factor) / 100;
 		LASSERT(tcd->tcd_max_pages > 0);
 		tcd->tcd_shutting_down = 0;
 	}
+	daemon_pages_max = max_pages;
 
 	return 0;
 
@@ -1242,6 +1256,7 @@ static void cfs_trace_cleanup(void)
 
 void cfs_tracefile_exit(void)
 {
-	cfs_trace_stop_thread();
-	cfs_trace_cleanup();
+	cfs_trace_stop_thread();
+	cfs_trace_flush_pages();
+	cfs_trace_cleanup();
 }
diff --git a/libcfs/libcfs/tracefile.h b/libcfs/libcfs/tracefile.h
index 9ced175..406a8d5 100644
--- a/libcfs/libcfs/tracefile.h
+++ b/libcfs/libcfs/tracefile.h
@@ -87,22 +87,7 @@ union cfs_trace_data_union {
 		unsigned long		tcd_cur_pages;
 
 		/*
-		 * pages with trace records already processed by
-		 * tracefiled. These pages are kept in memory, so that some
-		 * portion of log can be written in the event of LBUG. This
-		 * list is maintained in LRU order.
-		 *
-		 * Pages are moved to ->tcd_daemon_pages by tracefiled()
-		 * (put_pages_on_daemon_list()). LRU pages from this list are
-		 * discarded when list grows too large.
-		 */
-		struct list_head	tcd_daemon_pages;
-		/* number of pages on ->tcd_daemon_pages */
-		unsigned long		tcd_cur_daemon_pages;
-
-		/*
-		 * Maximal number of pages allowed on ->tcd_pages and
-		 * ->tcd_daemon_pages each.
+		 * Maximal number of pages allowed on ->tcd_pages
 		 * Always TCD_MAX_PAGES * tcd_pages_factor / 100 in current
 		 * implementation.
 		 */
@@ -147,12 +132,6 @@ union cfs_trace_data_union {
  * be moved there */
 struct page_collection {
 	struct list_head	pc_pages;
-	/*
-	 * if this flag is set, collect_pages() will spill both
-	 * ->tcd_daemon_pages and ->tcd_pages to the ->pc_pages. Otherwise,
-	 * only ->tcd_pages are spilled.
-	 */
-	int			pc_want_daemon_pages;
 };
 
 /*
-- 
1.8.3.1
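
The page->lru / page->private idiom adopted by this patch can be modeled
in isolation. The sketch below is a minimal, self-contained userspace
illustration of the new daemon_pages scheme, not part of the patch:
"struct page", the list helpers, and daemon_pages_add() are simplified
stand-ins for the kernel structures and for the trimming logic at the
tail of tracefiled().

#include <stdio.h>
#include <stdlib.h>

/* Minimal doubly-linked list, standing in for the kernel's list_head. */
struct list_head { struct list_head *prev, *next; };

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

static void list_del(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

/* Stand-in for struct page: ->lru links the page into daemon_pages and
 * ->private records how many bytes of trace data the page holds - the
 * two fields the patch borrows from the real struct page. */
struct page {
	struct list_head lru;	/* kept first: list nodes are cast back */
	long private;
	char data[4096];
};

static struct list_head daemon_pages;
static long daemon_pages_count;
static long daemon_pages_max = 4;	/* stands in for the configured limit */

/* Mirror of the tail of tracefiled(): append a page that has been
 * written out, then drop the oldest pages while the list exceeds the
 * maximum. */
static void daemon_pages_add(struct page *pg)
{
	list_add_tail(&pg->lru, &daemon_pages);
	daemon_pages_count += 1;
	while (daemon_pages_count > daemon_pages_max) {
		/* head->next is the oldest entry; lru is the first member
		 * of struct page, so the cast is valid. */
		struct page *victim = (struct page *)daemon_pages.next;

		list_del(&victim->lru);
		daemon_pages_count -= 1;
		free(victim);
	}
}

int main(void)
{
	list_init(&daemon_pages);
	for (int i = 0; i < 10; i++) {
		struct page *pg = calloc(1, sizeof(*pg));

		if (!pg)
			break;
		pg->private = snprintf(pg->data, sizeof(pg->data),
				       "trace record %d", i);
		daemon_pages_add(pg);
	}
	printf("retained %ld of 10 pages\n", daemon_pages_count);
	return 0;
}

Because the daemon thread is the only writer and all other accesses take
cfs_tracefile_sem, the kernel version needs no per-CPU machinery: a
single list plus a count is the entire data structure.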