1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Zach Brown <zab@clusterfs.com>
6 * Author: Phil Schwan <phil@clusterfs.com>
8 * This file is part of Lustre, http://www.lustre.org.
10 * Lustre is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Lustre is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Lustre; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include <linux/kernel.h>
25 #include <linux/module.h>
26 #include <linux/init.h>
27 #include <linux/rwsem.h>
28 #include <linux/proc_fs.h>
29 #include <linux/file.h>
30 #include <linux/smp.h>
31 #include <linux/ctype.h>
32 #include <asm/uaccess.h>
34 #include <linux/mm_inline.h>
37 #define DEBUG_SUBSYSTEM S_PORTALS
39 #include <linux/kp30.h>
40 #include <linux/portals_compat25.h>
41 #include <linux/libcfs.h>
/* NOTE(review): this chunk is a numbered listing with missing lines (the
 * embedded line numbers jump); declarations below may be incomplete —
 * confirm against the original tracefile.c before relying on them. */
/* Default per-CPU ring-buffer cap: 5MB expressed in pages. */
43 #define TCD_MAX_PAGES (5 << (20 - PAGE_SHIFT))
45 /* XXX move things up to the top, comment */
/* Per-CPU trace state: a ring of pages being filled by portals_debug_msg()
 * plus a "daemon" list kept around for crash dumps. Padded to avoid
 * cacheline sharing between CPUs. */
48 struct trace_cpu_data {
/* pages currently receiving new trace records */
49 struct list_head tcd_pages;
50 unsigned long tcd_cur_pages;
/* pages retained for dumping during an LBUG (see daemon-list helpers) */
52 struct list_head tcd_daemon_pages;
53 unsigned long tcd_cur_daemon_pages;
/* upper bound on tcd_cur_pages; set by tracefile_init()/trace_write_debug_mb() */
55 unsigned long tcd_max_pages;
/* set by trace_cleanup_on_cpu(); checked in portals_debug_msg() */
56 int tcd_shutting_down;
58 char __pad[SMP_CACHE_BYTES];
59 } trace_data[NR_CPUS] __cacheline_aligned;
/* Cross-CPU page harvest: pages spliced here by collect_pages_on_cpu().
 * NOTE(review): a pc_lock spinlock field is used throughout but its
 * declaration is among the missing lines here. */
61 struct page_collection {
62 struct list_head pc_pages;
64 int pc_want_daemon_pages;
/* Control block for the tracefiled writer thread (closing brace lost
 * in this listing). */
67 struct tracefiled_ctl {
68 struct completion tctl_start;
69 struct completion tctl_stop;
70 wait_queue_head_t tctl_waitq;
72 atomic_t tctl_shutdown;
/* Default max size of the on-disk trace file: 500MB. */
75 #define TRACEFILE_SIZE (500 << 20)
/* Protects tracefile name/size and serializes dump-vs-daemon file access. */
76 static DECLARE_RWSEM(tracefile_sem);
77 static char *tracefile = NULL;
78 static long long tracefile_size = TRACEFILE_SIZE;
79 static struct tracefiled_ctl trace_tctl;
/* Serializes trace_start_thread()/trace_stop_thread(). */
80 static DECLARE_MUTEX(trace_thread_sem);
81 static int thread_running = 0;
/* Fallbacks for kernels without get_cpu()/put_cpu() (no preempt pinning). */
84 #define get_cpu() smp_processor_id()
85 #define put_cpu() do { } while (0)
/* Pin this CPU's trace_cpu_data with local IRQs disabled; paired with
 * trace_put_tcd(). NOTE(review): macro tail is missing from this listing. */
88 #define trace_get_tcd(FLAGS) ({ \
89 struct trace_cpu_data *__ret; \
90 int __cpu = get_cpu(); \
91 local_irq_save(FLAGS); \
92 __ret = &trace_data[__cpu].tcd; \
96 #define trace_put_tcd(TCD, FLAGS) do { \
97 local_irq_restore(FLAGS); \
/* Forward declaration: trace_get_page() recycles overflow pages onto the
 * daemon list. */
101 static void put_pages_on_daemon_list_on_cpu(void *info);
103 /* return a page that has 'len' bytes left at the end */
/* NOTE(review): numbered listing with gaps — returns, braces, and some
 * statements are missing here; comments below describe only what the
 * visible lines show. Runs with IRQs off (caller holds tcd via
 * trace_get_tcd), hence GFP_ATOMIC below. */
104 static struct page *trace_get_page(struct trace_cpu_data *tcd,
107 struct page *page = NULL;
/* a single record can never span pages */
109 if (len > PAGE_SIZE) {
110 printk(KERN_ERR "cowardly refusing to write %lu bytes in a "
/* reuse the current tail page if it still has 'len' bytes free;
 * page->index is (ab)used as the fill offset within the page */
115 if (!list_empty(&tcd->tcd_pages)) {
116 page = list_entry(tcd->tcd_pages.prev, struct page,
118 if (page->index + len <= PAGE_SIZE)
/* grow the ring while under the per-CPU page budget */
122 if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
123 page = alloc_page(GFP_ATOMIC);
125 /* the kernel should print a message for us. fall back
126 * to using the last page in the ring buffer. */
/* stash the owning CPU id in page->mapping so the cross-CPU
 * helpers can filter pages back to their home CPU */
130 page->mapping = (void *)(long)smp_processor_id();
131 list_add_tail(&PAGE_LIST(page), &tcd->tcd_pages);
132 tcd->tcd_cur_pages++;
/* nudge the writer thread once a few pages have accumulated */
134 if (tcd->tcd_cur_pages > 8 && thread_running) {
135 struct tracefiled_ctl *tctl = &trace_tctl;
136 wake_up(&tctl->tctl_waitq);
/* ring is full: with a daemon running, shed ~10% of the oldest
 * pages onto the daemon list to make room */
142 if (thread_running) {
143 int pgcount = tcd->tcd_cur_pages / 10;
144 struct page_collection pc;
145 struct list_head *pos, *tmp;
146 printk(KERN_WARNING "debug daemon buffer overflowed; discarding"
147 " 10%% of pages (%d)\n", pgcount + 1);
149 INIT_LIST_HEAD(&pc.pc_pages);
150 spin_lock_init(&pc.pc_lock);
152 list_for_each_safe(pos, tmp, &tcd->tcd_pages) {
158 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
159 list_del(&PAGE_LIST(page));
160 list_add_tail(&PAGE_LIST(page), &pc.pc_pages);
161 tcd->tcd_cur_pages--;
163 put_pages_on_daemon_list_on_cpu(&pc);
165 LASSERT(!list_empty(&tcd->tcd_pages));
/* otherwise recycle the oldest page: move it to the tail
 * (its content is overwritten — oldest records are lost) */
167 page = list_entry(tcd->tcd_pages.next, struct page, PAGE_LIST_ENTRY);
170 list_del(&PAGE_LIST(page));
171 list_add_tail(&PAGE_LIST(page), &tcd->tcd_pages);
/* Echo a trace record to the kernel log. prefix/ptype are chosen from the
 * debug mask; NOTE(review): lines are missing from this listing (e.g. the
 * ptype assignments for D_EMERG/D_ERROR and the portal_printk branch body),
 * so the printk below may run with ptype/prefix set on paths not visible
 * here — confirm against the original file. */
175 static void print_to_console(struct ptldebug_header *hdr, int mask, char *buf,
176 int len, char *file, const char *fn)
178 char *prefix = NULL, *ptype = NULL;
180 if ((mask & D_EMERG) != 0) {
181 prefix = "LustreError";
183 } else if ((mask & D_ERROR) != 0) {
184 prefix = "LustreError";
186 } else if ((mask & D_WARNING) != 0) {
188 ptype = KERN_WARNING;
189 } else if (portal_printk) {
/* %.*s bounds the message body to 'len' bytes (records are not
 * NUL-terminated in the page buffer) */
194 printk("%s%s: %d:%d:(%s:%d:%s()) %.*s", ptype, prefix, hdr->ph_pid,
195 hdr->ph_extern_pid, file, hdr->ph_line_num, fn, len, buf);
/* Core trace entry point: formats one record (header + file + fn + message)
 * into the current CPU's trace page ring. Record layout on the page is
 * [ptldebug_header][file\0][fn\0][formatted message]. Runs with local IRQs
 * disabled between trace_get_tcd()/trace_put_tcd().
 * NOTE(review): this listing has gaps — locals (flags, tv, page, ap),
 * several returns/braces and a retry path around trace_get_page() are
 * missing; comments describe only the visible lines. */
198 void portals_debug_msg(int subsys, int mask, char *file, const char *fn,
199 const int line, unsigned long stack, char *format, ...)
201 struct trace_cpu_data *tcd;
202 struct ptldebug_header header;
204 char *debug_buf = format;
205 int known_size, needed = 85 /* average message length */, max_nob;
/* warn (once per call site) about formats missing a trailing newline */
210 if (*(format + strlen(format) - 1) != '\n')
211 printk(KERN_INFO "format at %s:%d:%s doesn't end in newline\n",
214 tcd = trace_get_tcd(flags);
215 if (tcd->tcd_shutting_down)
218 do_gettimeofday(&tv);
220 header.ph_subsys = subsys;
221 header.ph_mask = mask;
222 header.ph_cpu_id = smp_processor_id();
223 header.ph_sec = (__u32)tv.tv_sec;
224 header.ph_usec = tv.tv_usec;
225 header.ph_stack = stack;
226 header.ph_pid = current->pid;
227 header.ph_line_num = line;
/* UML kernels keep the host pid in arch-specific fields */
229 #if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
230 header.ph_extern_pid = current->thread.extern_pid;
231 #elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
232 header.ph_extern_pid = current->thread.mode.tt.extern_pid;
234 header.ph_extern_pid = 0;
237 known_size = sizeof(header) + strlen(file) + strlen(fn) + 2; // nulls
/* 'needed' starts as an 85-byte guess; visible below it is refined
 * from strlen(format) when the guess would overflow the page */
240 page = trace_get_page(tcd, needed + known_size);
243 if (needed + known_size > PAGE_SIZE)
245 needed = strlen(format);
/* format the message body first, after the (not yet written) header */
249 debug_buf = page_address(page) + page->index + known_size;
251 max_nob = PAGE_SIZE - page->index - known_size;
252 LASSERT(max_nob > 0);
253 va_start(ap, format);
254 needed = vsnprintf(debug_buf, max_nob, format, ap);
257 if (needed > max_nob) /* overflow. oh poop. */
/* now write header + file + fn in front of the body, advancing
 * page->index (the page fill offset) past each piece */
260 header.ph_len = known_size + needed;
261 debug_buf = page_address(page) + page->index;
263 memcpy(debug_buf, &header, sizeof(header));
264 page->index += sizeof(header);
265 debug_buf += sizeof(header);
267 strcpy(debug_buf, file);
268 page->index += strlen(file) + 1;
269 debug_buf += strlen(file) + 1;
271 strcpy(debug_buf, fn);
272 page->index += strlen(fn) + 1;
273 debug_buf += strlen(fn) + 1;
275 page->index += needed;
276 if (page->index > PAGE_SIZE)
277 printk(KERN_EMERG "page->index == %lu in portals_debug_msg\n",
/* serious messages are echoed to the console immediately */
281 if ((mask & (D_EMERG | D_ERROR | D_WARNING)) || portal_printk)
282 print_to_console(&header, mask, debug_buf, needed, file, fn);
284 trace_put_tcd(tcd, flags);
286 EXPORT_SYMBOL(portals_debug_msg);
/* Splice this CPU's trace pages (and optionally its daemon pages) onto the
 * shared page_collection. Runs on each CPU via smp_call_function() from
 * collect_pages(); pc->pc_lock serializes list access across CPUs.
 * NOTE(review): listing has gaps (flags declaration, closing braces). */
288 static void collect_pages_on_cpu(void *info)
290 struct trace_cpu_data *tcd;
292 struct page_collection *pc = info;
294 tcd = trace_get_tcd(flags);
296 spin_lock(&pc->pc_lock);
/* hand over the active ring and reset it to empty */
297 list_splice(&tcd->tcd_pages, &pc->pc_pages);
298 INIT_LIST_HEAD(&tcd->tcd_pages);
299 tcd->tcd_cur_pages = 0;
300 if (pc->pc_want_daemon_pages) {
301 list_splice(&tcd->tcd_daemon_pages, &pc->pc_pages);
302 INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
303 tcd->tcd_cur_daemon_pages = 0;
305 spin_unlock(&pc->pc_lock);
307 trace_put_tcd(tcd, flags);
/* Gather trace pages from every CPU: run the collector locally, then on all
 * other CPUs (smp_call_function with wait=1 blocks until they finish). */
310 static void collect_pages(struct page_collection *pc)
312 /* needs to be fixed up for preempt */
313 INIT_LIST_HEAD(&pc->pc_pages);
314 collect_pages_on_cpu(pc);
315 smp_call_function(collect_pages_on_cpu, pc, 0, 1);
/* Return collected pages to their home CPU's ring. Each CPU walks the shared
 * collection and reclaims only pages whose page->mapping matches its own CPU
 * id; pages are re-inserted before cur_head to preserve their original order
 * relative to pages still on the ring.
 * NOTE(review): listing has gaps (flags/page declarations, the 'continue'
 * for foreign pages, closing braces). */
318 static void put_pages_back_on_cpu(void *info)
320 struct page_collection *pc = info;
321 struct trace_cpu_data *tcd;
322 struct list_head *pos, *tmp, *cur_head;
325 tcd = trace_get_tcd(flags);
/* remember the current ring head: reclaimed pages go in front of it */
327 cur_head = tcd->tcd_pages.next;
329 spin_lock(&pc->pc_lock);
330 list_for_each_safe(pos, tmp, &pc->pc_pages) {
333 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
334 LASSERT(page->index <= PAGE_SIZE);
335 LASSERT(page_count(page) > 0);
/* skip pages owned by other CPUs (mapping holds owner CPU id) */
337 if ((unsigned long)page->mapping != smp_processor_id())
340 list_del(&PAGE_LIST(page));
341 list_add_tail(&PAGE_LIST(page), cur_head);
342 tcd->tcd_cur_pages++;
344 spin_unlock(&pc->pc_lock);
346 trace_put_tcd(tcd, flags);
/* Return pages to all CPUs: locally, then on every other CPU (waits for
 * completion). Mirrors collect_pages(). */
349 static void put_pages_back(struct page_collection *pc)
351 /* needs to be fixed up for preempt */
352 put_pages_back_on_cpu(pc);
353 smp_call_function(put_pages_back_on_cpu, pc, 0, 1);
356 /* Add pages to a per-cpu debug daemon ringbuffer. This buffer makes sure that
357 * we have a good amount of data at all times for dumping during an LBUG, even
358 * if we have been steadily writing (and otherwise discarding) pages via the
/* NOTE(review): the tail of the comment above and several statements below
 * (flags/page declarations, 'continue', __free_page of evicted pages,
 * closing braces) are among the missing lines of this listing. */
360 static void put_pages_on_daemon_list_on_cpu(void *info)
362 struct page_collection *pc = info;
363 struct trace_cpu_data *tcd;
364 struct list_head *pos, *tmp;
367 tcd = trace_get_tcd(flags);
369 spin_lock(&pc->pc_lock);
370 list_for_each_safe(pos, tmp, &pc->pc_pages) {
373 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
374 LASSERT(page->index <= PAGE_SIZE);
375 LASSERT(page_count(page) > 0);
/* only take pages that belong to this CPU */
376 if ((unsigned long)page->mapping != smp_processor_id())
379 list_del(&PAGE_LIST(page));
380 list_add_tail(&PAGE_LIST(page), &tcd->tcd_daemon_pages);
381 tcd->tcd_cur_daemon_pages++;
/* daemon ring over budget: evict the oldest daemon page */
383 if (tcd->tcd_cur_daemon_pages > tcd->tcd_max_pages) {
384 LASSERT(!list_empty(&tcd->tcd_daemon_pages));
385 page = list_entry(tcd->tcd_daemon_pages.next,
386 struct page, PAGE_LIST_ENTRY);
388 LASSERT(page->index <= PAGE_SIZE);
389 LASSERT(page_count(page) > 0);
392 list_del(&PAGE_LIST(page));
/* clear the CPU-id tag before the page leaves trace ownership */
393 page->mapping = NULL;
395 tcd->tcd_cur_daemon_pages--;
398 spin_unlock(&pc->pc_lock);
400 trace_put_tcd(tcd, flags);
/* Move collected pages onto each CPU's daemon ring: locally, then on every
 * other CPU (waits for completion). */
403 static void put_pages_on_daemon_list(struct page_collection *pc)
405 put_pages_on_daemon_list_on_cpu(pc);
406 smp_call_function(put_pages_on_daemon_list_on_cpu, pc, 0, 1);
/* Dump every buffered trace record to the console (used for LBUG-style
 * emergency dumps — records are printed at D_EMERG regardless of mask).
 * NOTE(review): listing gaps hide the collect_pages() call, the hdr/file/fn
 * parsing lines inside the loop, and the final __free_page; the visible
 * parsing matches the record layout written by portals_debug_msg():
 * [header][file\0][fn\0][message]. */
409 void trace_debug_print(void)
411 struct page_collection pc;
412 struct list_head *pos, *tmp;
414 spin_lock_init(&pc.pc_lock);
417 list_for_each_safe(pos, tmp, &pc.pc_pages) {
421 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
422 LASSERT(page->index <= PAGE_SIZE);
423 LASSERT(page_count(page) > 0);
425 p = page_address(page);
/* walk the records packed into this page */
426 while (p < ((char *)page_address(page) + PAGE_SIZE)) {
427 struct ptldebug_header *hdr;
432 p += strlen(file) + 1;
/* message length = total record length minus header+strings */
435 len = hdr->ph_len - (p - (char *)hdr);
437 print_to_console(hdr, D_EMERG, p, len, file, fn);
440 list_del(&PAGE_LIST(page));
441 page->mapping = NULL;
/* Dump every buffered trace page (including the daemon ring) to a newly
 * created file. O_EXCL: refuses to overwrite an existing file. Holds
 * tracefile_sem for writing to keep the daemon thread away from the file.
 * NOTE(review): listing gaps hide locals (filp, page, rc), error paths,
 * the collect_pages() call, put_pages_back() on the short-write path,
 * __free_page, filp_close and the return — confirm against the original. */
446 int tracefile_dump_all_pages(char *filename)
448 struct page_collection pc;
450 struct list_head *pos, *tmp;
454 down_write(&tracefile_sem);
456 filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
459 printk(KERN_ERR "LustreError: can't open %s for dump: rc %d\n",
464 spin_lock_init(&pc.pc_lock);
/* also pull pages retained on the per-CPU daemon rings */
465 pc.pc_want_daemon_pages = 1;
467 if (list_empty(&pc.pc_pages)) {
472 /* ok, for now, just write the pages. in the future we'll be building
473 * iobufs with the pages and calling generic_direct_IO */
476 list_for_each_safe(pos, tmp, &pc.pc_pages) {
479 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
480 LASSERT(page->index <= PAGE_SIZE);
481 LASSERT(page_count(page) > 0);
/* page->index is the fill offset, i.e. the number of valid bytes */
483 rc = filp->f_op->write(filp, page_address(page), page->index,
485 if (rc != page->index) {
486 printk(KERN_WARNING "wanted to write %lu but wrote "
487 "%d\n", page->index, rc);
491 list_del(&PAGE_LIST(page));
492 page->mapping = NULL;
/* flush file data+metadata to disk before returning */
496 rc = filp->f_op->fsync(filp, filp->f_dentry, 1);
498 printk(KERN_ERR "sync returns %d\n", rc);
502 up_write(&tracefile_sem);
/* Discard all buffered trace pages without writing them anywhere.
 * NOTE(review): listing gaps hide the collect_pages() call, the
 * pc_want_daemon_pages setting, the page local, and __free_page. */
506 void trace_flush_pages(void)
508 struct page_collection pc;
509 struct list_head *pos, *tmp;
511 spin_lock_init(&pc.pc_lock);
514 list_for_each_safe(pos, tmp, &pc.pc_pages) {
517 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
518 LASSERT(page->index <= PAGE_SIZE);
519 LASSERT(page_count(page) > 0);
521 list_del(&PAGE_LIST(page));
522 page->mapping = NULL;
/* /proc write handler: copy a filename from userspace and dump all trace
 * pages to it via tracefile_dump_all_pages(). Requires an absolute path.
 * NOTE(review): listing gaps hide the remaining parameters, locals
 * (name, off, rc), the kmalloc/copy_from_user error bodies, NUL
 * termination, kfree and the return value. */
527 int trace_dk(struct file *file, const char *buffer, unsigned long count,
/* +1 leaves room for the NUL terminator */
534 name = kmalloc(count + 1, GFP_KERNEL);
538 if (copy_from_user(name, buffer, count)) {
/* only absolute paths are accepted */
543 if (name[0] != '/') {
548 /* be nice and strip out trailing '\n' */
549 for (off = count ; off > 2 && isspace(name[off - 1]); off--)
553 rc = tracefile_dump_all_pages(name);
559 EXPORT_SYMBOL(trace_dk);
/* Kernel thread: wakes up once a second (or when woken by trace_get_page),
 * collects buffered pages from all CPUs, appends them to 'tracefile' (if
 * configured) and recycles them onto the per-CPU daemon rings.
 * NOTE(review): listing gaps hide locals (filp, page, rc, f_pos, __wait),
 * the main loop statement, error-path handling, filp_close, and the
 * thread_running bookkeeping — confirm against the original file. */
561 static int tracefiled(void *arg)
563 struct page_collection pc;
564 struct tracefiled_ctl *tctl = arg;
565 struct list_head *pos, *tmp;
566 struct ptldebug_header *hdr;
572 /* we're started late enough that we pick up init's fs context */
573 /* this is so broken in uml? what on earth is going on? */
574 kportal_daemonize("ktracefiled");
577 spin_lock_init(&pc.pc_lock);
/* unblock trace_start_thread() once we are up */
578 complete(&tctl->tctl_start);
/* sleep up to 1s waiting for work or shutdown */
583 init_waitqueue_entry(&__wait, current);
584 add_wait_queue(&tctl->tctl_waitq, &__wait);
585 set_current_state(TASK_INTERRUPTIBLE);
586 schedule_timeout(HZ);
587 remove_wait_queue(&tctl->tctl_waitq, &__wait);
589 if (atomic_read(&tctl->tctl_shutdown))
/* collect only active ring pages; daemon pages stay put */
592 pc.pc_want_daemon_pages = 0;
594 if (list_empty(&pc.pc_pages))
/* read-lock: the tracefile name may be changed concurrently via
 * trace_write_daemon_file() */
598 down_read(&tracefile_sem);
599 if (tracefile != NULL) {
600 filp = filp_open(tracefile, O_CREAT|O_RDWR|O_LARGEFILE,
603 printk("couldn't open %s: %ld\n", tracefile,
608 up_read(&tracefile_sem);
/* no output file: just stash the pages on the daemon rings */
610 put_pages_on_daemon_list(&pc);
617 /* mark the first header, so we can sort in chunks */
618 page = list_entry(pc.pc_pages.next, struct page,
620 LASSERT(page->index <= PAGE_SIZE);
621 LASSERT(page_count(page) > 0);
623 hdr = page_address(page);
624 hdr->ph_flags |= PH_FLAG_FIRST_RECORD;
626 list_for_each_safe(pos, tmp, &pc.pc_pages) {
628 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
629 LASSERT(page->index <= PAGE_SIZE);
630 LASSERT(page_count(page) > 0);
/* wrap the file position at tracefile_size (ring-file
 * behavior); otherwise never write past current EOF */
632 if (f_pos >= tracefile_size)
634 else if (f_pos > filp->f_dentry->d_inode->i_size)
635 f_pos = filp->f_dentry->d_inode->i_size;
637 rc = filp->f_op->write(filp, page_address(page),
638 page->index, &f_pos);
639 if (rc != page->index) {
640 printk(KERN_WARNING "wanted to write %lu but "
641 "wrote %d\n", page->index, rc);
648 put_pages_on_daemon_list(&pc);
/* unblock trace_stop_thread() */
650 complete(&tctl->tctl_stop);
/* Start the tracefiled writer thread (idempotent under trace_thread_sem).
 * NOTE(review): listing gaps hide the rc local, the already-running
 * short-circuit, the error body for kernel_thread(), the thread_running
 * update and the return value. */
654 int trace_start_thread(void)
656 struct tracefiled_ctl *tctl = &trace_tctl;
659 down(&trace_thread_sem);
663 init_completion(&tctl->tctl_start);
664 init_completion(&tctl->tctl_stop);
665 init_waitqueue_head(&tctl->tctl_waitq);
666 atomic_set(&tctl->tctl_shutdown, 0);
668 if (kernel_thread(tracefiled, tctl, 0) < 0) {
/* wait until tracefiled() signals it is up */
673 wait_for_completion(&tctl->tctl_start);
676 up(&trace_thread_sem);
/* Ask tracefiled to exit and wait for it. Safe to call when the thread is
 * not running (guarded by thread_running under trace_thread_sem).
 * NOTE(review): the thread_running reset and closing braces are among the
 * missing lines. */
680 void trace_stop_thread(void)
682 struct tracefiled_ctl *tctl = &trace_tctl;
684 down(&trace_thread_sem);
685 if (thread_running) {
686 printk(KERN_INFO "Shutting down debug daemon thread...\n");
687 atomic_set(&tctl->tctl_shutdown, 1);
688 wait_for_completion(&tctl->tctl_stop);
691 up(&trace_thread_sem);
/* /proc write handler controlling the debug daemon. Accepts:
 *   "stop"      — stop the daemon (body lost in this listing),
 *   "size=<MB>" — set the max trace file size (clamped to 10..20480 MB),
 *   "/abs/path" — set the output file and start the daemon.
 * NOTE(review): listing gaps hide locals (name, off, rc), error bodies,
 * the assignment of 'name' to 'tracefile', kfree and the return value. */
694 int trace_write_daemon_file(struct file *file, const char *buffer,
695 unsigned long count, void *data)
701 name = kmalloc(count + 1, GFP_KERNEL);
705 if (copy_from_user(name, buffer, count)) {
710 /* be nice and strip out trailing '\n' */
711 for (off = count ; off > 2 && isspace(name[off - 1]); off--)
716 down_write(&tracefile_sem);
717 if (strcmp(name, "stop") == 0) {
721 } else if (strncmp(name, "size=", 5) == 0) {
722 tracefile_size = simple_strtoul(name + 5, NULL, 0);
/* out-of-range values fall back to the 500MB default */
723 if (tracefile_size < 10 || tracefile_size > 20480)
724 tracefile_size = TRACEFILE_SIZE;
/* value was given in MB; convert to bytes */
726 tracefile_size <<= 20;
730 if (name[0] != '/') {
735 if (tracefile != NULL)
741 printk(KERN_INFO "Lustre: debug daemon will attempt to start writing "
742 "to %s (%lukB max)\n", tracefile, (long)(tracefile_size >> 10));
744 trace_start_thread();
747 up_write(&tracefile_sem);
/* /proc read handler: report the current tracefile path.
 * NOTE(review): missing lines include the rc local, a NULL check on
 * 'tracefile' (snprintf "%s" on NULL would be unsafe) and the return —
 * confirm against the original file. */
754 int trace_read_daemon_file(char *page, char **start, off_t off, int count,
755 int *eof, void *data)
759 down_read(&tracefile_sem);
760 rc = snprintf(page, count, "%s", tracefile);
761 up_read(&tracefile_sem);
/* /proc write handler: set the per-CPU trace buffer budget, given in MB
 * for the whole machine; divided across CPUs (division lost in listing
 * gaps). Rejects values above ~20% of RAM or >= 512MB.
 * NOTE(review): locals (string, max, i, rc), error returns, NUL
 * termination and the final return are among the missing lines. */
766 int trace_write_debug_mb(struct file *file, const char *buffer,
767 unsigned long count, void *data)
773 if (count >= sizeof(string)) {
774 printk(KERN_ERR "Lustre: value too large (length %lu bytes)\n",
779 if (copy_from_user(string, buffer, count))
782 max = simple_strtoul(string, NULL, 0);
/* (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 == 20% of RAM in MB */
786 if (max > (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5 || max >= 512) {
787 printk(KERN_ERR "Lustre: Refusing to set debug buffer size to "
788 "%dMB, which is more than 80%% of available RAM (%lu)\n",
789 max, (num_physpages >> (20 - 2 - PAGE_SHIFT)) / 5);
/* apply the new cap to every CPU's ring */
795 for (i = 0; i < NR_CPUS; i++) {
796 struct trace_cpu_data *tcd;
797 tcd = &trace_data[i].tcd;
798 tcd->tcd_max_pages = max << (20 - PAGE_SHIFT);
/* /proc read handler: report the total debug buffer size in MB
 * (per-CPU max pages converted to MB, times the number of CPUs).
 * NOTE(review): flags/rc locals and the return are among the missing
 * lines of this listing. */
803 int trace_read_debug_mb(char *page, char **start, off_t off, int count,
804 int *eof, void *data)
806 struct trace_cpu_data *tcd;
810 tcd = trace_get_tcd(flags);
811 rc = snprintf(page, count, "%lu\n",
812 (tcd->tcd_max_pages >> (20 - PAGE_SHIFT)) * smp_num_cpus);
813 trace_put_tcd(tcd, flags);
/* Module init: reset every CPU's trace state to empty with the default
 * page budget. NOTE(review): the loop variable, closing braces and the
 * return value are among the missing lines. */
818 int tracefile_init(void)
820 struct trace_cpu_data *tcd;
823 for (i = 0; i < NR_CPUS; i++) {
824 tcd = &trace_data[i].tcd;
825 INIT_LIST_HEAD(&tcd->tcd_pages);
826 INIT_LIST_HEAD(&tcd->tcd_daemon_pages);
827 tcd->tcd_cur_pages = 0;
828 tcd->tcd_cur_daemon_pages = 0;
829 tcd->tcd_max_pages = TCD_MAX_PAGES;
830 tcd->tcd_shutting_down = 0;
/* Per-CPU shutdown: mark the CPU as shutting down (stops new records in
 * portals_debug_msg) and free its active ring pages.
 * NOTE(review): flags/page locals, __free_page, and closing braces are
 * among the missing lines; daemon-ring cleanup is not visible here. */
835 static void trace_cleanup_on_cpu(void *info)
837 struct trace_cpu_data *tcd;
838 struct list_head *pos, *tmp;
841 tcd = trace_get_tcd(flags);
/* checked in portals_debug_msg() before writing any record */
843 tcd->tcd_shutting_down = 1;
845 list_for_each_safe(pos, tmp, &tcd->tcd_pages) {
848 page = list_entry(pos, struct page, PAGE_LIST_ENTRY);
849 LASSERT(page->index <= PAGE_SIZE);
850 LASSERT(page_count(page) > 0);
852 list_del(&PAGE_LIST(page));
853 page->mapping = NULL;
856 tcd->tcd_cur_pages = 0;
858 trace_put_tcd(tcd, flags);
/* Tear down trace state on every CPU: locally, then via IPI on all others
 * (waits for completion). The page_collection argument matches the
 * smp_call_function callback signature. */
861 static void trace_cleanup(void)
863 struct page_collection pc;
865 INIT_LIST_HEAD(&pc.pc_pages);
866 spin_lock_init(&pc.pc_lock);
868 trace_cleanup_on_cpu(&pc);
869 smp_call_function(trace_cleanup_on_cpu, &pc, 0, 1);
872 void tracefile_exit(void)