Whamcloud - gitweb
LU-14428 libcfs: separate daemon_list from cfs_trace_data
[fs/lustre-release.git] / libcfs / libcfs / tracefile.c
1 /*
2  * GPL HEADER START
3  *
4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 only,
8  * as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful, but
11  * WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * General Public License version 2 for more details (a copy is included
14  * in the LICENSE file that accompanied this code).
15  *
16  * You should have received a copy of the GNU General Public License
17  * version 2 along with this program; If not, see
18  * http://www.gnu.org/licenses/gpl-2.0.html
19  *
20  * GPL HEADER END
21  */
22 /*
23  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
24  * Use is subject to license terms.
25  *
26  * Copyright (c) 2012, 2017, Intel Corporation.
27  */
28 /*
29  * This file is part of Lustre, http://www.lustre.org/
30  *
31  * libcfs/libcfs/tracefile.c
32  *
33  * Author: Zach Brown <zab@clusterfs.com>
34  * Author: Phil Schwan <phil@clusterfs.com>
35  */
36
37 #define DEBUG_SUBSYSTEM S_LNET
38 #include "tracefile.h"
39
40 #include <linux/ctype.h>
41 #include <linux/fs.h>
42 #include <linux/kthread.h>
43 #include <linux/pagemap.h>
44 #include <linux/poll.h>
45 #include <linux/tty.h>
46 #include <linux/uaccess.h>
47 #include <libcfs/linux/linux-fs.h>
48 #include <libcfs/libcfs.h>
49
50
/* Which per-CPU trace buffer a message goes to, chosen by the interrupt
 * context it was generated in (see cfs_trace_buf_idx_get()).
 */
enum cfs_trace_buf_type {
        CFS_TCD_TYPE_PROC = 0,
        CFS_TCD_TYPE_SOFTIRQ,
        CFS_TCD_TYPE_IRQ,
        CFS_TCD_TYPE_CNT
};

/* Per-type arrays of per-CPU trace data, allocated at setup. */
union cfs_trace_data_union (*cfs_trace_data[CFS_TCD_TYPE_CNT])[NR_CPUS] __cacheline_aligned;

/* Pages containing records already processed by daemon.
 * Link via ->lru, use size in ->private
 */
static LIST_HEAD(daemon_pages);
static long daemon_pages_count;
static long daemon_pages_max;

/* Name and size limit of the trace dump file. */
char cfs_tracefile[TRACEFILE_NAME_SIZE];
long long cfs_tracefile_size = CFS_TRACEFILE_SIZE;

/* The tracefile daemon thread; NULL when it is not running. */
struct task_struct *tctl_task;

/* Count of live cfs_trace_page descriptors (debug accounting). */
static atomic_t cfs_tage_allocated = ATOMIC_INIT(0);
/* Protects cfs_tracefile/daemon_pages state across dump/flush paths. */
static DECLARE_RWSEM(cfs_tracefile_sem);
74
75 /* trace file lock routines */
76 /* The walking argument indicates the locking comes from all tcd types
77  * iterator and we must lock it and dissable local irqs to avoid deadlocks
78  * with other interrupt locks that might be happening. See LU-1311
79  * for details.
80  */
81 int cfs_trace_lock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
82         __acquires(&tcd->tcd_lock)
83 {
84         __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_CNT);
85         if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
86                 spin_lock_irqsave(&tcd->tcd_lock, tcd->tcd_lock_flags);
87         else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
88                 spin_lock_bh(&tcd->tcd_lock);
89         else if (unlikely(walking))
90                 spin_lock_irq(&tcd->tcd_lock);
91         else
92                 spin_lock(&tcd->tcd_lock);
93         return 1;
94 }
95
96 void cfs_trace_unlock_tcd(struct cfs_trace_cpu_data *tcd, int walking)
97         __releases(&tcd->tcd_lock)
98 {
99         __LASSERT(tcd->tcd_type < CFS_TCD_TYPE_CNT);
100         if (tcd->tcd_type == CFS_TCD_TYPE_IRQ)
101                 spin_unlock_irqrestore(&tcd->tcd_lock, tcd->tcd_lock_flags);
102         else if (tcd->tcd_type == CFS_TCD_TYPE_SOFTIRQ)
103                 spin_unlock_bh(&tcd->tcd_lock);
104         else if (unlikely(walking))
105                 spin_unlock_irq(&tcd->tcd_lock);
106         else
107                 spin_unlock(&tcd->tcd_lock);
108 }
109
/* Iterate over every tcd of every configured type and every possible CPU.
 * Takes NO locks; only safe when concurrent updates are excluded (e.g.
 * panic_collect_pages() assumes all other CPUs are stopped).
 */
#define cfs_tcd_for_each(tcd, i, j)                                     \
        for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i]; i++)     \
                for (j = 0, ((tcd) = &(*cfs_trace_data[i])[j].tcd);     \
                     j < num_possible_cpus();                           \
                     j++, (tcd) = &(*cfs_trace_data[i])[j].tcd)

/* Iterate over each tcd type for a single CPU, holding the tcd lock in
 * "walking" mode (local irqs disabled) across each iteration; the lock
 * is dropped in the loop-increment expression.
 */
#define cfs_tcd_for_each_type_lock(tcd, i, cpu)                         \
        for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i] &&        \
             (tcd = &(*cfs_trace_data[i])[cpu].tcd) &&                  \
             cfs_trace_lock_tcd(tcd, 1); cfs_trace_unlock_tcd(tcd, 1), i++)
120
121 enum cfs_trace_buf_type cfs_trace_buf_idx_get(void)
122 {
123         if (in_irq())
124                 return CFS_TCD_TYPE_IRQ;
125         if (in_softirq())
126                 return CFS_TCD_TYPE_SOFTIRQ;
127         return CFS_TCD_TYPE_PROC;
128 }
129
/* Lock and return the trace buffer for the current CPU and interrupt
 * context.  get_cpu() disables preemption until the matching put_cpu()
 * in cfs_trace_put_tcd(), so callers must pair the two.
 */
static inline struct cfs_trace_cpu_data *
cfs_trace_get_tcd(void)
{
        struct cfs_trace_cpu_data *tcd =
                &(*cfs_trace_data[cfs_trace_buf_idx_get()])[get_cpu()].tcd;

        cfs_trace_lock_tcd(tcd, 0);

        return tcd;
}
140
/* Unlock a tcd obtained from cfs_trace_get_tcd(); put_cpu() re-enables
 * the preemption disabled by the get_cpu() taken there.
 */
static inline void cfs_trace_put_tcd(struct cfs_trace_cpu_data *tcd)
{
        cfs_trace_unlock_tcd(tcd, 0);

        put_cpu();
}
147
148 static inline struct cfs_trace_page *
149 cfs_tage_from_list(struct list_head *list)
150 {
151         return list_entry(list, struct cfs_trace_page, linkage);
152 }
153
154 static struct cfs_trace_page *cfs_tage_alloc(gfp_t gfp)
155 {
156         struct page            *page;
157         struct cfs_trace_page *tage;
158
159         /* My caller is trying to free memory */
160         if (!in_interrupt() && (current->flags & PF_MEMALLOC))
161                 return NULL;
162
163         /*
164          * Don't spam console with allocation failures: they will be reported
165          * by upper layer anyway.
166          */
167         gfp |= __GFP_NOWARN;
168         page = alloc_page(gfp);
169         if (page == NULL)
170                 return NULL;
171
172         tage = kmalloc(sizeof(*tage), gfp);
173         if (tage == NULL) {
174                 __free_page(page);
175                 return NULL;
176         }
177
178         tage->page = page;
179         atomic_inc(&cfs_tage_allocated);
180         return tage;
181 }
182
183 static void cfs_tage_free(struct cfs_trace_page *tage)
184 {
185         __LASSERT(tage != NULL);
186         __LASSERT(tage->page != NULL);
187
188         __free_page(tage->page);
189         kfree(tage);
190         atomic_dec(&cfs_tage_allocated);
191 }
192
193 static void cfs_tage_to_tail(struct cfs_trace_page *tage,
194                              struct list_head *queue)
195 {
196         __LASSERT(tage != NULL);
197         __LASSERT(queue != NULL);
198
199         list_move_tail(&tage->linkage, queue);
200 }
201
/* Return a page with at least 'len' bytes free at the end, or NULL.
 * Tries the current tail page, then the pre-allocated stock list, then
 * a fresh GFP_ATOMIC allocation (this can run in interrupt context).
 * Caller holds the tcd lock.
 */
static struct cfs_trace_page *
cfs_trace_get_tage_try(struct cfs_trace_cpu_data *tcd, unsigned long len)
{
        struct cfs_trace_page *tage;
        struct task_struct *tsk;

        if (tcd->tcd_cur_pages > 0) {
                __LASSERT(!list_empty(&tcd->tcd_pages));
                tage = cfs_tage_from_list(tcd->tcd_pages.prev);
                if (tage->used + len <= PAGE_SIZE)
                        return tage;
        }

        if (tcd->tcd_cur_pages < tcd->tcd_max_pages) {
                if (tcd->tcd_cur_stock_pages > 0) {
                        /* prefer a pre-allocated stock page over kmalloc */
                        tage = cfs_tage_from_list(tcd->tcd_stock_pages.prev);
                        --tcd->tcd_cur_stock_pages;
                        list_del_init(&tage->linkage);
                } else {
                        tage = cfs_tage_alloc(GFP_ATOMIC);
                        if (unlikely(tage == NULL)) {
                                /* stay quiet when the caller is reclaiming
                                 * memory outside interrupt context
                                 */
                                if ((!(current->flags & PF_MEMALLOC) ||
                                     in_interrupt()) && printk_ratelimit())
                                        pr_warn("Lustre: cannot allocate a tage (%ld)\n",
                                                tcd->tcd_cur_pages);
                                return NULL;
                        }
                }

                tage->used = 0;
                tage->cpu = smp_processor_id();
                tage->type = tcd->tcd_type;
                list_add_tail(&tage->linkage, &tcd->tcd_pages);
                tcd->tcd_cur_pages++;

                tsk = tctl_task;
                if (tcd->tcd_cur_pages > 8 && tsk)
                        /*
                         * wake up tracefiled to process some pages.
                         */
                        wake_up_process(tsk);

                return tage;
        }
        return NULL;
}
249
250 static void cfs_tcd_shrink(struct cfs_trace_cpu_data *tcd)
251 {
252         int pgcount = tcd->tcd_cur_pages / 10;
253         struct page_collection pc;
254         struct cfs_trace_page *tage;
255         struct cfs_trace_page *tmp;
256
257         /*
258          * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
259          * from here: this will lead to infinite recursion.
260          */
261
262         if (printk_ratelimit())
263                 pr_warn("Lustre: debug daemon buffer overflowed; discarding 10%% of pages (%d of %ld)\n",
264                         pgcount + 1, tcd->tcd_cur_pages);
265
266         INIT_LIST_HEAD(&pc.pc_pages);
267
268         list_for_each_entry_safe(tage, tmp, &tcd->tcd_pages, linkage) {
269                 if (pgcount-- == 0)
270                         break;
271
272                 list_del(&tage->linkage);
273                 cfs_tage_free(tage);
274                 tcd->tcd_cur_pages--;
275         }
276 }
277
/* Return a page that has 'len' bytes left at the end, or NULL.
 * On allocation failure, shrink the buffer (only when the trace daemon
 * thread exists) and then recycle the oldest page in place.
 */
static struct cfs_trace_page *cfs_trace_get_tage(struct cfs_trace_cpu_data *tcd,
                                                 unsigned long len)
{
        struct cfs_trace_page *tage;

        /*
         * XXX nikita: do NOT call portals_debug_msg() (CDEBUG/ENTRY/EXIT)
         * from here: this will lead to infinite recursion.
         */

        if (len > PAGE_SIZE) {
                pr_err("LustreError: cowardly refusing to write %lu bytes in a page\n",
                       len);
                return NULL;
        }

        tage = cfs_trace_get_tage_try(tcd, len);
        if (tage != NULL)
                return tage;
        if (tctl_task)
                cfs_tcd_shrink(tcd);
        if (tcd->tcd_cur_pages > 0) {
                /* overwrite the oldest page (head of the list) in place */
                tage = cfs_tage_from_list(tcd->tcd_pages.next);
                tage->used = 0;
                cfs_tage_to_tail(tage, &tcd->tcd_pages);
        }
        /* NB: still NULL here if no pages exist at all */
        return tage;
}
307
/* Fill @header for one debug record: subsystem/mask from @msgdata,
 * current CPU and buffer type, wall-clock timestamp, the caller's
 * stack usage (@stack) and pid.
 */
static void cfs_set_ptldebug_header(struct ptldebug_header *header,
                                    struct libcfs_debug_msg_data *msgdata,
                                    unsigned long stack)
{
        struct timespec64 ts;

        ktime_get_real_ts64(&ts);

        header->ph_subsys = msgdata->msg_subsys;
        header->ph_mask = msgdata->msg_mask;
        header->ph_cpu_id = smp_processor_id();
        header->ph_type = cfs_trace_buf_idx_get();
        /* y2038 safe since all user space treats this as unsigned, but
         * will overflow in 2106
         */
        header->ph_sec = (u32)ts.tv_sec;
        header->ph_usec = ts.tv_nsec / NSEC_PER_USEC;
        header->ph_stack = stack;
        header->ph_pid = current->pid;
        header->ph_line_num = msgdata->msg_line;
        header->ph_extern_pid = 0;
}
330
/**
 * tty_write_msg - write a message to a certain tty, not just the console.
 * @tty: the destination tty_struct
 * @msg: the message to write
 *
 * tty_write_message is not exported, so write a same function for it
 *
 * Holds the tty's atomic_write_lock and the tty lock around the ->write()
 * call, then wakes anyone polling for output space.
 */
static void tty_write_msg(struct tty_struct *tty, const char *msg)
{
        mutex_lock(&tty->atomic_write_lock);
        tty_lock(tty);
        /* only write while the tty is still open (count > 0) */
        if (tty->ops->write && tty->count > 0)
                tty->ops->write(tty, msg, strlen(msg));
        tty_unlock(tty);
        mutex_unlock(&tty->atomic_write_lock);
        wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
}
349
350 static void cfs_tty_write_message(const char *prefix, int mask, const char *msg)
351 {
352         struct tty_struct *tty;
353
354         tty = get_current_tty();
355         if (!tty)
356                 return;
357
358         tty_write_msg(tty, prefix);
359         if ((mask & D_EMERG) || (mask & D_ERROR))
360                 tty_write_msg(tty, "Error");
361         tty_write_msg(tty, ": ");
362         tty_write_msg(tty, msg);
363         tty_kref_put(tty);
364 }
365
/* Print one debug record to the console at a printk level chosen from
 * @mask.  D_CONSOLE messages omit the pid/file/line prefix; D_TTY
 * additionally echoes the raw format string (unformatted) to the
 * current tty.
 */
static void cfs_vprint_to_console(struct ptldebug_header *hdr, int mask,
                                  struct va_format *vaf, const char *file,
                                  const char *fn)
{
        char *prefix = "Lustre";

        /* LNet-layer subsystems get the "LNet" prefix instead */
        if (hdr->ph_subsys == S_LND || hdr->ph_subsys == S_LNET)
                prefix = "LNet";

        if (mask & D_CONSOLE) {
                if (mask & D_EMERG)
                        pr_emerg("%sError: %pV", prefix, vaf);
                else if (mask & D_ERROR)
                        pr_err("%sError: %pV", prefix, vaf);
                else if (mask & D_WARNING)
                        pr_warn("%s: %pV", prefix, vaf);
                else if (mask & libcfs_printk)
                        pr_info("%s: %pV", prefix, vaf);
        } else {
                if (mask & D_EMERG)
                        pr_emerg("%sError: %d:%d:(%s:%d:%s()) %pV", prefix,
                                 hdr->ph_pid, hdr->ph_extern_pid, file,
                                 hdr->ph_line_num, fn, vaf);
                else if (mask & D_ERROR)
                        pr_err("%sError: %d:%d:(%s:%d:%s()) %pV", prefix,
                               hdr->ph_pid, hdr->ph_extern_pid, file,
                               hdr->ph_line_num, fn, vaf);
                else if (mask & D_WARNING)
                        pr_warn("%s: %d:%d:(%s:%d:%s()) %pV", prefix,
                                hdr->ph_pid, hdr->ph_extern_pid, file,
                                hdr->ph_line_num, fn, vaf);
                else if (mask & (D_CONSOLE | libcfs_printk))
                        pr_info("%s: %pV", prefix, vaf);
        }

        if (mask & D_TTY)
                /* tty_write_msg doesn't handle formatting */
                cfs_tty_write_message(prefix, mask, vaf->fmt);
}
405
406 static void cfs_print_to_console(struct ptldebug_header *hdr, int mask,
407                                  const char *file, const char *fn,
408                                  const char *fmt, ...)
409 {
410         struct va_format vaf;
411         va_list args;
412
413         va_start(args, fmt);
414         vaf.fmt = fmt;
415         vaf.va = &args;
416         cfs_vprint_to_console(hdr, mask, &vaf, file, fn);
417 }
418
/* Record a debug message into the per-CPU trace buffer and, depending
 * on the mask bits and the rate-limit state in @msgdata->msg_cdls, echo
 * it to the console.  Returns 1 when console output was suppressed
 * (not requested or rate-limited), 0 otherwise.
 */
int libcfs_debug_msg(struct libcfs_debug_msg_data *msgdata,
                     const char *format, ...)
{
        struct cfs_trace_cpu_data *tcd = NULL;
        struct ptldebug_header header = {0};
        struct cfs_trace_page *tage;
        /* string_buf is used only if tcd != NULL, and is always set then */
        char *string_buf = NULL;
        char *debug_buf;
        int known_size;
        int needed = 85; /* seeded with average message length */
        int max_nob;
        va_list ap;
        int retry;
        int mask = msgdata->msg_mask;
        char *file = (char *)msgdata->msg_file;
        struct cfs_debug_limit_state *cdls = msgdata->msg_cdls;

        /* keep only the basename of the source file */
        if (strchr(file, '/'))
                file = strrchr(file, '/') + 1;

        tcd = cfs_trace_get_tcd();

        /* cfs_trace_get_tcd() grabs a lock, which disables preemption and
         * pins us to a particular CPU.  This avoids an smp_processor_id()
         * warning on Linux when debugging is enabled.
         */
        cfs_set_ptldebug_header(&header, msgdata, CDEBUG_STACK());

        if (!tcd)                /* arch may not log in IRQ context */
                goto console;

        if (tcd->tcd_cur_pages == 0)
                header.ph_flags |= PH_FLAG_FIRST_RECORD;

        if (tcd->tcd_shutting_down) {
                cfs_trace_put_tcd(tcd);
                tcd = NULL;
                goto console;
        }

        /* size of the fixed record prefix: file name, optional function
         * name and optional binary header
         */
        known_size = strlen(file) + 1;
        if (msgdata->msg_fn)
                known_size += strlen(msgdata->msg_fn) + 1;

        if (libcfs_debug_binary)
                known_size += sizeof(header);

        /*
         * May perform an additional pass to update 'needed' and increase
         * tage buffer size to match vsnprintf reported size required
         * On the second pass (retry=1) use vscnprintf [which returns
         * number of bytes written not including the terminating nul]
         * to clarify `needed` is used as number of bytes written
         * for the remainder of this function
         */
        for (retry = 0; retry < 2; retry++) {
                tage = cfs_trace_get_tage(tcd, needed + known_size + 1);
                if (!tage) {
                        if (needed + known_size > PAGE_SIZE)
                                mask |= D_ERROR;

                        cfs_trace_put_tcd(tcd);
                        tcd = NULL;
                        goto console;
                }

                string_buf = (char *)page_address(tage->page) +
                             tage->used + known_size;

                max_nob = PAGE_SIZE - tage->used - known_size;
                if (max_nob <= 0) {
                        pr_emerg("LustreError: negative max_nob: %d\n",
                                 max_nob);
                        mask |= D_ERROR;
                        cfs_trace_put_tcd(tcd);
                        tcd = NULL;
                        goto console;
                }

                va_start(ap, format);
                if (retry)
                        needed = vscnprintf(string_buf, max_nob, format, ap);
                else
                        needed = vsnprintf(string_buf, max_nob, format, ap);
                va_end(ap);

                if (needed < max_nob) /* well. printing ok.. */
                        break;
        }

        /* `needed` is actual bytes written to string_buf */
        if (*(string_buf + needed - 1) != '\n') {
                pr_info("Lustre: format at %s:%d:%s doesn't end in newline\n",
                        file, msgdata->msg_line, msgdata->msg_fn);
        } else if (mask & D_TTY) {
                /* TTY needs '\r\n' to move carriage to leftmost position */
                if (needed < 2 || *(string_buf + needed - 2) != '\r')
                        pr_info("Lustre: format at %s:%d:%s doesn't end in '\\r\\n'\n",
                                file, msgdata->msg_line, msgdata->msg_fn);
                if (strnchr(string_buf, needed, '%'))
                        pr_info("Lustre: format at %s:%d:%s mustn't contain %%\n",
                                file, msgdata->msg_line, msgdata->msg_fn);
        }

        header.ph_len = known_size + needed;
        debug_buf = (char *)page_address(tage->page) + tage->used;

        /* lay the record out in the page: [header][file][fn][message] */
        if (libcfs_debug_binary) {
                memcpy(debug_buf, &header, sizeof(header));
                tage->used += sizeof(header);
                debug_buf += sizeof(header);
        }

        strlcpy(debug_buf, file, PAGE_SIZE - tage->used);
        tage->used += strlen(file) + 1;
        debug_buf += strlen(file) + 1;

        if (msgdata->msg_fn) {
                strlcpy(debug_buf, msgdata->msg_fn, PAGE_SIZE - tage->used);
                tage->used += strlen(msgdata->msg_fn) + 1;
                debug_buf += strlen(msgdata->msg_fn) + 1;
        }

        __LASSERT(debug_buf == string_buf);

        tage->used += needed;
        __LASSERT(tage->used <= PAGE_SIZE);

console:
        if ((mask & libcfs_printk) == 0) {
                /* no console output requested */
                if (tcd != NULL)
                        cfs_trace_put_tcd(tcd);
                return 1;
        }

        if (cdls != NULL) {
                /* per-call-site console rate limiting */
                if (libcfs_console_ratelimit &&
                    cdls->cdls_next != 0 &&     /* not first time ever */
                    time_before(jiffies, cdls->cdls_next)) {
                        /* skipping a console message */
                        cdls->cdls_count++;
                        if (tcd != NULL)
                                cfs_trace_put_tcd(tcd);
                        return 1;
                }

                if (time_after(jiffies, cdls->cdls_next +
                                        libcfs_console_max_delay +
                                        cfs_time_seconds(10))) {
                        /* last timeout was a long time ago */
                        cdls->cdls_delay /= libcfs_console_backoff * 4;
                } else {
                        cdls->cdls_delay *= libcfs_console_backoff;
                }

                if (cdls->cdls_delay < libcfs_console_min_delay)
                        cdls->cdls_delay = libcfs_console_min_delay;
                else if (cdls->cdls_delay > libcfs_console_max_delay)
                        cdls->cdls_delay = libcfs_console_max_delay;

                /* ensure cdls_next is never zero after it's been seen */
                cdls->cdls_next = (jiffies + cdls->cdls_delay) | 1;
        }

        if (tcd) {
                /* message already formatted into the trace page */
                cfs_print_to_console(&header, mask, file, msgdata->msg_fn,
                                     "%s", string_buf);
                cfs_trace_put_tcd(tcd);
        } else {
                /* no trace buffer: format directly from the varargs */
                struct va_format vaf;

                va_start(ap, format);
                vaf.fmt = format;
                vaf.va = &ap;
                cfs_vprint_to_console(&header, mask,
                                      &vaf, file, msgdata->msg_fn);
                va_end(ap);
        }

        if (cdls != NULL && cdls->cdls_count != 0) {
                /* Do not allow print this to TTY */
                cfs_print_to_console(&header, mask & ~D_TTY, file,
                                     msgdata->msg_fn,
                                     "Skipped %d previous similar message%s\n",
                                     cdls->cdls_count,
                                     (cdls->cdls_count > 1) ? "s" : "");

                cdls->cdls_count = 0;
        }

        return 0;
}
EXPORT_SYMBOL(libcfs_debug_msg);
614
/* Called on assertion failure: mark the catastrophe, print the failed
 * assertion text to the console at D_EMERG, then panic.  Never returns.
 */
void
cfs_trace_assertion_failed(const char *str,
                           struct libcfs_debug_msg_data *msgdata)
{
        struct ptldebug_header hdr;

        libcfs_panic_in_progress = 1;
        libcfs_catastrophe = 1;
        /* make the flags visible to other CPUs before we print/panic */
        smp_mb();

        cfs_set_ptldebug_header(&hdr, msgdata, CDEBUG_STACK());

        cfs_print_to_console(&hdr, D_EMERG, msgdata->msg_file, msgdata->msg_fn,
                             "%s", str);

        panic("Lustre debug assertion failure\n");

        /* not reached */
}
634
635 static void
636 panic_collect_pages(struct page_collection *pc)
637 {
638         /* Do the collect_pages job on a single CPU: assumes that all other
639          * CPUs have been stopped during a panic.  If this isn't true for some
640          * arch, this will have to be implemented separately in each arch.  */
641         int                        i;
642         int                        j;
643         struct cfs_trace_cpu_data *tcd;
644
645         INIT_LIST_HEAD(&pc->pc_pages);
646
647         cfs_tcd_for_each(tcd, i, j) {
648                 list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
649                 tcd->tcd_cur_pages = 0;
650         }
651 }
652
653 static void collect_pages_on_all_cpus(struct page_collection *pc)
654 {
655         struct cfs_trace_cpu_data *tcd;
656         int i, cpu;
657
658         for_each_possible_cpu(cpu) {
659                 cfs_tcd_for_each_type_lock(tcd, i, cpu) {
660                         list_splice_init(&tcd->tcd_pages, &pc->pc_pages);
661                         tcd->tcd_cur_pages = 0;
662                 }
663         }
664 }
665
666 static void collect_pages(struct page_collection *pc)
667 {
668         INIT_LIST_HEAD(&pc->pc_pages);
669
670         if (libcfs_panic_in_progress)
671                 panic_collect_pages(pc);
672         else
673                 collect_pages_on_all_cpus(pc);
674 }
675
/* Return collected pages to the tcd they came from, matching on
 * tage->cpu and tage->type.  cur_head is snapshotted before re-adding,
 * so returned pages end up ahead of anything logged since collection.
 */
static void put_pages_back_on_all_cpus(struct page_collection *pc)
{
        struct cfs_trace_cpu_data *tcd;
        struct list_head *cur_head;
        struct cfs_trace_page *tage;
        struct cfs_trace_page *tmp;
        int i, cpu;

        for_each_possible_cpu(cpu) {
                cfs_tcd_for_each_type_lock(tcd, i, cpu) {
                        cur_head = tcd->tcd_pages.next;

                        list_for_each_entry_safe(tage, tmp, &pc->pc_pages,
                                                 linkage) {

                                __LASSERT_TAGE_INVARIANT(tage);

                                /* only pages belonging to this tcd */
                                if (tage->cpu != cpu || tage->type != i)
                                        continue;

                                cfs_tage_to_tail(tage, cur_head);
                                tcd->tcd_cur_pages++;
                        }
                }
        }
}
702
703 static void put_pages_back(struct page_collection *pc)
704 {
705         if (!libcfs_panic_in_progress)
706                 put_pages_back_on_all_cpus(pc);
707 }
708
709 #ifdef LNET_DUMP_ON_PANIC
710 void cfs_trace_debug_print(void)
711 {
712         struct page_collection pc;
713         struct cfs_trace_page *tage;
714         struct cfs_trace_page *tmp;
715         struct page *page;
716
717         collect_pages(&pc);
718         list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
719                 char *p, *file, *fn;
720
721                 __LASSERT_TAGE_INVARIANT(tage);
722
723                 page = tage->page;
724                 p = page_address(page);
725                 while (p < ((char *)page_address(page) + tage->used)) {
726                         struct ptldebug_header *hdr;
727                         int len;
728                         hdr = (void *)p;
729                         p += sizeof(*hdr);
730                         file = p;
731                         p += strlen(file) + 1;
732                         fn = p;
733                         p += strlen(fn) + 1;
734                         len = hdr->ph_len - (int)(p - (char *)hdr);
735
736                         cfs_print_to_console(hdr, D_EMERG, file, fn,
737                                              "%.*s", len, p);
738
739                         p += len;
740                 }
741
742                 list_del(&tage->linkage);
743                 cfs_tage_free(tage);
744         }
745         down_write(&cfs_tracefile_sem);
746         while ((page = list_first_entry_or_null(&daemon_pages,
747                                                 struct page, lru)) != NULL) {
748                 char *p, *file, *fn;
749
750                 p = page_address(page);
751                 while (p < ((char *)page_address(page) + page->private)) {
752                         struct ptldebug_header *hdr;
753                         int len;
754
755                         hdr = (void *)p;
756                         p += sizeof(*hdr);
757                         file = p;
758                         p += strlen(file) + 1;
759                         fn = p;
760                         p += strlen(fn) + 1;
761                         len = hdr->ph_len - (int)(p - (char *)hdr);
762
763                         cfs_print_to_console(hdr, D_EMERG, file, fn,
764                                              "%.*s", len, p);
765
766                         p += len;
767                 }
768                 list_del_init(&page->lru);
769                 daemon_pages_count -= 1;
770                 put_page(page);
771         }
772         up_write(&cfs_tracefile_sem);
773 }
774 #endif /* LNET_DUMP_ON_PANIC */
775
/* Dump all buffered trace pages (per-CPU buffers plus daemon_pages) to
 * @filename, created exclusively with mode 0600.  Pages successfully
 * written are freed; on a short write the remaining collected pages are
 * put back on their tcds.  Returns 0 on success or a negative errno.
 */
int cfs_tracefile_dump_all_pages(char *filename)
{
        struct page_collection pc;
        struct file *filp;
        struct cfs_trace_page *tage;
        struct cfs_trace_page *tmp;
        char *buf;
        struct page *page;
        int rc;

        down_write(&cfs_tracefile_sem);

        filp = filp_open(filename, O_CREAT|O_EXCL|O_WRONLY|O_LARGEFILE, 0600);
        if (IS_ERR(filp)) {
                rc = PTR_ERR(filp);
                filp = NULL;
                pr_err("LustreError: can't open %s for dump: rc = %d\n",
                       filename, rc);
                goto out;
        }

        collect_pages(&pc);
        if (list_empty(&pc.pc_pages)) {
                rc = 0;
                goto close;
        }

        /* ok, for now, just write the pages.  in the future we'll be building
         * iobufs with the pages and calling generic_direct_IO */
        list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {

                __LASSERT_TAGE_INVARIANT(tage);

                buf = kmap(tage->page);
                rc = cfs_kernel_write(filp, buf, tage->used, &filp->f_pos);
                kunmap(tage->page);
                if (rc != (int)tage->used) {
                        pr_warn("Lustre: wanted to write %u but wrote %d\n",
                                tage->used, rc);
                        /* short write: give the remaining pages back */
                        put_pages_back(&pc);
                        __LASSERT(list_empty(&pc.pc_pages));
                        break;
                }
                list_del(&tage->linkage);
                cfs_tage_free(tage);
        }
        /* also write out pages the daemon has already processed */
        while ((page = list_first_entry_or_null(&daemon_pages,
                                                struct page, lru)) != NULL) {
                buf = page_address(page);
                rc = cfs_kernel_write(filp, buf, page->private, &filp->f_pos);
                if (rc != (int)page->private) {
                        pr_warn("Lustre: wanted to write %u but wrote %d\n",
                                (int)page->private, rc);
                        break;
                }
                list_del(&page->lru);
                daemon_pages_count -= 1;
                put_page(page);
        }
        rc = vfs_fsync_range(filp, 0, LLONG_MAX, 1);
        if (rc)
                pr_err("LustreError: sync returns: rc = %d\n", rc);
close:
        filp_close(filp, NULL);
out:
        up_write(&cfs_tracefile_sem);
        return rc;
}
844
845 void cfs_trace_flush_pages(void)
846 {
847         struct page_collection pc;
848         struct cfs_trace_page *tage;
849         struct page *page;
850
851         collect_pages(&pc);
852         while (!list_empty(&pc.pc_pages)) {
853                 tage = list_first_entry(&pc.pc_pages,
854                                         struct cfs_trace_page, linkage);
855                 __LASSERT_TAGE_INVARIANT(tage);
856
857                 list_del(&tage->linkage);
858                 cfs_tage_free(tage);
859         }
860
861         down_write(&cfs_tracefile_sem);
862         while ((page = list_first_entry_or_null(&daemon_pages,
863                                                 struct page, lru)) != NULL) {
864                 list_del(&page->lru);
865                 daemon_pages_count -= 1;
866                 put_page(page);
867         }
868         up_write(&cfs_tracefile_sem);
869 }
870
871 int cfs_trace_copyout_string(char __user *usr_buffer, int usr_buffer_nob,
872                              const char *knl_buffer, char *append)
873 {
874         /* NB if 'append' != NULL, it's a single character to append to the
875          * copied out string - usually "\n", for /proc entries and "" (i.e. a
876          * terminating zero byte) for sysctl entries */
877         int   nob = strlen(knl_buffer);
878
879         if (nob > usr_buffer_nob)
880                 nob = usr_buffer_nob;
881
882         if (copy_to_user(usr_buffer, knl_buffer, nob))
883                 return -EFAULT;
884
885         if (append != NULL && nob < usr_buffer_nob) {
886                 if (copy_to_user(usr_buffer + nob, append, 1))
887                         return -EFAULT;
888
889                 nob++;
890         }
891
892         return nob;
893 }
894 EXPORT_SYMBOL(cfs_trace_copyout_string);
895
896 int cfs_trace_dump_debug_buffer_usrstr(void __user *usr_str, int usr_str_nob)
897 {
898         char *str;
899         char *path;
900         int rc;
901
902         str = memdup_user_nul(usr_str, usr_str_nob);
903         if (IS_ERR(str))
904                 return PTR_ERR(str);
905
906         path = strim(str);
907         if (path[0] != '/')
908                 rc = -EINVAL;
909         else
910                 rc = cfs_tracefile_dump_all_pages(path);
911         kfree(str);
912
913         return rc;
914 }
915
916 int cfs_trace_daemon_command(char *str)
917 {
918         int       rc = 0;
919
920         down_write(&cfs_tracefile_sem);
921
922         if (strcmp(str, "stop") == 0) {
923                 up_write(&cfs_tracefile_sem);
924                 cfs_trace_stop_thread();
925                 down_write(&cfs_tracefile_sem);
926                 memset(cfs_tracefile, 0, sizeof(cfs_tracefile));
927
928         } else if (strncmp(str, "size=", 5) == 0) {
929                 unsigned long tmp;
930
931                 rc = kstrtoul(str + 5, 10, &tmp);
932                 if (!rc) {
933                         if (tmp < 10 || tmp > 20480)
934                                 cfs_tracefile_size = CFS_TRACEFILE_SIZE;
935                         else
936                                 cfs_tracefile_size = tmp << 20;
937                 }
938         } else if (strlen(str) >= sizeof(cfs_tracefile)) {
939                 rc = -ENAMETOOLONG;
940         } else if (str[0] != '/') {
941                 rc = -EINVAL;
942         } else {
943                 strcpy(cfs_tracefile, str);
944
945                 pr_info("Lustre: debug daemon will attempt to start writing to %s (%lukB max)\n",
946                         cfs_tracefile, (long)(cfs_tracefile_size >> 10));
947
948                 cfs_trace_start_thread();
949         }
950
951         up_write(&cfs_tracefile_sem);
952         return rc;
953 }
954
955 int cfs_trace_daemon_command_usrstr(void __user *usr_str, int usr_str_nob)
956 {
957         char *str;
958         int   rc;
959
960         str = memdup_user_nul(usr_str, usr_str_nob);
961         if (IS_ERR(str))
962                 return PTR_ERR(str);
963
964         rc = cfs_trace_daemon_command(strim(str));
965         kfree(str);
966
967         return rc;
968 }
969
970 int cfs_trace_set_debug_mb(int mb)
971 {
972         int i;
973         int j;
974         unsigned long pages;
975         unsigned long total_mb = (cfs_totalram_pages() >> (20 - PAGE_SHIFT));
976         unsigned long limit = max_t(unsigned long, 512, (total_mb * 4) / 5);
977         struct cfs_trace_cpu_data *tcd;
978
979         if (mb < num_possible_cpus()) {
980                 pr_warn("Lustre: %d MB is too small for debug buffer size, setting it to %d MB.\n",
981                         mb, num_possible_cpus());
982                 mb = num_possible_cpus();
983         }
984
985         if (mb > limit) {
986                 pr_warn("Lustre: %d MB is too large for debug buffer size, setting it to %lu MB.\n",
987                         mb, limit);
988                 mb = limit;
989         }
990
991         mb /= num_possible_cpus();
992         pages = mb << (20 - PAGE_SHIFT);
993
994         down_write(&cfs_tracefile_sem);
995
996         cfs_tcd_for_each(tcd, i, j)
997                 tcd->tcd_max_pages = (pages * tcd->tcd_pages_factor) / 100;
998
999         daemon_pages_max = pages;
1000         up_write(&cfs_tracefile_sem);
1001
1002         return mb;
1003 }
1004
1005 int cfs_trace_get_debug_mb(void)
1006 {
1007         int i;
1008         int j;
1009         struct cfs_trace_cpu_data *tcd;
1010         int total_pages = 0;
1011
1012         down_read(&cfs_tracefile_sem);
1013
1014         cfs_tcd_for_each(tcd, i, j)
1015                 total_pages += tcd->tcd_max_pages;
1016
1017         up_read(&cfs_tracefile_sem);
1018
1019         return (total_pages >> (20 - PAGE_SHIFT)) + 1;
1020 }
1021
/*
 * Body of the debug-daemon kthread ("ktracefiled").
 *
 * Once per second it drains all per-CPU trace pages (collect_pages()),
 * appends their contents to the file named by cfs_tracefile (if one is
 * configured), and then hands the raw pages over to the global
 * daemon_pages list, which is trimmed back to daemon_pages_max entries.
 * When kthread_stop() is requested, one final drain pass runs before
 * the thread exits.
 */
static int tracefiled(void *arg)
{
	struct page_collection pc;
	struct cfs_trace_page *tage;
	struct cfs_trace_page *tmp;
	struct file *filp;
	char *buf;
	int last_loop = 0;
	int rc;

	while (!last_loop) {
		LIST_HEAD(for_daemon_pages);
		int for_daemon_pages_count = 0;
		schedule_timeout_interruptible(cfs_time_seconds(1));
		if (kthread_should_stop())
			/* make one final pass, then leave the loop */
			last_loop = 1;
		collect_pages(&pc);
		if (list_empty(&pc.pc_pages))
			continue;

		/* open the output file, if one has been configured */
		filp = NULL;
		down_read(&cfs_tracefile_sem);
		if (cfs_tracefile[0] != 0) {
			filp = filp_open(cfs_tracefile,
					 O_CREAT | O_RDWR | O_LARGEFILE,
					 0600);
			if (IS_ERR(filp)) {
				rc = PTR_ERR(filp);
				filp = NULL;
				pr_warn("Lustre: couldn't open %s: rc = %d\n",
					cfs_tracefile, rc);
			}
		}
		up_read(&cfs_tracefile_sem);

		list_for_each_entry_safe(tage, tmp, &pc.pc_pages, linkage) {
			__LASSERT_TAGE_INVARIANT(tage);

			if (filp) {
				struct dentry *de = file_dentry(filp);
				/* static so the write position persists
				 * across iterations and file reopens */
				static loff_t f_pos;

				/* wrap to the start once the file reaches
				 * cfs_tracefile_size; never seek past the
				 * current end of file */
				if (f_pos >= (off_t)cfs_tracefile_size)
					f_pos = 0;
				else if (f_pos > i_size_read(de->d_inode))
					f_pos = i_size_read(de->d_inode);

				buf = kmap(tage->page);
				rc = cfs_kernel_write(filp, buf, tage->used,
						      &f_pos);
				kunmap(tage->page);
				if (rc != (int)tage->used) {
					pr_warn("Lustre: wanted to write %u but wrote %d\n",
						tage->used, rc);
					/* short write: return the unwritten
					 * pages to their CPU buffers so they
					 * can be retried on the next pass */
					put_pages_back(&pc);
					__LASSERT(list_empty(&pc.pc_pages));
					break;
				}
			}
			/* hand the raw page over to the daemon list; the
			 * number of used bytes travels in page->private */
			list_del_init(&tage->linkage);
			list_add_tail(&tage->page->lru, &for_daemon_pages);
			for_daemon_pages_count += 1;

			tage->page->private = (int)tage->used;
			kfree(tage);
			atomic_dec(&cfs_tage_allocated);
		}

		if (filp)
			filp_close(filp, NULL);

		/* publish the batch, then trim the list to its limit by
		 * dropping the oldest pages first */
		down_write(&cfs_tracefile_sem);
		list_splice_tail(&for_daemon_pages, &daemon_pages);
		daemon_pages_count += for_daemon_pages_count;
		while (daemon_pages_count > daemon_pages_max) {
			struct page *p = list_first_entry(&daemon_pages,
							  struct page, lru);
			list_del(&p->lru);
			put_page(p);
			daemon_pages_count -= 1;
		}
		up_write(&cfs_tracefile_sem);

		/* diagnostics: pc should be empty here unless a write
		 * failed above and pages were put back */
		if (!list_empty(&pc.pc_pages)) {
			int i;

			pr_alert("Lustre: trace pages aren't empty\n");
			pr_err("Lustre: total cpus(%d): ", num_possible_cpus());
			for (i = 0; i < num_possible_cpus(); i++)
				if (cpu_online(i))
					pr_cont("%d(on) ", i);
				else
					pr_cont("%d(off) ", i);
			pr_cont("\n");

			i = 0;
			list_for_each_entry_safe(tage, tmp, &pc.pc_pages,
						 linkage)
				pr_err("Lustre: page %d belongs to cpu %d\n",
				       ++i, tage->cpu);
			pr_err("Lustre: There are %d pages unwritten\n", i);
		}
		__LASSERT(list_empty(&pc.pc_pages));
	}

	return 0;
}
1129
1130 int cfs_trace_start_thread(void)
1131 {
1132         struct task_struct *tsk;
1133         int rc = 0;
1134
1135         if (tctl_task)
1136                 return 0;
1137
1138         tsk = kthread_create(tracefiled, NULL, "ktracefiled");
1139         if (IS_ERR(tsk))
1140                 rc = -ECHILD;
1141         else if (cmpxchg(&tctl_task, NULL, tsk) != NULL)
1142                 /* already running */
1143                 kthread_stop(tsk);
1144         else
1145                 wake_up_process(tsk);
1146
1147         return rc;
1148 }
1149
1150 void cfs_trace_stop_thread(void)
1151 {
1152         struct task_struct *tsk;
1153
1154         tsk = xchg(&tctl_task, NULL);
1155         if (tsk) {
1156                 pr_info("Lustre: shutting down debug daemon thread...\n");
1157                 kthread_stop(tsk);
1158         }
1159 }
1160
/* percents to share the total debug memory for each type; indexed by
 * enum cfs_trace_buf_type, and the entries sum to 100 so the whole
 * page budget is handed out by cfs_tracefile_init() and
 * cfs_trace_set_debug_mb() */
static unsigned int pages_factor[CFS_TCD_TYPE_CNT] = {
	80, /* 80% pages for CFS_TCD_TYPE_PROC */
	10, /* 10% pages for CFS_TCD_TYPE_SOFTIRQ */
	10  /* 10% pages for CFS_TCD_TYPE_IRQ */
};
1167
1168 int cfs_tracefile_init(int max_pages)
1169 {
1170         struct cfs_trace_cpu_data *tcd;
1171         int i;
1172         int j;
1173
1174         /* initialize trace_data */
1175         memset(cfs_trace_data, 0, sizeof(cfs_trace_data));
1176         for (i = 0; i < CFS_TCD_TYPE_CNT; i++) {
1177                 cfs_trace_data[i] =
1178                         kmalloc_array(num_possible_cpus(),
1179                                       sizeof(union cfs_trace_data_union),
1180                                       GFP_KERNEL);
1181                 if (!cfs_trace_data[i])
1182                         goto out_trace_data;
1183         }
1184
1185         /* arch related info initialized */
1186         cfs_tcd_for_each(tcd, i, j) {
1187                 int factor = pages_factor[i];
1188
1189                 spin_lock_init(&tcd->tcd_lock);
1190                 tcd->tcd_pages_factor = factor;
1191                 tcd->tcd_type = i;
1192                 tcd->tcd_cpu = j;
1193
1194                 INIT_LIST_HEAD(&tcd->tcd_pages);
1195                 INIT_LIST_HEAD(&tcd->tcd_stock_pages);
1196                 tcd->tcd_cur_pages = 0;
1197                 tcd->tcd_cur_stock_pages = 0;
1198                 tcd->tcd_max_pages = (max_pages * factor) / 100;
1199                 LASSERT(tcd->tcd_max_pages > 0);
1200                 tcd->tcd_shutting_down = 0;
1201         }
1202         daemon_pages_max = max_pages;
1203
1204         return 0;
1205
1206 out_trace_data:
1207         for (i = 0; cfs_trace_data[i]; i++) {
1208                 kfree(cfs_trace_data[i]);
1209                 cfs_trace_data[i] = NULL;
1210         }
1211         pr_err("lnet: Not enough memory\n");
1212         return -ENOMEM;
1213 }
1214
1215 static void trace_cleanup_on_all_cpus(void)
1216 {
1217         struct cfs_trace_cpu_data *tcd;
1218         struct cfs_trace_page *tage;
1219         int i, cpu;
1220
1221         for_each_possible_cpu(cpu) {
1222                 cfs_tcd_for_each_type_lock(tcd, i, cpu) {
1223                         if (!tcd->tcd_pages_factor)
1224                                 /* Not initialised */
1225                                 continue;
1226                         tcd->tcd_shutting_down = 1;
1227
1228                         while (!list_empty(&tcd->tcd_pages)) {
1229                                 tage = list_first_entry(&tcd->tcd_pages,
1230                                                         struct cfs_trace_page,
1231                                                         linkage);
1232                                 __LASSERT_TAGE_INVARIANT(tage);
1233
1234                                 list_del(&tage->linkage);
1235                                 cfs_tage_free(tage);
1236                         }
1237                         tcd->tcd_cur_pages = 0;
1238                 }
1239         }
1240 }
1241
1242 static void cfs_trace_cleanup(void)
1243 {
1244         struct page_collection pc;
1245         int i;
1246
1247         INIT_LIST_HEAD(&pc.pc_pages);
1248
1249         trace_cleanup_on_all_cpus();
1250
1251         for (i = 0; i < CFS_TCD_TYPE_CNT && cfs_trace_data[i]; i++) {
1252                 kfree(cfs_trace_data[i]);
1253                 cfs_trace_data[i] = NULL;
1254         }
1255 }
1256
/* Tear down the tracing subsystem.  Order matters: stop the daemon
 * thread first so nothing races with the flush, then release every
 * buffered page, and finally free the per-CPU data arrays. */
void cfs_tracefile_exit(void)
{
	cfs_trace_stop_thread();
	cfs_trace_flush_pages();
	cfs_trace_cleanup();
}