X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=libcfs%2Flibcfs%2Fwatchdog.c;h=47098317da4786fb108171c1433185a2fd10abca;hb=5169587d443611db5d1d699e4978162cbab219e3;hp=89d757c2afb7ae4c1c7135fe002d72ace9eb1bb8;hpb=e1b3d71a27c166bebd26ab33f7299c41bd75dab5;p=fs%2Flustre-release.git diff --git a/libcfs/libcfs/watchdog.c b/libcfs/libcfs/watchdog.c index 89d757c..4709831 100644 --- a/libcfs/libcfs/watchdog.c +++ b/libcfs/libcfs/watchdog.c @@ -1,28 +1,45 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: * - * Copyright (C) 2004 Cluster File Systems, Inc. - * Author: Jacob Berkman + * GPL HEADER START * - * This file is part of Lustre, http://www.lustre.org. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * - * Lustre is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. * - * Lustre is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). * - * You should have received a copy of the GNU General Public License - * along with Lustre; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * libcfs/libcfs/watchdog.c + * + * Author: Jacob Berkman */ #define DEBUG_SUBSYSTEM S_LNET -#include #include #include "tracefile.h" @@ -53,7 +70,7 @@ struct lc_watchdog { */ static struct completion lcw_start_completion; static struct completion lcw_stop_completion; -static wait_queue_head_t lcw_event_waitq; +static cfs_waitq_t lcw_event_waitq; /* * Set this and wake lcw_event_waitq to stop the dispatcher. @@ -77,7 +94,12 @@ static DECLARE_MUTEX(lcw_refcount_sem); */ static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; /* BH lock! 
*/ static struct list_head lcw_pending_timers = \ - LIST_HEAD_INIT(lcw_pending_timers); + CFS_LIST_HEAD_INIT(lcw_pending_timers); + +/* Last time a watchdog expired */ +static cfs_time_t lcw_last_watchdog_time; +static int lcw_recent_watchdog_count; +static spinlock_t lcw_last_watchdog_lock = SPIN_LOCK_UNLOCKED; #ifdef HAVE_TASKLIST_LOCK static void @@ -90,11 +112,11 @@ lcw_dump(struct lc_watchdog *lcw) tsk = find_task_by_pid(lcw->lcw_pid); if (tsk == NULL) { - CWARN("Process %d was not found in the task list; " - "watchdog callback may be incomplete\n", (int)lcw->lcw_pid); + CWARN("Process " LPPID " was not found in the task list; " + "watchdog callback may be incomplete\n", lcw->lcw_pid); } else if (tsk != lcw->lcw_task) { - CWARN("The current process %d did not set the watchdog; " - "watchdog callback may be incomplete\n", (int)lcw->lcw_pid); + CWARN("The current process " LPPID " did not set the watchdog; " + "watchdog callback may be incomplete\n", lcw->lcw_pid); } else { libcfs_debug_dumpstack(tsk); } @@ -110,9 +132,11 @@ lcw_dump(struct lc_watchdog *lcw) } #endif -static void lcw_cb(unsigned long data) +static void lcw_cb(ulong_ptr_t data) { struct lc_watchdog *lcw = (struct lc_watchdog *)data; + cfs_time_t current_time; + cfs_duration_t delta_time; ENTRY; @@ -122,19 +146,43 @@ static void lcw_cb(unsigned long data) } lcw->lcw_state = LC_WATCHDOG_EXPIRED; + current_time = cfs_time_current(); + + /* Check to see if we should throttle the watchdog timer to avoid + * too many dumps going to the console thus triggering an NMI. + * Normally we would not hold the spin lock over the CWARN but in + * this case we hold it to ensure non ratelimited lcw_dumps are not + * interleaved on the console making them hard to read. 
*/ + spin_lock_bh(&lcw_last_watchdog_lock); + delta_time = cfs_duration_sec(current_time - lcw_last_watchdog_time); + + if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) { + CWARN("Refusing to fire watchdog for pid %d: it was inactive " + "for %ldms. Rate limiting 1 per %d seconds.\n", + (int)lcw->lcw_pid,cfs_duration_sec(lcw->lcw_time) * 1000, + libcfs_watchdog_ratelimit); + } else { + if (delta_time < libcfs_watchdog_ratelimit) { + lcw_recent_watchdog_count++; + } else { + memcpy(&lcw_last_watchdog_time, &current_time, + sizeof(current_time)); + lcw_recent_watchdog_count = 0; + } - /* NB this warning should appear on the console, but may not get into - * the logs since we're running in a softirq handler */ - - CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n", - (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time)); - lcw_dump(lcw); + /* This warning should appear on the console, but may not get + * into the logs since we're running in a softirq handler */ + CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n", + (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time)); + lcw_dump(lcw); + } + spin_unlock_bh(&lcw_last_watchdog_lock); spin_lock_bh(&lcw_pending_timers_lock); if (list_empty(&lcw->lcw_list)) { list_add(&lcw->lcw_list, &lcw_pending_timers); - wake_up(&lcw_event_waitq); + cfs_waitq_signal(&lcw_event_waitq); } spin_unlock_bh(&lcw_pending_timers_lock); @@ -173,7 +221,7 @@ static int lcw_dispatch_main(void *data) complete(&lcw_start_completion); while (1) { - wait_event_interruptible(lcw_event_waitq, is_watchdog_fired()); + cfs_wait_event_interruptible(lcw_event_waitq, is_watchdog_fired(), rc); CDEBUG(D_INFO, "Watchdog got woken up...\n"); if (test_bit(LCW_FLAG_STOP, &lcw_flags)) { CDEBUG(D_INFO, "LCW_FLAG_STOP was set, shutting down...\n"); @@ -197,9 +245,8 @@ static int lcw_dispatch_main(void *data) list_del_init(&lcw->lcw_list); spin_unlock_bh(&lcw_pending_timers_lock); - CDEBUG(D_INFO, "found lcw for pid %d: 
inactive for " - "%lds\n", (int)lcw->lcw_pid, - cfs_duration_sec(lcw->lcw_time)); + CDEBUG(D_INFO, "found lcw for pid " LPPID ": inactive for " + "%lds\n", lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time)); if (lcw->lcw_state != LC_WATCHDOG_DISABLED) lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data); @@ -223,7 +270,7 @@ static void lcw_dispatch_start(void) init_completion(&lcw_stop_completion); init_completion(&lcw_start_completion); - init_waitqueue_head(&lcw_event_waitq); + cfs_waitq_init(&lcw_event_waitq); CDEBUG(D_INFO, "starting dispatch thread\n"); rc = kernel_thread(lcw_dispatch_main, NULL, 0); @@ -246,7 +293,7 @@ static void lcw_dispatch_stop(void) CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n"); set_bit(LCW_FLAG_STOP, &lcw_flags); - wake_up(&lcw_event_waitq); + cfs_waitq_signal(&lcw_event_waitq); wait_for_completion(&lcw_stop_completion); @@ -275,12 +322,8 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms, lcw->lcw_data = data; lcw->lcw_state = LC_WATCHDOG_DISABLED; - INIT_LIST_HEAD(&lcw->lcw_list); - - lcw->lcw_timer.function = lcw_cb; - lcw->lcw_timer.data = (unsigned long)lcw; - lcw->lcw_timer.expires = jiffies + lcw->lcw_time; - init_timer(&lcw->lcw_timer); + CFS_INIT_LIST_HEAD(&lcw->lcw_list); + cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw); down(&lcw_refcount_sem); if (++lcw_refcount == 1) @@ -290,7 +333,8 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms, /* Keep this working in case we enable them by default */ if (lcw->lcw_state == LC_WATCHDOG_ENABLED) { do_gettimeofday(&lcw->lcw_last_touched); - add_timer(&lcw->lcw_timer); + cfs_timer_arm(&lcw->lcw_timer, lcw->lcw_time + + cfs_time_current()); } RETURN(lcw); @@ -305,7 +349,7 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message) do_gettimeofday(&newtime); if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) { cfs_timeval_sub(&newtime, &lcw->lcw_last_touched, &timediff); - CWARN("Expired watchdog for pid %d %s after %lu.%.4lus\n", + CWARN("Expired watchdog for pid " LPPID " %s 
after %lu.%.4lus\n", lcw->lcw_pid, message, timediff.tv_sec, @@ -326,8 +370,8 @@ void lc_watchdog_touch_ms(struct lc_watchdog *lcw, int timeout_ms) lcw_update_time(lcw, "touched"); lcw->lcw_state = LC_WATCHDOG_ENABLED; - mod_timer(&lcw->lcw_timer, jiffies + - cfs_time_seconds(timeout_ms) / 1000); + cfs_timer_arm(&lcw->lcw_timer, cfs_time_current() + + cfs_time_seconds(timeout_ms) / 1000); EXIT; } @@ -336,7 +380,7 @@ EXPORT_SYMBOL(lc_watchdog_touch_ms); /* deprecated - use above instead */ void lc_watchdog_touch(struct lc_watchdog *lcw) { - lc_watchdog_touch_ms(lcw, cfs_duration_sec(lcw->lcw_time) * 1000); + lc_watchdog_touch_ms(lcw, (int)cfs_duration_sec(lcw->lcw_time) * 1000); } EXPORT_SYMBOL(lc_watchdog_touch); @@ -362,7 +406,7 @@ void lc_watchdog_delete(struct lc_watchdog *lcw) ENTRY; LASSERT(lcw != NULL); - del_timer(&lcw->lcw_timer); + cfs_timer_disarm(&lcw->lcw_timer); lcw_update_time(lcw, "deleted"); @@ -388,7 +432,7 @@ EXPORT_SYMBOL(lc_watchdog_delete); void lc_watchdog_dumplog(pid_t pid, void *data) { - libcfs_debug_dumplog_internal((void *)((unsigned long)pid)); + libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid)); } EXPORT_SYMBOL(lc_watchdog_dumplog); @@ -424,4 +468,3 @@ void lc_watchdog_delete(struct lc_watchdog *lcw) EXPORT_SYMBOL(lc_watchdog_delete); #endif -