Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / libcfs / libcfs / watchdog.c
index 89d757c..4709831 100644 (file)
@@ -1,28 +1,45 @@
 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
  * vim:expandtab:shiftwidth=8:tabstop=8:
  *
- * Copyright (C) 2004 Cluster File Systems, Inc.
- *   Author: Jacob Berkman <jacob@clusterfs.com>
+ * GPL HEADER START
  *
- *   This file is part of Lustre, http://www.lustre.org.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
- *   Lustre is free software; you can redistribute it and/or
- *   modify it under the terms of version 2 of the GNU General Public
- *   License as published by the Free Software Foundation.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
  *
- *   Lustre is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *   GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
  *
- *   You should have received a copy of the GNU General Public License
- *   along with Lustre; if not, write to the Free Software
- *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright  2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * libcfs/libcfs/watchdog.c
+ *
+ * Author: Jacob Berkman <jacob@clusterfs.com>
  */
 
 #define DEBUG_SUBSYSTEM S_LNET
 
-#include <libcfs/kp30.h>
 #include <libcfs/libcfs.h>
 #include "tracefile.h"
 
@@ -53,7 +70,7 @@ struct lc_watchdog {
  */
 static struct completion lcw_start_completion;
 static struct completion lcw_stop_completion;
-static wait_queue_head_t lcw_event_waitq;
+static cfs_waitq_t lcw_event_waitq;
 
 /*
  * Set this and wake lcw_event_waitq to stop the dispatcher.
@@ -77,7 +94,12 @@ static DECLARE_MUTEX(lcw_refcount_sem);
  */
 static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; /* BH lock! */
 static struct list_head lcw_pending_timers = \
-        LIST_HEAD_INIT(lcw_pending_timers);
+        CFS_LIST_HEAD_INIT(lcw_pending_timers);
+
+/* Last time a watchdog expired */
+static cfs_time_t lcw_last_watchdog_time;
+static int lcw_recent_watchdog_count;
+static spinlock_t lcw_last_watchdog_lock = SPIN_LOCK_UNLOCKED;
 
 #ifdef HAVE_TASKLIST_LOCK
 static void
@@ -90,11 +112,11 @@ lcw_dump(struct lc_watchdog *lcw)
         tsk = find_task_by_pid(lcw->lcw_pid);
 
         if (tsk == NULL) {
-                CWARN("Process %d was not found in the task list; "
-                      "watchdog callback may be incomplete\n", (int)lcw->lcw_pid);
+                CWARN("Process " LPPID " was not found in the task list; "
+                      "watchdog callback may be incomplete\n", lcw->lcw_pid);
         } else if (tsk != lcw->lcw_task) {
-                CWARN("The current process %d did not set the watchdog; "
-                      "watchdog callback may be incomplete\n", (int)lcw->lcw_pid);
+                CWARN("The current process " LPPID " did not set the watchdog; "
+                      "watchdog callback may be incomplete\n", lcw->lcw_pid);
         } else {
                 libcfs_debug_dumpstack(tsk);
         }
@@ -110,9 +132,11 @@ lcw_dump(struct lc_watchdog *lcw)
 }
 #endif
 
-static void lcw_cb(unsigned long data)
+static void lcw_cb(ulong_ptr_t data)
 {
         struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+        cfs_time_t current_time;
+        cfs_duration_t delta_time;
 
         ENTRY;
 
@@ -122,19 +146,43 @@ static void lcw_cb(unsigned long data)
         }
 
         lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+        current_time = cfs_time_current();
+
+        /* Check whether to throttle the watchdog so that too many dumps
+         * do not reach the console and trigger an NMI.  Normally we would
+         * not hold the spin lock across the CWARN, but here we hold it so
+         * that non-rate-limited lcw_dump() output is not interleaved on
+         * the console, which would make it hard to read. */
+        spin_lock_bh(&lcw_last_watchdog_lock);
+        delta_time = cfs_duration_sec(current_time - lcw_last_watchdog_time);
+
+        if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) {
+                CWARN("Refusing to fire watchdog for pid %d: it was inactive "
+                      "for %ldms. Rate limiting 1 per %d seconds.\n",
+                      (int)lcw->lcw_pid,cfs_duration_sec(lcw->lcw_time) * 1000,
+                      libcfs_watchdog_ratelimit);
+        } else {
+                if (delta_time < libcfs_watchdog_ratelimit) {
+                        lcw_recent_watchdog_count++;
+                } else {
+                        memcpy(&lcw_last_watchdog_time, &current_time,
+                               sizeof(current_time));
+                        lcw_recent_watchdog_count = 0;
+                }
 
-        /* NB this warning should appear on the console, but may not get into
-         * the logs since we're running in a softirq handler */
-
-        CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n",
-              (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
-        lcw_dump(lcw);
+                /* This warning should appear on the console, but may not get
+                 * into the logs since we're running in a softirq handler */
+                CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n",
+                      (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
+                lcw_dump(lcw);
+       }
 
+        spin_unlock_bh(&lcw_last_watchdog_lock);
         spin_lock_bh(&lcw_pending_timers_lock);
 
         if (list_empty(&lcw->lcw_list)) {
                 list_add(&lcw->lcw_list, &lcw_pending_timers);
-                wake_up(&lcw_event_waitq);
+                cfs_waitq_signal(&lcw_event_waitq);
         }
 
         spin_unlock_bh(&lcw_pending_timers_lock);
@@ -173,7 +221,7 @@ static int lcw_dispatch_main(void *data)
         complete(&lcw_start_completion);
 
         while (1) {
-                wait_event_interruptible(lcw_event_waitq, is_watchdog_fired());
+                cfs_wait_event_interruptible(lcw_event_waitq, is_watchdog_fired(), rc);
                 CDEBUG(D_INFO, "Watchdog got woken up...\n");
                 if (test_bit(LCW_FLAG_STOP, &lcw_flags)) {
                         CDEBUG(D_INFO, "LCW_FLAG_STOP was set, shutting down...\n");
@@ -197,9 +245,8 @@ static int lcw_dispatch_main(void *data)
                         list_del_init(&lcw->lcw_list);
                         spin_unlock_bh(&lcw_pending_timers_lock);
 
-                        CDEBUG(D_INFO, "found lcw for pid %d: inactive for "
-                               "%lds\n", (int)lcw->lcw_pid,
-                               cfs_duration_sec(lcw->lcw_time));
+                        CDEBUG(D_INFO, "found lcw for pid " LPPID ": inactive for "
+                               "%lds\n", lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
 
                         if (lcw->lcw_state != LC_WATCHDOG_DISABLED)
                                 lcw->lcw_callback(lcw->lcw_pid, lcw->lcw_data);
@@ -223,7 +270,7 @@ static void lcw_dispatch_start(void)
 
         init_completion(&lcw_stop_completion);
         init_completion(&lcw_start_completion);
-        init_waitqueue_head(&lcw_event_waitq);
+        cfs_waitq_init(&lcw_event_waitq);
 
         CDEBUG(D_INFO, "starting dispatch thread\n");
         rc = kernel_thread(lcw_dispatch_main, NULL, 0);
@@ -246,7 +293,7 @@ static void lcw_dispatch_stop(void)
         CDEBUG(D_INFO, "trying to stop watchdog dispatcher.\n");
 
         set_bit(LCW_FLAG_STOP, &lcw_flags);
-        wake_up(&lcw_event_waitq);
+        cfs_waitq_signal(&lcw_event_waitq);
 
         wait_for_completion(&lcw_stop_completion);
 
@@ -275,12 +322,8 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms,
         lcw->lcw_data     = data;
         lcw->lcw_state    = LC_WATCHDOG_DISABLED;
 
-        INIT_LIST_HEAD(&lcw->lcw_list);
-
-        lcw->lcw_timer.function = lcw_cb;
-        lcw->lcw_timer.data = (unsigned long)lcw;
-        lcw->lcw_timer.expires = jiffies + lcw->lcw_time;
-        init_timer(&lcw->lcw_timer);
+        CFS_INIT_LIST_HEAD(&lcw->lcw_list);
+        cfs_timer_init(&lcw->lcw_timer, lcw_cb, lcw);
 
         down(&lcw_refcount_sem);
         if (++lcw_refcount == 1)
@@ -290,7 +333,8 @@ struct lc_watchdog *lc_watchdog_add(int timeout_ms,
         /* Keep this working in case we enable them by default */
         if (lcw->lcw_state == LC_WATCHDOG_ENABLED) {
                 do_gettimeofday(&lcw->lcw_last_touched);
-                add_timer(&lcw->lcw_timer);
+                cfs_timer_arm(&lcw->lcw_timer, lcw->lcw_time + 
+                              cfs_time_current());
         }
 
         RETURN(lcw);
@@ -305,7 +349,7 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
         do_gettimeofday(&newtime);
         if (lcw->lcw_state == LC_WATCHDOG_EXPIRED) {
                 cfs_timeval_sub(&newtime, &lcw->lcw_last_touched, &timediff);
-                CWARN("Expired watchdog for pid %d %s after %lu.%.4lus\n",
+                CWARN("Expired watchdog for pid " LPPID " %s after %lu.%.4lus\n",
                       lcw->lcw_pid,
                       message,
                       timediff.tv_sec,
@@ -326,8 +370,8 @@ void lc_watchdog_touch_ms(struct lc_watchdog *lcw, int timeout_ms)
         lcw_update_time(lcw, "touched");
         lcw->lcw_state = LC_WATCHDOG_ENABLED;
 
-        mod_timer(&lcw->lcw_timer, jiffies +
-                  cfs_time_seconds(timeout_ms) / 1000);
+        cfs_timer_arm(&lcw->lcw_timer, cfs_time_current() +
+                      cfs_time_seconds(timeout_ms) / 1000);
 
         EXIT;
 }
@@ -336,7 +380,7 @@ EXPORT_SYMBOL(lc_watchdog_touch_ms);
 /* deprecated - use above instead */
 void lc_watchdog_touch(struct lc_watchdog *lcw)
 {
-        lc_watchdog_touch_ms(lcw, cfs_duration_sec(lcw->lcw_time) * 1000);
+        lc_watchdog_touch_ms(lcw, (int)cfs_duration_sec(lcw->lcw_time) * 1000);
 }
 EXPORT_SYMBOL(lc_watchdog_touch);
 
@@ -362,7 +406,7 @@ void lc_watchdog_delete(struct lc_watchdog *lcw)
         ENTRY;
         LASSERT(lcw != NULL);
 
-        del_timer(&lcw->lcw_timer);
+        cfs_timer_disarm(&lcw->lcw_timer);
 
         lcw_update_time(lcw, "deleted");
 
@@ -388,7 +432,7 @@ EXPORT_SYMBOL(lc_watchdog_delete);
 
 void lc_watchdog_dumplog(pid_t pid, void *data)
 {
-        libcfs_debug_dumplog_internal((void *)((unsigned long)pid));
+        libcfs_debug_dumplog_internal((void *)((long_ptr_t)pid));
 }
 EXPORT_SYMBOL(lc_watchdog_dumplog);
 
@@ -424,4 +468,3 @@ void lc_watchdog_delete(struct lc_watchdog *lcw)
 EXPORT_SYMBOL(lc_watchdog_delete);
 
 #endif
-