Whamcloud - gitweb
Branch HEAD
[fs/lustre-release.git] / libcfs / libcfs / watchdog.c
index 6c90188..4e5cc42 100644 (file)
@@ -96,14 +96,25 @@ static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; /* BH lock! */
 static struct list_head lcw_pending_timers = \
         CFS_LIST_HEAD_INIT(lcw_pending_timers);
 
-#ifdef HAVE_TASKLIST_LOCK
+/* Last time a watchdog expired */
+static cfs_time_t lcw_last_watchdog_time;
+static int lcw_recent_watchdog_count;
+static spinlock_t lcw_last_watchdog_lock = SPIN_LOCK_UNLOCKED;
+
 static void
 lcw_dump(struct lc_watchdog *lcw)
 {
         cfs_task_t *tsk;
+#if defined(HAVE_TASKLIST_LOCK)
+        read_lock(&tasklist_lock);
+#elif defined(HAVE_TASK_RCU)
+        rcu_read_lock();
+#else
+        CERROR("unable to dump stack because of missing export\n"); 
+        return;
+#endif
         ENTRY;
 
-        read_lock(&tasklist_lock);
         tsk = find_task_by_pid(lcw->lcw_pid);
 
         if (tsk == NULL) {
@@ -115,21 +126,20 @@ lcw_dump(struct lc_watchdog *lcw)
         } else {
                 libcfs_debug_dumpstack(tsk);
         }
-        
+
+#if defined(HAVE_TASKLIST_LOCK)
         read_unlock(&tasklist_lock);
+#elif defined(HAVE_TASK_RCU)
+        rcu_read_unlock();
+#endif
         EXIT;
 }
-#else
-static void
-lcw_dump(struct lc_watchdog *lcw)
-{
-        CERROR("unable to dump stack because of missing export\n");
-}
-#endif
 
 static void lcw_cb(ulong_ptr_t data)
 {
         struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+        cfs_time_t current_time;
+        cfs_duration_t delta_time;
 
         ENTRY;
 
@@ -139,14 +149,38 @@ static void lcw_cb(ulong_ptr_t data)
         }
 
         lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+        current_time = cfs_time_current();
+
+        /* Check to see if we should throttle the watchdog timer to avoid
+         * too many dumps going to the console, thus triggering an NMI.
+         * Normally we would not hold the spin lock over the CWARN, but here
+         * we hold it so that non-rate-limited lcw_dump() output is not
+         * interleaved on the console, which would make it hard to read. */
+        spin_lock_bh(&lcw_last_watchdog_lock);
+        delta_time = cfs_duration_sec(current_time - lcw_last_watchdog_time);
+
+        if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) {
+                CWARN("Refusing to fire watchdog for pid %d: it was inactive "
+                      "for %ldms. Rate limiting 1 per %d seconds.\n",
+                      (int)lcw->lcw_pid,cfs_duration_sec(lcw->lcw_time) * 1000,
+                      libcfs_watchdog_ratelimit);
+        } else {
+                if (delta_time < libcfs_watchdog_ratelimit) {
+                        lcw_recent_watchdog_count++;
+                } else {
+                        memcpy(&lcw_last_watchdog_time, &current_time,
+                               sizeof(current_time));
+                        lcw_recent_watchdog_count = 0;
+                }
 
-        /* NB this warning should appear on the console, but may not get into
-         * the logs since we're running in a softirq handler */
-
-        CWARN("Watchdog triggered for pid: " LPPID " it was inactive for %lds\n",
-              lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
-        lcw_dump(lcw);
+               /* This warning should appear on the console, but may not get
+                * into the logs since we're running in a softirq handler */
+                CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n",
+                      (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
+                lcw_dump(lcw);
+       }
 
+        spin_unlock_bh(&lcw_last_watchdog_lock);
         spin_lock_bh(&lcw_pending_timers_lock);
 
         if (list_empty(&lcw->lcw_list)) {