PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */
PSDEV_LNET_DEBUG_MB, /* size of debug buffer */
PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */
+ PSDEV_LNET_WATCHDOG_RATELIMIT, /* ratelimit watchdog messages */
};
#else
#define CTL_LNET CTL_UNNUMBERED
DECLARE_PROC_HANDLER(proc_dobitmasks)
+static int min_watchdog_ratelimit = 0; /* lower bound for watchdog_ratelimit; 0 disables ratelimiting */
+static int max_watchdog_ratelimit = (24*60*60); /* upper bound in seconds; limits to once per day */
+
static int __proc_dump_kernel(void *data, int write,
loff_t pos, void *buffer, int nob)
{
.mode = 0644,
.proc_handler = &proc_debug_mb,
},
+ { /* watchdog_ratelimit tunable: an int backed by libcfs_watchdog_ratelimit */
+ .ctl_name = PSDEV_LNET_WATCHDOG_RATELIMIT,
+ .procname = "watchdog_ratelimit",
+ .data = &libcfs_watchdog_ratelimit,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_minmax,
+ .extra1 = &min_watchdog_ratelimit, /* presumably the minimum accepted value - confirm against proc_dointvec_minmax */
+ .extra2 = &max_watchdog_ratelimit, /* presumably the maximum accepted value - confirm against proc_dointvec_minmax */
+ },
{0}
};
static struct list_head lcw_pending_timers = \
CFS_LIST_HEAD_INIT(lcw_pending_timers);
+/* Rate-limiting state for watchdog expiry reports, used by lcw_cb():
+ * lcw_last_watchdog_time is the reference timestamp that elapsed time is
+ * measured from (it is only refreshed once a full ratelimit interval has
+ * passed), lcw_recent_watchdog_count counts the reports made since that
+ * timestamp, and lcw_last_watchdog_lock protects both and is held while
+ * the report is written to the console. */
+static cfs_time_t lcw_last_watchdog_time;
+static int lcw_recent_watchdog_count;
+static spinlock_t lcw_last_watchdog_lock = SPIN_LOCK_UNLOCKED;
+
#ifdef HAVE_TASKLIST_LOCK
static void
lcw_dump(struct lc_watchdog *lcw)
static void lcw_cb(ulong_ptr_t data)
{
struct lc_watchdog *lcw = (struct lc_watchdog *)data;
+ cfs_time_t current_time;
+ cfs_duration_t delta_time;
ENTRY;
}
lcw->lcw_state = LC_WATCHDOG_EXPIRED;
+ current_time = cfs_time_current();
+
+ /* Check to see if we should throttle the watchdog timer to avoid
+ * too many dumps going to the console, thus triggering an NMI.
+ * Normally we would not hold the spin lock over the CWARN, but in
+ * this case we hold it to ensure non-ratelimited lcw_dumps are not
+ * interleaved on the console, making them hard to read. */
+ spin_lock_bh(&lcw_last_watchdog_lock);
+ delta_time = cfs_duration_sec(current_time - lcw_last_watchdog_time);
+
+ if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) {
+ CWARN("Refusing to fire watchdog for pid %d: it was inactive "
+ "for %ldms. Rate limiting 1 per %d seconds.\n",
+ (int)lcw->lcw_pid,cfs_duration_sec(lcw->lcw_time) * 1000,
+ libcfs_watchdog_ratelimit);
+ } else {
+ if (delta_time < libcfs_watchdog_ratelimit) {
+ lcw_recent_watchdog_count++;
+ } else {
+ memcpy(&lcw_last_watchdog_time, &current_time,
+ sizeof(current_time));
+ lcw_recent_watchdog_count = 0;
+ }
- /* NB this warning should appear on the console, but may not get into
- * the logs since we're running in a softirq handler */
-
- CWARN("Watchdog triggered for pid: " LPPID " it was inactive for %lds\n",
- lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
- lcw_dump(lcw);
+ /* This warning should appear on the console, but may not get
+ * into the logs since we're running in a softirq handler */
+ CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n",
+ (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time));
+ lcw_dump(lcw);
+ }
+ spin_unlock_bh(&lcw_last_watchdog_lock);
spin_lock_bh(&lcw_pending_timers_lock);
if (list_empty(&lcw->lcw_list)) {