From: yangsheng Date: Tue, 10 Feb 2009 15:58:55 +0000 (+0000) Subject: Branch HEAD X-Git-Tag: v1_9_160~21 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=682d8ea7e9fe96f05b34c34ed38f41fb1476762f Branch HEAD b=10762 i=adilger, green Rate limit watchdog Author: Jim Garlick(LLNL) --- diff --git a/libcfs/include/libcfs/libcfs_debug.h b/libcfs/include/libcfs/libcfs_debug.h index d2663c8..ff6fbd5 100644 --- a/libcfs/include/libcfs/libcfs_debug.h +++ b/libcfs/include/libcfs/libcfs_debug.h @@ -50,6 +50,7 @@ extern unsigned int libcfs_stack; extern unsigned int libcfs_debug; extern unsigned int libcfs_printk; extern unsigned int libcfs_console_ratelimit; +extern unsigned int libcfs_watchdog_ratelimit; extern cfs_duration_t libcfs_console_max_delay; extern cfs_duration_t libcfs_console_min_delay; extern unsigned int libcfs_console_backoff; diff --git a/libcfs/libcfs/debug.c b/libcfs/libcfs/debug.c index 868be0a..9a3e086 100644 --- a/libcfs/libcfs/debug.c +++ b/libcfs/libcfs/debug.c @@ -102,6 +102,9 @@ EXPORT_SYMBOL(portal_enter_debugger); unsigned int libcfs_catastrophe; EXPORT_SYMBOL(libcfs_catastrophe); +unsigned int libcfs_watchdog_ratelimit = 300; +EXPORT_SYMBOL(libcfs_watchdog_ratelimit); + unsigned int libcfs_panic_on_lbug = 0; CFS_MODULE_PARM(libcfs_panic_on_lbug, "i", uint, 0644, "Lustre kernel panic on LBUG"); diff --git a/libcfs/libcfs/linux/linux-proc.c b/libcfs/libcfs/linux/linux-proc.c index ec6ab0c..765a888 100644 --- a/libcfs/libcfs/linux/linux-proc.c +++ b/libcfs/libcfs/linux/linux-proc.c @@ -103,6 +103,7 @@ enum { PSDEV_LNET_DAEMON_FILE, /* spool kernel debug buffer to file */ PSDEV_LNET_DEBUG_MB, /* size of debug buffer */ PSDEV_LNET_DEBUG_LOG_UPCALL, /* debug log upcall script */ + PSDEV_LNET_WATCHDOG_RATELIMIT, /* ratelimit watchdog messages */ }; #else #define CTL_LNET CTL_UNNUMBERED @@ -188,6 +189,9 @@ static int __proc_dobitmasks(void *data, int write, DECLARE_PROC_HANDLER(proc_dobitmasks) +static int 
min_watchdog_ratelimit = 0; /* disable ratelimiting */ +static int max_watchdog_ratelimit = (24*60*60); /* limit to once per day */ + static int __proc_dump_kernel(void *data, int write, loff_t pos, void *buffer, int nob) { @@ -456,6 +460,16 @@ static cfs_sysctl_table_t lnet_table[] = { .mode = 0644, .proc_handler = &proc_debug_mb, }, + { + .ctl_name = PSDEV_LNET_WATCHDOG_RATELIMIT, + .procname = "watchdog_ratelimit", + .data = &libcfs_watchdog_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &min_watchdog_ratelimit, + .extra2 = &max_watchdog_ratelimit, + }, {0} }; diff --git a/libcfs/libcfs/watchdog.c b/libcfs/libcfs/watchdog.c index 6c90188..4709831 100644 --- a/libcfs/libcfs/watchdog.c +++ b/libcfs/libcfs/watchdog.c @@ -96,6 +96,11 @@ static spinlock_t lcw_pending_timers_lock = SPIN_LOCK_UNLOCKED; /* BH lock! */ static struct list_head lcw_pending_timers = \ CFS_LIST_HEAD_INIT(lcw_pending_timers); +/* Last time a watchdog expired */ +static cfs_time_t lcw_last_watchdog_time; +static int lcw_recent_watchdog_count; +static spinlock_t lcw_last_watchdog_lock = SPIN_LOCK_UNLOCKED; + #ifdef HAVE_TASKLIST_LOCK static void lcw_dump(struct lc_watchdog *lcw) @@ -130,6 +135,8 @@ lcw_dump(struct lc_watchdog *lcw) static void lcw_cb(ulong_ptr_t data) { struct lc_watchdog *lcw = (struct lc_watchdog *)data; + cfs_time_t current_time; + cfs_duration_t delta_time; ENTRY; @@ -139,14 +146,38 @@ static void lcw_cb(ulong_ptr_t data) } lcw->lcw_state = LC_WATCHDOG_EXPIRED; + current_time = cfs_time_current(); + + /* Check to see if we should throttle the watchdog timer to avoid + * too many dumps going to the console thus triggering an NMI. + * Normally we would not hold the spin lock over the CWARN but in + * this case we hold it to ensure non ratelimited lcw_dumps are not + * interleaved on the console making them hard to read. 
*/ + spin_lock_bh(&lcw_last_watchdog_lock); + delta_time = cfs_duration_sec(current_time - lcw_last_watchdog_time); + + if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) { + CWARN("Refusing to fire watchdog for pid %d: it was inactive " + "for %ldms. Rate limiting 1 per %d seconds.\n", + (int)lcw->lcw_pid,cfs_duration_sec(lcw->lcw_time) * 1000, + libcfs_watchdog_ratelimit); + } else { + if (delta_time < libcfs_watchdog_ratelimit) { + lcw_recent_watchdog_count++; + } else { + memcpy(&lcw_last_watchdog_time, &current_time, + sizeof(current_time)); + lcw_recent_watchdog_count = 0; + } - /* NB this warning should appear on the console, but may not get into - * the logs since we're running in a softirq handler */ - - CWARN("Watchdog triggered for pid: " LPPID " it was inactive for %lds\n", - lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time)); - lcw_dump(lcw); + /* This warning should appear on the console, but may not get + * into the logs since we're running in a softirq handler */ + CWARN("Watchdog triggered for pid %d: it was inactive for %lds\n", + (int)lcw->lcw_pid, cfs_duration_sec(lcw->lcw_time)); + lcw_dump(lcw); + } + spin_unlock_bh(&lcw_last_watchdog_lock); spin_lock_bh(&lcw_pending_timers_lock); if (list_empty(&lcw->lcw_list)) {