From b2d6f009de92b94f06d8f48cf516d139110812b9 Mon Sep 17 00:00:00 2001 From: Dmitry Zogin Date: Wed, 16 Dec 2009 16:59:44 -0500 Subject: [PATCH] b=20722 Make watchdog timer messages more clear and descriptive. i=adilger --- lnet/libcfs/watchdog.c | 45 ++++++++++++++++++++++++++++---------------- lustre/ChangeLog | 4 ++++ lustre/tests/sanity-quota.sh | 2 +- 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/lnet/libcfs/watchdog.c b/lnet/libcfs/watchdog.c index 707914e..ee53fa7 100644 --- a/lnet/libcfs/watchdog.c +++ b/lnet/libcfs/watchdog.c @@ -115,9 +115,9 @@ lcw_dump(struct lc_watchdog *lcw) RETURN_EXIT; #endif if (lcw->lcw_task == NULL) { - CWARN("Process %d was not found in the task list; " - "watchdog callback may be incomplete\n", - (int)lcw->lcw_pid); + LCONSOLE_WARN("Process %d was not found in the task list; " + "watchdog callback may be incomplete\n", + (int)lcw->lcw_pid); } else { libcfs_debug_dumpstack(lcw->lcw_task); } @@ -161,10 +161,13 @@ static void lcw_cb(unsigned long data) if (delta_time < libcfs_watchdog_ratelimit && lcw_recent_watchdog_count > 3) { - CWARN("Refusing to fire watchdog for pid %d: it was inactive " - "for %lu.%.02lus. Rate limiting 3 per %d seconds.\n", - (int)lcw->lcw_pid, timediff.tv_sec, - timediff.tv_usec / 10000, libcfs_watchdog_ratelimit); + LCONSOLE_WARN("Service thread pid %u was inactive for " + "%lu.%.02lus. Watchdog stack traces are limited " + "to 3 per %d seconds, skipping this one.\n", + (int)lcw->lcw_pid, + timediff.tv_sec, + timediff.tv_usec / 10000, + libcfs_watchdog_ratelimit); } else { if (delta_time < libcfs_watchdog_ratelimit) { lcw_recent_watchdog_count++; @@ -176,9 +179,14 @@ static void lcw_cb(unsigned long data) /* This warning should appear on the console, but may not get * into the logs since we're running in a softirq handler */ - CWARN("Watchdog triggered for pid %d: it was inactive for " - "%lu.%.02lus\n", (int)lcw->lcw_pid, timediff.tv_sec, - timediff.tv_usec / 10000); + LCONSOLE_WARN("Service thread pid %u was inactive for " + "%lu.%.02lus. The thread might be hung, or it " + "might only be slow and will resume later. " + "Dumping the stack trace for debugging purposes:" + "\n", + (int)lcw->lcw_pid, + timediff.tv_sec, + timediff.tv_usec / 10000); lcw_dump(lcw); } @@ -358,9 +366,14 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message) lcw->lcw_last_touched); cfs_duration_usec(delta_time, &timediff); - CWARN("Expired watchdog for pid %d %s after %lu.%.02lus\n", - lcw->lcw_pid, message, timediff.tv_sec, - timediff.tv_usec / 10000); + LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. " + "This indicates the system was overloaded (too " + "many service threads, or there were not enough " + "hardware resources).\n", + lcw->lcw_pid, + message, + timediff.tv_sec, + timediff.tv_usec / 10000); } lcw->lcw_last_touched = newtime; } @@ -374,7 +387,7 @@ void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout) list_del_init(&lcw->lcw_list); spin_unlock_bh(&lcw_pending_timers_lock); - lcw_update_time(lcw, "touched"); + lcw_update_time(lcw, "resumed"); lcw->lcw_state = LC_WATCHDOG_ENABLED; mod_timer(&lcw->lcw_timer, jiffies + cfs_time_seconds(timeout)); @@ -393,7 +406,7 @@ void lc_watchdog_disable(struct lc_watchdog *lcw) list_del_init(&lcw->lcw_list); spin_unlock_bh(&lcw_pending_timers_lock); - lcw_update_time(lcw, "disabled"); + lcw_update_time(lcw, "completed"); lcw->lcw_state = LC_WATCHDOG_DISABLED; EXIT; @@ -407,7 +420,7 @@ void lc_watchdog_delete(struct lc_watchdog *lcw) del_timer(&lcw->lcw_timer); - lcw_update_time(lcw, "deleted"); + lcw_update_time(lcw, "stopped"); spin_lock_bh(&lcw_pending_timers_lock); if (!list_empty(&lcw->lcw_list)) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index a28274c..1349516 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -11,6 +11,10 @@ tbd Sun Microsystems, Inc. * Recommended e2fsprogs version: 1.41.6.sun1 Severity : normal +Bugzilla : 20722 +Description: Make watchdog timer messages to be more clear and descriptive. + +Severity : normal Bugzilla : 21147 Description: quota type and version is not tolerant to resetting to the same value diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index d14cf67..f895912 100644 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -1641,7 +1641,7 @@ test_18b() { # check if watchdog is triggered do_facet ost1 dmesg > $TMP/lustre-log-${TESTNAME}.log watchdog=`awk '/test 18b/ {start = 1;} - /Watchdog triggered/ { + /Service thread pid/ && /was inactive/{ if (start) { print; } -- 1.8.3.1