Whamcloud - gitweb
b=20722 Make watchdog timer messages more clear and descriptive.
authorDmitry Zogin <dmitry.zogin@sun.com>
Wed, 16 Dec 2009 21:59:44 +0000 (16:59 -0500)
committerjohann <johann@granier.local>
Thu, 17 Dec 2009 12:33:06 +0000 (13:33 +0100)
i=adilger

lnet/libcfs/watchdog.c
lustre/ChangeLog
lustre/tests/sanity-quota.sh

index 707914e..ee53fa7 100644 (file)
@@ -115,9 +115,9 @@ lcw_dump(struct lc_watchdog *lcw)
         RETURN_EXIT;
 #endif
         if (lcw->lcw_task == NULL) {
-                CWARN("Process %d was not found in the task list; "
-                      "watchdog callback may be incomplete\n",
-                      (int)lcw->lcw_pid);
+                LCONSOLE_WARN("Process %d was not found in the task list; "
+                              "watchdog callback may be incomplete\n",
+                              (int)lcw->lcw_pid);
         } else {
                 libcfs_debug_dumpstack(lcw->lcw_task);
         }
@@ -161,10 +161,13 @@ static void lcw_cb(unsigned long data)
 
         if (delta_time < libcfs_watchdog_ratelimit &&
             lcw_recent_watchdog_count > 3) {
-                CWARN("Refusing to fire watchdog for pid %d: it was inactive "
-                      "for %lu.%.02lus. Rate limiting 3 per %d seconds.\n",
-                      (int)lcw->lcw_pid, timediff.tv_sec,
-                      timediff.tv_usec / 10000, libcfs_watchdog_ratelimit);
+                LCONSOLE_WARN("Service thread pid %u was inactive for "
+                              "%lu.%.02lus. Watchdog stack traces are limited "
+                              "to 3 per %d seconds, skipping this one.\n",
+                              (int)lcw->lcw_pid,
+                              timediff.tv_sec,
+                              timediff.tv_usec / 10000,
+                              libcfs_watchdog_ratelimit);
         } else {
                 if (delta_time < libcfs_watchdog_ratelimit) {
                         lcw_recent_watchdog_count++;
@@ -176,9 +179,14 @@ static void lcw_cb(unsigned long data)
 
                /* This warning should appear on the console, but may not get
                 * into the logs since we're running in a softirq handler */
-                CWARN("Watchdog triggered for pid %d: it was inactive for "
-                      "%lu.%.02lus\n", (int)lcw->lcw_pid, timediff.tv_sec,
-                      timediff.tv_usec / 10000);
+                LCONSOLE_WARN("Service thread pid %u was inactive for "
+                              "%lu.%.02lus. The thread might be hung, or it "
+                              "might only be slow and will resume later. "
+                              "Dumping the stack trace for debugging purposes:"
+                              "\n",
+                              (int)lcw->lcw_pid,
+                              timediff.tv_sec,
+                              timediff.tv_usec / 10000);
                 lcw_dump(lcw);
        }
 
@@ -358,9 +366,14 @@ static void lcw_update_time(struct lc_watchdog *lcw, const char *message)
                                                      lcw->lcw_last_touched);
                 cfs_duration_usec(delta_time, &timediff);
 
-                CWARN("Expired watchdog for pid %d %s after %lu.%.02lus\n",
-                      lcw->lcw_pid, message, timediff.tv_sec,
-                      timediff.tv_usec / 10000);
+                LCONSOLE_WARN("Service thread pid %u %s after %lu.%.02lus. "
+                              "This indicates the system was overloaded (too "
+                              "many service threads, or there were not enough "
+                              "hardware resources).\n",
+                              lcw->lcw_pid,
+                              message,
+                              timediff.tv_sec,
+                              timediff.tv_usec / 10000);
         }
         lcw->lcw_last_touched = newtime;
 }
@@ -374,7 +387,7 @@ void lc_watchdog_touch(struct lc_watchdog *lcw, int timeout)
         list_del_init(&lcw->lcw_list);
         spin_unlock_bh(&lcw_pending_timers_lock);
 
-        lcw_update_time(lcw, "touched");
+        lcw_update_time(lcw, "resumed");
         lcw->lcw_state = LC_WATCHDOG_ENABLED;
 
         mod_timer(&lcw->lcw_timer, jiffies + cfs_time_seconds(timeout));
@@ -393,7 +406,7 @@ void lc_watchdog_disable(struct lc_watchdog *lcw)
                 list_del_init(&lcw->lcw_list);
         spin_unlock_bh(&lcw_pending_timers_lock);
 
-        lcw_update_time(lcw, "disabled");
+        lcw_update_time(lcw, "completed");
         lcw->lcw_state = LC_WATCHDOG_DISABLED;
 
         EXIT;
@@ -407,7 +420,7 @@ void lc_watchdog_delete(struct lc_watchdog *lcw)
 
         del_timer(&lcw->lcw_timer);
 
-        lcw_update_time(lcw, "deleted");
+        lcw_update_time(lcw, "stopped");
 
         spin_lock_bh(&lcw_pending_timers_lock);
         if (!list_empty(&lcw->lcw_list))
index a28274c..1349516 100644 (file)
@@ -11,6 +11,10 @@ tbd Sun Microsystems, Inc.
        * Recommended e2fsprogs version: 1.41.6.sun1
 
 Severity   : normal
+Bugzilla   : 20722
+Description: Make watchdog timer messages to be more clear and descriptive.
+
+Severity   : normal
 Bugzilla   : 21147
 Description: quota type and version is not tolerant to resetting
             to the same value
index d14cf67..f895912 100644 (file)
@@ -1641,7 +1641,7 @@ test_18b() {
         # check if watchdog is triggered
         do_facet ost1 dmesg > $TMP/lustre-log-${TESTNAME}.log
         watchdog=`awk '/test 18b/ {start = 1;}
-                       /Watchdog triggered/ {
+                       /Service thread pid/ && /was inactive/{
                                if (start) {
                                        print;
                                }