From 594c79f2f855737fa415562a9bbb3fb13aee9ec9 Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Tue, 8 Oct 2019 15:42:32 -0600
Subject: [PATCH 1/1] LU-12838 ptlrpc: fix watchdog ratelimit logic

The ptlrpc-level watchdog ratelimiting is broken. The kernel prints:

  mdt00_009: service thread pid 18935 was inactive for 72s.
  Watchdog stack traces are limited to 3 per 300s, skipping...

even though there hasn't been any stack trace printed before.
It looks like the __ratelimit() return value is backward from
what one would expect from normal English grammar, namely that
if __ratelimit() returns true the action should NOT be limited.

Fix the logic checking the __ratelimit() return value, and add a
check in sanity test_422 (which forces a service thread timeout)
to ensure that the watchdog sometimes prints a full stack.

Fixes: fc9de679a4c2 ("LU-9859 libcfs: add watchdog for ptlrpc service threads")
Test-Parameters: trivial
Signed-off-by: Andreas Dilger
Change-Id: I4a97dd361c12ac7c7a39c251551c21506b3ebbe5
Reviewed-on: https://review.whamcloud.com/36409
Reviewed-by: James Simmons
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Neil Brown
Reviewed-by: Oleg Drokin
---
 lustre/ptlrpc/service.c | 3 ++-
 lustre/tests/sanity.sh  | 4 ++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c
index 974969b..987982a 100644
--- a/lustre/ptlrpc/service.c
+++ b/lustre/ptlrpc/service.c
@@ -2656,7 +2656,8 @@ static void ptlrpc_watchdog_fire(struct work_struct *w)
 	u64 ms_lapse = ktime_ms_delta(ktime_get(), thread->t_touched);
 	u32 ms_frac = do_div(ms_lapse, MSEC_PER_SEC);
 
-	if (!__ratelimit(&watchdog_limit)) {
+	/* ___ratelimit() returns true if the action is NOT ratelimited */
+	if (__ratelimit(&watchdog_limit)) {
 		/* below message is checked in sanity-quota.sh test_6,18 */
 		LCONSOLE_WARN("%s: service thread pid %u was inactive for %llu.%03u seconds. The thread might be hung, or it might only be slow and will resume later. Dumping the stack trace for debugging purposes:\n",
 			      thread->t_task->comm, thread->t_task->pid,
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 2736f42..3f784ff 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -21413,6 +21413,10 @@ test_422() {
 	wait
 	at_max_set $amc client
 	at_max_set $amo mds1
+
+	# LU-12838 - verify the ptlrpc thread watchdog is not always throttled
+	do_facet mds1 "dmesg | grep 'Dumping the stack trace for debugging'" ||
+		error "Watchdog is always throttled"
 }
 run_test 422 "kill a process with RPC in progress"
 
-- 
1.8.3.1