From c73f731f252b2628dc17de315f79bbf5d86965e0 Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Mon, 16 Sep 2024 18:23:16 -0600
Subject: [PATCH] LU-17428 tests: restore recovery-small/10a lru_max_age

Restore the longer lru_max_age in recovery-small test_10a since
this otherwise prevents the client from being evicted.

Skip the console message check for stuck MDS threads on subsequent
iterations of test_10a when the test is run in a loop, since message
ratelimiting may prevent the console message from being printed.

Test-Parameters: trivial testlist=recovery-small env=ONLY=10a,ONLY_REPEAT=10
Fixes: 357cae970c ("LU-17428 ldlm: reduce default lru_max_age")
Signed-off-by: Andreas Dilger
Change-Id: Ia8ba5f83aa001d3c810e13637754b0e169dc3b9b
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56377
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Timothy Day
Reviewed-by: Arshad Hussain
Reviewed-by: Oleg Drokin
---
 lustre/tests/recovery-small.sh | 11 ++++++++++-
 lustre/tests/test-framework.sh |  8 ++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 64856f4..674b8e8 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -139,6 +139,12 @@ run_test 9 "pause bulk on OST (bug 1420)"
 test_10a() {
 	local before=$(date +%s)
 	local evict
+	local lru_param="ldlm.namespaces.*mdc*.lru_max_age"
+	local old_max_age=($($LCTL get_param -n $lru_param))
+	local old_ratelimit=$($LCTL get_param console_ratelimit)
+
+	$LCTL set_param $lru_param=3900s console_ratelimit=0
+	stack_trap "$LCTL set_param $lru_param=$old_max_age $old_ratelimit"
 
 	do_facet client "stat $DIR > /dev/null" ||
 		error "failed to stat $DIR: $?"
@@ -149,13 +155,16 @@ test_10a() {
 	client_reconnect
 	evict=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state |
 	  awk -F"[ [,]" '/EVICTED ]$/ { if (mx<$5) {mx=$5;} } END { print mx }')
-	[ ! -z "$evict" ] && [[ $evict -gt $before ]] ||
+	[[ -n "$evict" ]] && (( $evict > $before )) ||
 		(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state;
 		    error "no eviction: $evict before:$before")
 
 	do_facet client checkstat -v -p 0777 $DIR ||
 		error "client checkstat failed: $?"
 
+	# console messages may be ratelimited on later iterations
+	(( ONLY_REPEAT_ITER == 1 )) || return 0
+
 	# check that the thread watchdog is working properly
 	do_facet mds1 dmesg | tac | sed "/${TESTNAME/_/ }/,$ d" |
 		grep "[Ss]ervice thread pid .* was inactive" ||
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index c352747..3439af0 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -7569,7 +7569,7 @@ run_one_logged() {
 		local repeat_end_sec=$((SECONDS + ONLY_MINUTES * 60))
 	fi
 
-	local testiter=1
+	export ONLY_REPEAT_ITER=1
 	while true; do
 		local before_sub=$SECONDS
 
@@ -7578,7 +7578,7 @@ run_one_logged() {
 		if [[ -n "$append" ]]; then
 			[[ -n "$tdir" ]] && rm -rvf $DIR/$tdir*
 			[[ -n "$tfile" ]] && rm -vf $DIR/$tfile*
-			echo "subtest iteration $testiter/$repeat " \
+			echo "subtest iteration $ONLY_REPEAT_ITER/$repeat " \
 				"($(((SECONDS-before)/60))/$ONLY_MINUTES min)"
 		fi
 		# loop around subshell so stack_trap EXIT triggers each time
@@ -7620,10 +7620,10 @@ run_one_logged() {
 		# no repeat options were set, break after the first iteration
 		[[ -z "$repeat" && -z "$repeat_end_sec" ]] && break
 		# break if any repeat options were set and have been met
-		[[ -n "$repeat" ]] && (( $testiter >= $repeat )) && break
+		[[ -n "$repeat" ]] && (( ONLY_REPEAT_ITER >= repeat )) && break
 		[[ -n "$repeat_end_sec" ]] &&
 			(( $SECONDS >= $repeat_end_sec )) && break
-		((testiter++))
+		((ONLY_REPEAT_ITER++))
 	done
 
 	[[ $KPTR_ON_MOUNT ]] || kptr_restore
-- 
1.8.3.1