From e16e3d46ee8c44e691c5cd3d25161f2f297fa0fd Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Fri, 24 Jan 2020 02:20:38 -0700
Subject: [PATCH] LU-13169 tests: add ONLY_REPEAT parameter to repeat subtests

Add the ONLY_REPEAT environment variable to allow the tests specified
by ONLY to be run multiple times, to ensure that a test passes
consistently (or to verify the fix for an intermittent bug). This is
faster than restarting the test session multiple times for only a few
subtests.

Run the repeat loop around the subshell started for run_one(), so that
any registered stack_trap EXIT handlers are triggered between
iterations, the fail_loc is reset, the grant/health/error checks are
done, and so on. Remove the $tdir and $tfile files after each
iteration to avoid failures in the subsequent runs of the subtest.
Tests that do not follow the standard naming convention for test
directories and files need to be updated to use $tdir and $tfile,
which is good in any case.

The YAML output splits each iteration into a separate subtest for
Maloo. The output from run_one() is appended to a single output file
for all iterations, so all of the output is captured instead of just
that of the last iteration.

The iterations continue until $ONLY_REPEAT loops pass, or until the
subtest hits an error. Trying to continue through all iterations in
the face of errors would likely end with all of the later iterations
also failing due to leftover state from the previous failure, and the
goal is for the subtests to pass consistently. Since iteration stops
at the first failure, the rate of an intermittent failure can be
estimated as 1/num_passes, which is about the same as
num_failures/$ONLY_REPEAT iterations.

Rename variables in subtests to avoid clashes with testnum, testname,
and TESTNAME, and use them consistently in functions and subtests.
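A typical invocation then looks like the following sketch (illustrative
only; it restates the Test-Parameters line below, and assumes the suite
script is run directly with its environment variables):

    # Run sanity.sh subtest 27l 100 times, stopping at the first failure:
    ONLY=27l ONLY_REPEAT=100 bash sanity.sh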
Test-Parameters: testlist=sanity envdefinitions=ONLY=27l,ONLY_REPEAT=100
Signed-off-by: Andreas Dilger
Change-Id: I5449590dc3e25c113b059974fb7b96c892434380
Reviewed-on: https://review.whamcloud.com/37321
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: James Nunez
Reviewed-by: Charlie Olmstead
Reviewed-by: Oleg Drokin
---
 lustre/tests/conf-sanity.sh    |  10 +--
 lustre/tests/functions.sh      |   2 +-
 lustre/tests/recovery-small.sh |   2 +-
 lustre/tests/sanityn.sh        |   8 +-
 lustre/tests/test-framework.sh | 172 +++++++++++++++++++++++------------------
 5 files changed, 108 insertions(+), 86 deletions(-)

diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 1cc9a9c..c9355e6 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -8530,7 +8530,7 @@ test_123ab() {
         local orig_val

         orig_val=$(do_facet mgs $LCTL get_param jobid_name)
-        do_facet mgs $LCTL set_param -P jobid_name="testname"
+        do_facet mgs $LCTL set_param -P jobid_name="TESTNAME"

         yaml=$(do_facet mgs $LCTL --device MGS llog_print params |
                grep jobid_name | tail -n 1)
@@ -8539,7 +8539,7 @@ test_123ab() {
         local val=$(awk '{ print $12 }' <<< "$yaml")
         #return to the default
         do_facet mgs $LCTL set_param -P jobid_name=$orig_val
-        [ $val = "testname" ] || error "bad value: $val"
+        [ $val = "TESTNAME" ] || error "bad value: $val"
         [ $param = "jobid_name," ] || error "Bad param: $param"
 }
 run_test 123ab "llog_print params output values from set_param -P"
@@ -8655,7 +8655,7 @@ test_123F() {

         # set jobid_var to a different value for test
         local orig_val=$(do_facet mgs $LCTL get_param jobid_var)
-        do_facet mgs $LCTL set_param -P jobid_var="testname"
+        do_facet mgs $LCTL set_param -P jobid_var="TESTNAME"

         for i in $cfgfiles params; do
                 do_facet mgs "lctl --device MGS llog_print ${i} >> $yaml_file"
@@ -8678,8 +8678,8 @@ test_123F() {
         local set_val=$(do_facet mgs $LCTL get_param jobid_var)

         do_facet mgs $LCTL set_param -P $orig_val
-        [ $set_val == "jobid_var=testname" ] ||
-                error "$set_val is not testname"
+        [ $set_val == "jobid_var=TESTNAME" ] ||
+                error "$set_val is not TESTNAME"

         do_facet mgs rm "$yaml_file"
         cleanup
diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh
index 1bd6384..f3e8909 100644
--- a/lustre/tests/functions.sh
+++ b/lustre/tests/functions.sh
@@ -51,7 +51,7 @@ lrepl() {
 EOF

         # Prompt escapes don't work in read -p, sadly.
-        prompt=":test_${testnum:-UNKNOWN}:$(uname -n):$(basename $PWD)% "
+        prompt=":${TESTNAME:-UNKNOWN}:$(uname -n):$(basename $PWD)% "

         # We use read -r to get close to a shell experience
         while read -e -r -p "$prompt" rawline; do
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index b06a47c..7afd0c7 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -1229,7 +1229,7 @@ test_51() {
         for i in $SEQ
         do
                 #echo failover in $i sec
-                log "test_$testnum: failover in $i sec"
+                log "$TESTNAME: failover in $i sec"
                 sleep $i
                 facet_failover $SINGLEMDS
         done
diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh
index 938bd1a..1618b82 100755
--- a/lustre/tests/sanityn.sh
+++ b/lustre/tests/sanityn.sh
@@ -4800,7 +4800,7 @@ test_103() {
         [ $OST1_VERSION -lt $(version_code 2.10.50) ] &&
                 skip "Lockahead needs OST version at least 2.10.50"

-        local testnum=23
+        local locktest=23

         test_mkdir -p $DIR/$tdir

@@ -4817,7 +4817,7 @@ test_103() {
         do_facet ost1 $LCTL set_param fail_loc=0x415 fail_val=2

         echo "Incorrect size expected (no glimpse fix):"
-        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $locktest -f $tfile
         rc=$?
         if [ $rc -eq 0 ]; then
                 echo "This doesn't work 100%, but this is just reproducing the bug, not testing the fix, so OK to not fail test."
@@ -4834,9 +4834,9 @@ test_103() {
         do_facet ost1 $LCTL set_param fail_loc=0x214 fail_val=2

         # Write commit is still delayed by 2 seconds
-        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $locktest -f $tfile
         rc=$?
-        [ $rc -eq 0 ] || error "Lockahead test${testnum} failed, ${rc}"
+        [ $rc -eq 0 ] || error "Lockahead test$locktest failed, $rc"

         # guarantee write commit timeout has expired
         sleep 2
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 66da8b5..32dbc87 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -2735,16 +2735,15 @@ start_client_load() {
 }

 start_client_loads () {
-    local -a clients=(${1//,/ })
-    local numloads=${#CLIENT_LOADS[@]}
-    local testnum
+        local -a clients=(${1//,/ })
+        local numloads=${#CLIENT_LOADS[@]}

-    for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
-        testnum=$((nodenum % numloads))
-        start_client_load ${clients[nodenum]} ${CLIENT_LOADS[testnum]}
-    done
-    # bug 22169: wait the background threads to start
-    sleep 2
+        for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
+                local load=$((nodenum % numloads))
+                start_client_load ${clients[nodenum]} ${CLIENT_LOADS[load]}
+        done
+        # bug 22169: wait the background threads to start
+        sleep 2
 }

 # only for remote client
@@ -6176,6 +6175,7 @@ skip_noexit() {

         [[ -n "$TESTSUITELOG" ]] &&
                 echo "$TESTSUITE: SKIP: $TESTNAME $@" >> $TESTSUITELOG || true
+        unset TESTNAME
 }

 skip() {
@@ -6226,59 +6226,63 @@ basetest() {
 export LAST_SKIPPED=
 export ALWAYS_SKIPPED=
 #
-# Main entry into test-framework. This is called with the name and
-# description of a test. The name is used to find the function to run
+# Main entry into test-framework. This is called with the number and
+# description of a test. The number is used to find the function to run
 # the test using "test_$name".
 #
 # This supports a variety of methods of specifying specific test to
-# run or not run. These need to be documented...
+# run or not run:
+# - ONLY= env variable with space-separated list of test numbers to run
+# - EXCEPT= env variable with space-separated list of test numbers to exclude
 #
 run_test() {
         assert_DIR
-        export base=$(basetest $1)
-        TESTNAME=test_$1
+        local testnum=$1
+        local testmsg=$2
+        export base=$(basetest $testnum)
+        export TESTNAME=test_$testnum
         LAST_SKIPPED=
         ALWAYS_SKIPPED=

         # Check the EXCEPT, ALWAYS_EXCEPT and SLOW lists to see if we
         # need to skip the current test. If so, set the ALWAYS_SKIPPED flag.
-        local testname=EXCEPT_$1
-        local testname_base=EXCEPT_$base
-        if [ ${!testname}x != x ]; then
+        local isexcept=EXCEPT_$testnum
+        local isexcept_base=EXCEPT_$base
+        if [ ${!isexcept}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping excluded test $1"
-        elif [ ${!testname_base}x != x ]; then
+                skip_message="skipping excluded test $testnum"
+        elif [ ${!isexcept_base}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping excluded test $1 (base $base)"
+                skip_message="skipping excluded test $testnum (base $base)"
         fi

-        testname=EXCEPT_ALWAYS_$1
-        testname_base=EXCEPT_ALWAYS_$base
-        if [ ${!testname}x != x ]; then
+        isexcept=EXCEPT_ALWAYS_$testnum
+        isexcept_base=EXCEPT_ALWAYS_$base
+        if [ ${!isexcept}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping ALWAYS excluded test $1"
-        elif [ ${!testname_base}x != x ]; then
+                skip_message="skipping ALWAYS excluded test $testnum"
+        elif [ ${!isexcept_base}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping ALWAYS excluded test $1 (base $base)"
+                skip_message="skipping ALWAYS excluded test $testnum (base $base)"
         fi

-        testname=EXCEPT_SLOW_$1
-        testname_base=EXCEPT_SLOW_$base
-        if [ ${!testname}x != x ]; then
+        isexcept=EXCEPT_SLOW_$testnum
+        isexcept_base=EXCEPT_SLOW_$base
+        if [ ${!isexcept}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping SLOW test $1"
-        elif [ ${!testname_base}x != x ]; then
+                skip_message="skipping SLOW test $testnum"
+        elif [ ${!isexcept_base}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping SLOW test $1 (base $base)"
+                skip_message="skipping SLOW test $testnum (base $base)"
         fi

         # If there are tests on the ONLY list, check if the current test
         # is on that list and, if so, check if the test is to be skipped
         # and if we are supposed to honor the skip lists.
         if [ -n "$ONLY" ]; then
-                testname=ONLY_$1
-                testname_base=ONLY_$base
-                if [[ ${!testname}x != x || ${!testname_base}x != x ]]; then
+                local isonly=ONLY_$testnum
+                local isonly_base=ONLY_$base
+                if [[ ${!isonly}x != x || ${!isonly_base}x != x ]]; then
                         if [[ -n "$ALWAYS_SKIPPED" && -n "$HONOR_EXCEPT" ]]; then
                                 LAST_SKIPPED="y"
@@ -6288,7 +6292,7 @@ run_test() {
                         [ -n "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED=
                         ALWAYS_SKIPPED=
-                        run_one_logged $1 "$2"
+                        run_one_logged $testnum "$testmsg"
                         return $?
                 fi
@@ -6303,10 +6307,9 @@ run_test() {
                 skip_noexit "$skip_message"
                 return 0
         else
-                run_one_logged $1 "$2"
+                run_one_logged $testnum "$testmsg"
                 return $?
         fi
-
 }

 log() {
@@ -6406,10 +6409,7 @@ group descriptors corrupted"
 #
 run_one() {
         local testnum=$1
-        local message=$2
-        export tfile=f${testnum}.${TESTSUITE}
-        export tdir=d${testnum}.${TESTSUITE}
-        export TESTNAME=test_$testnum
+        local testmsg="$2"
         local SAVE_UMASK=`umask`
         umask 0022

@@ -6417,7 +6417,7 @@ run_one() {
                 $SETUP
         fi

-        banner "test $testnum: $message"
+        banner "test $testnum: $testmsg"
         test_${testnum} || error "test_$testnum failed with $?"
         cd $SAVE_PWD
         reset_fail_loc
@@ -6428,9 +6428,6 @@ run_one() {
                 ps auxww | grep -v grep | grep -q "multiop " &&
                         error "multiop still running"
         fi
-        unset TESTNAME
-        unset tdir
-        unset tfile
         umask $SAVE_UMASK
         $CLEANUP
         return 0
@@ -6443,49 +6440,74 @@ run_one() {
 # - test result is saved to data file
 #
 run_one_logged() {
-        local BEFORE=$(date +%s)
-        local TEST_ERROR
-        local name=${TESTSUITE}.test_${1}.test_log.$(hostname -s).log
+        local before=$SECONDS
+        local testnum=$1
+        local testmsg=$2
+        export tfile=f${testnum}.${TESTSUITE}
+        export tdir=d${testnum}.${TESTSUITE}
+        local name=$TESTSUITE.$TESTNAME.test_log.$(hostname -s).log
         local test_log=$LOGDIR/$name
-        local zfs_log_name=${TESTSUITE}.test_${1}.zfs_log
+        local zfs_log_name=$TESTSUITE.$TESTNAME.zfs_log
         local zfs_debug_log=$LOGDIR/$zfs_log_name
-        rm -rf $LOGDIR/err
-        rm -rf $LOGDIR/ignore
-        rm -rf $LOGDIR/skip
         local SAVE_UMASK=$(umask)
+        local rc=0
         umask 0022

+        rm -f $LOGDIR/err $LOGDIR/ignore $LOGDIR/skip
         echo
-        log_sub_test_begin test_${1}
-        (run_one $1 "$2") 2>&1 | tee -i $test_log
-        local RC=${PIPESTATUS[0]}
-
-        [ $RC -ne 0 ] && [ ! -f $LOGDIR/err ] &&
-                echo "test_$1 returned $RC" | tee $LOGDIR/err
-
-        duration=$(($(date +%s) - $BEFORE))
-        pass "$1" "(${duration}s)"
+        # if ${ONLY_$testnum} set, repeat $ONLY_REPEAT times, otherwise once
+        local isonly=ONLY_$testnum
+        local repeat=${!isonly:+$ONLY_REPEAT}
+
+        for testiter in $(seq ${repeat:-1}); do
+                local before_sub=$SECONDS
+                log_sub_test_begin $TESTNAME
+
+                # remove temp files between repetitions to avoid test failures
+                [ -n "$append" -a -n "$DIR" -a -n "$tdir" -a -n "$tfile" ] &&
+                        rm -rf $DIR/$tdir* $DIR/$tfile*
+                # loop around subshell so stack_trap EXIT triggers each time
+                (run_one $testnum "$testmsg") 2>&1 | tee -i $append $test_log
+                rc=${PIPESTATUS[0]}
+                local append=-a
+                local duration_sub=$((SECONDS - before_sub))
+                local test_error
+
+                [[ $rc != 0 && ! -f $LOGDIR/err ]] &&
+                        echo "$TESTNAME returned $rc" | tee $LOGDIR/err
+
+                if [[ -f $LOGDIR/err ]]; then
+                        test_error=$(cat $LOGDIR/err)
+                        TEST_STATUS="FAIL"
+                elif [[ -f $LOGDIR/ignore ]]; then
+                        test_error=$(cat $LOGDIR/ignore)
+                elif [[ -f $LOGDIR/skip ]]; then
+                        test_error=$(cat $LOGDIR/skip)
+                        TEST_STATUS="SKIP"
+                else
+                        TEST_STATUS="PASS"
+                fi

-        if [[ -f $LOGDIR/err ]]; then
-                TEST_ERROR=$(cat $LOGDIR/err)
-        elif [[ -f $LOGDIR/ignore ]]; then
-                TEST_ERROR=$(cat $LOGDIR/ignore)
-        elif [[ -f $LOGDIR/skip ]]; then
-                TEST_ERROR=$(cat $LOGDIR/skip)
-        fi
-        log_sub_test_end $TEST_STATUS $duration "$RC" "$TEST_ERROR"
+                pass "$testnum" "($((SECONDS - before))s)"
+                log_sub_test_end $TEST_STATUS $duration_sub "$rc" "$test_error"
+                [[ $rc != 0 ]] && break
+        done

-        if [[ "$TEST_STATUS" != "SKIP" ]] && [[ -f $TF_SKIP ]]; then
+        if [[ "$TEST_STATUS" != "SKIP" && -f $TF_SKIP ]]; then
                 rm -f $TF_SKIP
         fi

         if [ -f $LOGDIR/err ]; then
                 log_zfs_info "$zfs_debug_log"
-                $FAIL_ON_ERROR && exit $RC
+                $FAIL_ON_ERROR && exit $rc
         fi

         umask $SAVE_UMASK
+        unset TESTNAME
+        unset tdir
+        unset tfile
+
         return 0
 }

@@ -6507,9 +6529,9 @@ check_grant() {
         export base=$(basetest $1)
         [ "$CHECK_GRANT" == "no" ] && return 0

-        testnamebase=GCHECK_ONLY_${base}
-        testname=GCHECK_ONLY_$1
-        [ ${!testnamebase}x == x -a ${!testname}x == x ] && return 0
+        local isonly_base=GCHECK_ONLY_${base}
+        local isonly=GCHECK_ONLY_$1
+        [ ${!isonly_base}x == x -a ${!isonly}x == x ] && return 0

         echo -n "checking grant......"
--
1.8.3.1
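
Illustrative aside (not part of the patch): why run_one_logged() loops
around the subshell, rather than looping inside it, can be seen with a
minimal standalone sketch. The names here (repeat, iter) are hypothetical
and not from the Lustre code; the point is that an EXIT trap registered
inside the subshell fires at the end of every iteration, so per-iteration
cleanup (the stack_trap handlers in the real framework) runs between
repeats:

    #!/bin/bash
    # Each iteration runs in its own subshell; the EXIT trap set inside
    # the subshell fires when that subshell exits, i.e. once per repeat.
    repeat=${ONLY_REPEAT:-3}
    for iter in $(seq $repeat); do
            (
                    trap 'echo "cleanup after iteration $iter"' EXIT
                    echo "iteration $iter body"
            ) || break  # stop at the first failing iteration, as the patch does
    done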