From e16e3d46ee8c44e691c5cd3d25161f2f297fa0fd Mon Sep 17 00:00:00 2001
From: Andreas Dilger
Date: Fri, 24 Jan 2020 02:20:38 -0700
Subject: [PATCH] LU-13169 tests: add ONLY_REPEAT parameter to repeat subtests

Add the ONLY_REPEAT environment variable to allow the tests specified
by ONLY to be run multiple times, to ensure that a test passes
consistently (or to verify the fix for an intermittent bug). This is
faster than restarting the test session multiple times for only a few
subtests.

Run the repeat loop around the subshell started for run_one(), so that
any registered stack_trap EXIT handlers are triggered between
iterations, the fail_loc is reset, the grant/health/error checks are
done, and so on. Remove the $tdir and $tfile files after each
iteration to avoid failures in the subsequent runs of the subtest.
Tests that do not follow the standard naming convention for test
directories and files need to be updated to use $tdir and $tfile,
which is good in any case.

The YAML output splits each iteration into a separate subtest for
Maloo. The output from run_one() is appended to a single output file
for all iterations, so all of the output is captured instead of just
that of the last iteration.

The iterations continue until $ONLY_REPEAT loops pass, or until the
subtest hits an error. Trying to continue through all iterations in
the face of errors would likely end with all of the later iterations
also failing due to leftover state from the previous failure, and the
goal is for the subtests to pass consistently. Since iteration stops
at the first failure, the rate of an intermittent failure can be
estimated as 1/num_passes, which is about the same as
num_failures/$ONLY_REPEAT iterations.

Rename variables in subtests to avoid clashes with testnum, testname,
and TESTNAME, and use them consistently in functions and subtests.
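A typical invocation then looks like the following sketch (illustrative
only; it restates the Test-Parameters line below, and assumes the suite
script is run directly with its environment variables):

    # Run sanity.sh subtest 27l 100 times, stopping at the first failure:
    ONLY=27l ONLY_REPEAT=100 bash sanity.sh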
Test-Parameters: testlist=sanity envdefinitions=ONLY=27l,ONLY_REPEAT=100
Signed-off-by: Andreas Dilger
Change-Id: I5449590dc3e25c113b059974fb7b96c892434380
Reviewed-on: https://review.whamcloud.com/37321
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: James Nunez
Reviewed-by: Charlie Olmstead
Reviewed-by: Oleg Drokin
---
 lustre/tests/conf-sanity.sh    |  10 +--
 lustre/tests/functions.sh      |   2 +-
 lustre/tests/recovery-small.sh |   2 +-
 lustre/tests/sanityn.sh        |   8 +-
 lustre/tests/test-framework.sh | 172 +++++++++++++++++++++++------------------
 5 files changed, 108 insertions(+), 86 deletions(-)

diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 1cc9a9c..c9355e6 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -8530,7 +8530,7 @@ test_123ab() {
         local orig_val

         orig_val=$(do_facet mgs $LCTL get_param jobid_name)
-        do_facet mgs $LCTL set_param -P jobid_name="testname"
+        do_facet mgs $LCTL set_param -P jobid_name="TESTNAME"

         yaml=$(do_facet mgs $LCTL --device MGS llog_print params |
                grep jobid_name | tail -n 1)
@@ -8539,7 +8539,7 @@ test_123ab() {
         local val=$(awk '{ print $12 }' <<< "$yaml")
         #return to the default
         do_facet mgs $LCTL set_param -P jobid_name=$orig_val
-        [ $val = "testname" ] || error "bad value: $val"
+        [ $val = "TESTNAME" ] || error "bad value: $val"
         [ $param = "jobid_name," ] || error "Bad param: $param"
 }
 run_test 123ab "llog_print params output values from set_param -P"
@@ -8655,7 +8655,7 @@ test_123F() {

         # set jobid_var to a different value for test
         local orig_val=$(do_facet mgs $LCTL get_param jobid_var)
-        do_facet mgs $LCTL set_param -P jobid_var="testname"
+        do_facet mgs $LCTL set_param -P jobid_var="TESTNAME"

         for i in $cfgfiles params; do
                 do_facet mgs "lctl --device MGS llog_print ${i} >> $yaml_file"
@@ -8678,8 +8678,8 @@ test_123F() {
         local set_val=$(do_facet mgs $LCTL get_param jobid_var)

         do_facet mgs $LCTL set_param -P $orig_val
-        [ $set_val == "jobid_var=testname" ] ||
-                error "$set_val is not testname"
+        [ $set_val == "jobid_var=TESTNAME" ] ||
+                error "$set_val is not TESTNAME"

         do_facet mgs rm "$yaml_file"
         cleanup
diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh
index 1bd6384..f3e8909 100644
--- a/lustre/tests/functions.sh
+++ b/lustre/tests/functions.sh
@@ -51,7 +51,7 @@ lrepl() {
 EOF

         # Prompt escapes don't work in read -p, sadly.
-        prompt=":test_${testnum:-UNKNOWN}:$(uname -n):$(basename $PWD)% "
+        prompt=":${TESTNAME:-UNKNOWN}:$(uname -n):$(basename $PWD)% "

         # We use read -r to get close to a shell experience
         while read -e -r -p "$prompt" rawline; do
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index b06a47c..7afd0c7 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -1229,7 +1229,7 @@ test_51() {
         for i in $SEQ
         do
                 #echo failover in $i sec
-                log "test_$testnum: failover in $i sec"
+                log "$TESTNAME: failover in $i sec"
                 sleep $i
                 facet_failover $SINGLEMDS
         done
diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh
index 938bd1a..1618b82 100755
--- a/lustre/tests/sanityn.sh
+++ b/lustre/tests/sanityn.sh
@@ -4800,7 +4800,7 @@ test_103() {
         [ $OST1_VERSION -lt $(version_code 2.10.50) ] &&
                 skip "Lockahead needs OST version at least 2.10.50"

-        local testnum=23
+        local locktest=23

         test_mkdir -p $DIR/$tdir

@@ -4817,7 +4817,7 @@ test_103() {
         do_facet ost1 $LCTL set_param fail_loc=0x415 fail_val=2

         echo "Incorrect size expected (no glimpse fix):"
-        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $locktest -f $tfile
         rc=$?
         if [ $rc -eq 0 ]; then
                 echo "This doesn't work 100%, but this is just reproducing the bug, not testing the fix, so OK to not fail test."
@@ -4834,9 +4834,9 @@ test_103() {
         do_facet ost1 $LCTL set_param fail_loc=0x214 fail_val=2

         # Write commit is still delayed by 2 seconds
-        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $testnum -f $tfile
+        lockahead_test -d $DIR/$tdir -D $DIR2/$tdir -t $locktest -f $tfile
         rc=$?
-        [ $rc -eq 0 ] || error "Lockahead test${testnum} failed, ${rc}"
+        [ $rc -eq 0 ] || error "Lockahead test$locktest failed, $rc"

         # guarantee write commit timeout has expired
         sleep 2
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 66da8b5..32dbc87 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -2735,16 +2735,15 @@ start_client_load() {
 }

 start_client_loads () {
-    local -a clients=(${1//,/ })
-    local numloads=${#CLIENT_LOADS[@]}
-    local testnum
+        local -a clients=(${1//,/ })
+        local numloads=${#CLIENT_LOADS[@]}

-    for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
-        testnum=$((nodenum % numloads))
-        start_client_load ${clients[nodenum]} ${CLIENT_LOADS[testnum]}
-    done
-    # bug 22169: wait the background threads to start
-    sleep 2
+        for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
+                local load=$((nodenum % numloads))
+                start_client_load ${clients[nodenum]} ${CLIENT_LOADS[load]}
+        done
+        # bug 22169: wait the background threads to start
+        sleep 2
 }

 # only for remote client
@@ -6176,6 +6175,7 @@ skip_noexit() {

         [[ -n "$TESTSUITELOG" ]] &&
                 echo "$TESTSUITE: SKIP: $TESTNAME $@" >> $TESTSUITELOG || true
+        unset TESTNAME
 }

 skip() {
@@ -6226,59 +6226,63 @@ basetest() {
 export LAST_SKIPPED=
 export ALWAYS_SKIPPED=
 #
-# Main entry into test-framework. This is called with the name and
-# description of a test. The name is used to find the function to run
+# Main entry into test-framework. This is called with the number and
+# description of a test. The number is used to find the function to run
 # the test using "test_$name".
 #
 # This supports a variety of methods of specifying specific test to
-# run or not run. These need to be documented...
+# run or not run:
+# - ONLY= env variable with space-separated list of test numbers to run
+# - EXCEPT= env variable with space-separated list of test numbers to exclude
 #
 run_test() {
         assert_DIR
-        export base=$(basetest $1)
-        TESTNAME=test_$1
+        local testnum=$1
+        local testmsg=$2
+        export base=$(basetest $testnum)
+        export TESTNAME=test_$testnum
         LAST_SKIPPED=
         ALWAYS_SKIPPED=

         # Check the EXCEPT, ALWAYS_EXCEPT and SLOW lists to see if we
         # need to skip the current test. If so, set the ALWAYS_SKIPPED flag.
-        local testname=EXCEPT_$1
-        local testname_base=EXCEPT_$base
-        if [ ${!testname}x != x ]; then
+        local isexcept=EXCEPT_$testnum
+        local isexcept_base=EXCEPT_$base
+        if [ ${!isexcept}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping excluded test $1"
-        elif [ ${!testname_base}x != x ]; then
+                skip_message="skipping excluded test $testnum"
+        elif [ ${!isexcept_base}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping excluded test $1 (base $base)"
+                skip_message="skipping excluded test $testnum (base $base)"
         fi

-        testname=EXCEPT_ALWAYS_$1
-        testname_base=EXCEPT_ALWAYS_$base
-        if [ ${!testname}x != x ]; then
+        isexcept=EXCEPT_ALWAYS_$testnum
+        isexcept_base=EXCEPT_ALWAYS_$base
+        if [ ${!isexcept}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping ALWAYS excluded test $1"
-        elif [ ${!testname_base}x != x ]; then
+                skip_message="skipping ALWAYS excluded test $testnum"
+        elif [ ${!isexcept_base}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping ALWAYS excluded test $1 (base $base)"
+                skip_message="skipping ALWAYS excluded test $testnum (base $base)"
         fi

-        testname=EXCEPT_SLOW_$1
-        testname_base=EXCEPT_SLOW_$base
-        if [ ${!testname}x != x ]; then
+        isexcept=EXCEPT_SLOW_$testnum
+        isexcept_base=EXCEPT_SLOW_$base
+        if [ ${!isexcept}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping SLOW test $1"
-        elif [ ${!testname_base}x != x ]; then
+                skip_message="skipping SLOW test $testnum"
+        elif [ ${!isexcept_base}x != x ]; then
                 ALWAYS_SKIPPED="y"
-                skip_message="skipping SLOW test $1 (base $base)"
+                skip_message="skipping SLOW test $testnum (base $base)"
         fi

         # If there are tests on the ONLY list, check if the current test
         # is on that list and, if so, check if the test is to be skipped
         # and if we are supposed to honor the skip lists.
         if [ -n "$ONLY" ]; then
-                testname=ONLY_$1
-                testname_base=ONLY_$base
-                if [[ ${!testname}x != x || ${!testname_base}x != x ]]; then
+                local isonly=ONLY_$testnum
+                local isonly_base=ONLY_$base
+                if [[ ${!isonly}x != x || ${!isonly_base}x != x ]]; then
                         if [[ -n "$ALWAYS_SKIPPED" && -n "$HONOR_EXCEPT" ]]; then
                                 LAST_SKIPPED="y"
@@ -6288,7 +6292,7 @@ run_test() {
                         [ -n "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED=
                         ALWAYS_SKIPPED=
-                        run_one_logged $1 "$2"
+                        run_one_logged $testnum "$testmsg"
                         return $?
                 fi
@@ -6303,10 +6307,9 @@ run_test() {
                 skip_noexit "$skip_message"
                 return 0
         else
-                run_one_logged $1 "$2"
+                run_one_logged $testnum "$testmsg"
                 return $?
         fi
-
 }

 log() {
@@ -6406,10 +6409,7 @@ group descriptors corrupted"
 #
 run_one() {
         local testnum=$1
-        local message=$2
-        export tfile=f${testnum}.${TESTSUITE}
-        export tdir=d${testnum}.${TESTSUITE}
-        export TESTNAME=test_$testnum
+        local testmsg="$2"
         local SAVE_UMASK=`umask`
         umask 0022

@@ -6417,7 +6417,7 @@ run_one() {
                 $SETUP
         fi

-        banner "test $testnum: $message"
+        banner "test $testnum: $testmsg"
         test_${testnum} || error "test_$testnum failed with $?"
         cd $SAVE_PWD
         reset_fail_loc
@@ -6428,9 +6428,6 @@ run_one() {
                 ps auxww | grep -v grep | grep -q "multiop " &&
                         error "multiop still running"
         fi
-        unset TESTNAME
-        unset tdir
-        unset tfile
         umask $SAVE_UMASK
         $CLEANUP
         return 0
@@ -6443,49 +6440,74 @@ run_one() {
 # - test result is saved to data file
 #
 run_one_logged() {
-        local BEFORE=$(date +%s)
-        local TEST_ERROR
-        local name=${TESTSUITE}.test_${1}.test_log.$(hostname -s).log
+        local before=$SECONDS
+        local testnum=$1
+        local testmsg=$2
+        export tfile=f${testnum}.${TESTSUITE}
+        export tdir=d${testnum}.${TESTSUITE}
+        local name=$TESTSUITE.$TESTNAME.test_log.$(hostname -s).log
         local test_log=$LOGDIR/$name
-        local zfs_log_name=${TESTSUITE}.test_${1}.zfs_log
+        local zfs_log_name=$TESTSUITE.$TESTNAME.zfs_log
         local zfs_debug_log=$LOGDIR/$zfs_log_name
-        rm -rf $LOGDIR/err
-        rm -rf $LOGDIR/ignore
-        rm -rf $LOGDIR/skip
         local SAVE_UMASK=$(umask)
+        local rc=0
         umask 0022

+        rm -f $LOGDIR/err $LOGDIR/ignore $LOGDIR/skip
         echo
-        log_sub_test_begin test_${1}
-        (run_one $1 "$2") 2>&1 | tee -i $test_log
-        local RC=${PIPESTATUS[0]}
-
-        [ $RC -ne 0 ] && [ ! -f $LOGDIR/err ] &&
-                echo "test_$1 returned $RC" | tee $LOGDIR/err
-
-        duration=$(($(date +%s) - $BEFORE))
-        pass "$1" "(${duration}s)"
+        # if ${ONLY_$testnum} set, repeat $ONLY_REPEAT times, otherwise once
+        local isonly=ONLY_$testnum
+        local repeat=${!isonly:+$ONLY_REPEAT}
+
+        for testiter in $(seq ${repeat:-1}); do
+                local before_sub=$SECONDS
+                log_sub_test_begin $TESTNAME
+
+                # remove temp files between repetitions to avoid test failures
+                [ -n "$append" -a -n "$DIR" -a -n "$tdir" -a -n "$tfile" ] &&
+                        rm -rf $DIR/$tdir* $DIR/$tfile*
+                # loop around subshell so stack_trap EXIT triggers each time
+                (run_one $testnum "$testmsg") 2>&1 | tee -i $append $test_log
+                rc=${PIPESTATUS[0]}
+                local append=-a
+                local duration_sub=$((SECONDS - before_sub))
+                local test_error
+
+                [[ $rc != 0 && ! -f $LOGDIR/err ]] &&
+                        echo "$TESTNAME returned $rc" | tee $LOGDIR/err
+
+                if [[ -f $LOGDIR/err ]]; then
+                        test_error=$(cat $LOGDIR/err)
+                        TEST_STATUS="FAIL"
+                elif [[ -f $LOGDIR/ignore ]]; then
+                        test_error=$(cat $LOGDIR/ignore)
+                elif [[ -f $LOGDIR/skip ]]; then
+                        test_error=$(cat $LOGDIR/skip)
+                        TEST_STATUS="SKIP"
+                else
+                        TEST_STATUS="PASS"
+                fi

-        if [[ -f $LOGDIR/err ]]; then
-                TEST_ERROR=$(cat $LOGDIR/err)
-        elif [[ -f $LOGDIR/ignore ]]; then
-                TEST_ERROR=$(cat $LOGDIR/ignore)
-        elif [[ -f $LOGDIR/skip ]]; then
-                TEST_ERROR=$(cat $LOGDIR/skip)
-        fi
-        log_sub_test_end $TEST_STATUS $duration "$RC" "$TEST_ERROR"
+                pass "$testnum" "($((SECONDS - before))s)"
+                log_sub_test_end $TEST_STATUS $duration_sub "$rc" "$test_error"
+                [[ $rc != 0 ]] && break
+        done

-        if [[ "$TEST_STATUS" != "SKIP" ]] && [[ -f $TF_SKIP ]]; then
+        if [[ "$TEST_STATUS" != "SKIP" && -f $TF_SKIP ]]; then
                 rm -f $TF_SKIP
         fi

         if [ -f $LOGDIR/err ]; then
                 log_zfs_info "$zfs_debug_log"
-                $FAIL_ON_ERROR && exit $RC
+                $FAIL_ON_ERROR && exit $rc
         fi

         umask $SAVE_UMASK
+        unset TESTNAME
+        unset tdir
+        unset tfile
+
         return 0
 }

@@ -6507,9 +6529,9 @@ check_grant() {
         export base=$(basetest $1)
         [ "$CHECK_GRANT" == "no" ] && return 0

-        testnamebase=GCHECK_ONLY_${base}
-        testname=GCHECK_ONLY_$1
-        [ ${!testnamebase}x == x -a ${!testname}x == x ] && return 0
+        local isonly_base=GCHECK_ONLY_${base}
+        local isonly=GCHECK_ONLY_$1
+        [ ${!isonly_base}x == x -a ${!isonly}x == x ] && return 0

         echo -n "checking grant......"
--
1.8.3.1
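
Illustrative aside (not part of the patch): why run_one_logged() loops
around the subshell, rather than looping inside it, can be seen with a
minimal standalone sketch. The names here (repeat, iter) are hypothetical
and not from the Lustre code; the point is that an EXIT trap registered
inside the subshell fires at the end of every iteration, so per-iteration
cleanup (the stack_trap handlers in the real framework) runs between
repeats:

    #!/bin/bash
    # Each iteration runs in its own subshell; the EXIT trap set inside
    # the subshell fires when that subshell exits, i.e. once per repeat.
    repeat=${ONLY_REPEAT:-3}
    for iter in $(seq $repeat); do
            (
                    trap 'echo "cleanup after iteration $iter"' EXIT
                    echo "iteration $iter body"
            ) || break  # stop at the first failing iteration, as the patch does
    done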