From a2e38010077df3d95c9942d4f9cde829f75e7267 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@whamcloud.com>
Date: Thu, 28 Feb 2019 17:54:41 -0700
Subject: [PATCH] LU-5338 tests: sanity-lfsck 11b allow larger last_id

In sanity-lfsck test_11b allow the OST to report the last_id
larger than the previously-used OID on the MDS.  This may
happen if the OST preallocates additional objects or skips
some objects during recovery.

Add wait_update_cond() and wait_update_facet_cond() to
allow checking an arbitrary condition against the expected
result rather than only exactly equal to the expected value.

Remove the wait_result() helper function, which is essentially
the same thing as wait_update_facet(), and only has a few users.

Test-Parameters: trivial mdscount=2 mdtcount=4 testlist=sanity-lfsck
Test-Parameters: fstype=zfs mdscount=2 mdtcount=4 testlist=sanity-lfsck
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I1443d16699d8115fce664a331134ca6076ecab07
Reviewed-on: https://review.whamcloud.com/34349
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Hongchao Zhang <hongchao@whamcloud.com>
Reviewed-by: Yang Sheng <ys@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
---
 lustre/tests/sanity-hsm.sh     |   2 +-
 lustre/tests/sanity-lfsck.sh   |  35 ++++++------
 lustre/tests/sanity.sh         |   5 +-
 lustre/tests/test-framework.sh | 120 ++++++++++++++++++++++++++++-------------
 4 files changed, 102 insertions(+), 60 deletions(-)

diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh
index 6d377d9..499f96e 100755
--- a/lustre/tests/sanity-hsm.sh
+++ b/lustre/tests/sanity-hsm.sh
@@ -506,7 +506,7 @@ wait_all_done() {
 	[[ -n $fid ]] && cmd+=" | grep '$fid'"
 	cmd+=" | egrep 'WAITING|STARTED'"
 
-	wait_result $SINGLEMDS "$cmd" "" $timeout ||
+	wait_update_facet --verbose mds1 "$cmd" "" $timeout ||
 		error "requests did not complete"
 }
 
diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh
index f19d464..bddd2ba 100644
--- a/lustre/tests/sanity-lfsck.sh
+++ b/lustre/tests/sanity-lfsck.sh
@@ -1381,9 +1381,8 @@ test_11b() {
 	local proc_path="${FSNAME}-OST0000-osc-MDT0000"
 	local seq=$(do_facet mds1 $LCTL get_param -n \
 		    osp.${proc_path}.prealloc_last_seq)
-	local lastid1=$(do_facet ost1 "lctl get_param -n \
-		obdfilter.${ost1_svc}.last_id" | grep $seq |
-		awk -F: '{ print $2 }')
+	local id_used=$(do_facet mds1 $LCTL get_param -n \
+			osp.${proc_path}.prealloc_last_id)
 
 	umount_client $MOUNT
 	stop ost1 || error "(1) Fail to stop ost1"
@@ -1395,23 +1394,23 @@ test_11b() {
 		error "(2) Fail to start ost1"
 
 	for ((i = 0; i < 60; i++)); do
-		lastid2=$(do_facet ost1 "lctl get_param -n \
-			obdfilter.${ost1_svc}.last_id" | grep $seq |
-			awk -F: '{ print $2 }')
-		[ ! -z $lastid2 ] && break;
+		id_ost1=$(do_facet ost1 \
+			  "$LCTL get_param -n obdfilter.$ost1_svc.last_id" |
+			  awk -F: "/$seq/ { print \$2 }")
+		[ -n "$id_ost1" ] && break
 		sleep 1
 	done
 
 	echo "the on-disk LAST_ID should be smaller than the expected one"
-	[ $lastid1 -gt $lastid2 ] ||
-		error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]"
+	[ $id_used -gt $id_ost1 ] ||
+		error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'"
 
 	echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID"
 	$START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!"
 
-	wait_update_facet ost1 "$LCTL get_param -n \
-		obdfilter.${OST_DEV}.lfsck_layout |
-		awk '/^status/ { print \\\$2 }'" "completed" 32 || {
+	wait_update_facet ost1 \
+		"$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout |
+		 awk '/^status/ { print \\\$2 }'" "completed" 32 || {
 		$SHOW_LAYOUT_ON_OST
 		error "(6) unexpected status"
 	}
@@ -1422,12 +1421,12 @@ test_11b() {
 		error "(8) Fail to start ost1"
 
 	echo "the on-disk LAST_ID should have been rebuilt"
-	wait_update_facet ost1 "$LCTL get_param -n \
-		obdfilter.${ost1_svc}.last_id | grep $seq |
-		awk -F: '{ print \\\$2 }'" "$lastid1" 60 || {
-		do_facet ost1 $LCTL get_param -n \
-		obdfilter.${ost1_svc}.last_id
-		error "(9) expect lastid1 $seq:$lastid1"
+	# last_id may be larger than $id_used if objects were created/skipped
+	wait_update_facet_cond ost1 \
+		"$LCTL get_param -n obdfilter.$ost1_svc.last_id |
+		 awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || {
+		do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id
+		error "(9) expect last_id >= id_used $seq:$id_used"
 	}
 
 	do_facet ost1 $LCTL set_param fail_loc=0
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 34c8753..31b36c7 100755
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -8769,9 +8769,8 @@ test_77k() { # LU-10906
 	local i
 
 	[ "$ORIG_CSUM" ] || ORIG_CSUM=$(eval $get_checksum)
-	stack_trap "wait_update $HOSTNAME '$get_checksum' $ORIG_CSUM" EXIT
-	stack_trap "do_facet mgs $LCTL set_param -P $cksum_param=$ORIG_CSUM" \
-		EXIT
+	stack_trap "wait_update $HOSTNAME '$get_checksum' $ORIG_CSUM || true"
+	stack_trap "do_facet mgs $LCTL set_param -P $cksum_param=$ORIG_CSUM"
 
 	for i in 0 1; do
 		do_facet mgs $LCTL set_param -P $cksum_param=$i ||
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index fcbd4d1..84d2671 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -2983,54 +2983,103 @@ cleanup_check() {
 	return 0
 }
 
-wait_update () {
+##
+# wait for a command to return the expected result
+#
+# This will run @check on @node repeatedly until the output matches @expect
+# based on the supplied condition, or until @max_wait seconds have elapsed,
+# whichever comes first.  @cond may be one of the normal bash operators,
+# "-gt", "-ge", "-eq", "-le", "-lt", "==", "!=", or "=~", and must be quoted
+# in the caller to avoid unintentional evaluation by the shell in the caller.
+#
+# If @max_wait is not specified, the condition will be checked for up to 90s.
+#
+# If --verbose is passed as the first argument, the result is printed on each
+# value change, otherwise it is only printed after every 10s interval.
+#
+# Using wait_update_cond() or related helper function is preferable to adding
+# a "long enough" wait for some state to change in the background, since
+# "long enough" may be too short due to tunables, system config, or running in
+# a VM, and must by necessity wait too long for most cases or risk failure.
+#
+# usage: wait_update_cond [--verbose] node check cond expect [max_wait]
+wait_update_cond() {
 	local verbose=false
-	if [[ "$1" == "--verbose" ]]; then
-		shift
-		verbose=true
-	fi
+	[[ "$1" == "--verbose" ]] && verbose=true && shift
 
 	local node=$1
-	local TEST=$2
-	local FINAL=$3
-	local MAX=${4:-90}
-	local RESULT
-	local PREV_RESULT
-	local WAIT=0
+	local check="$2"
+	local cond="$3"
+	local expect="$4"
+	local max_wait=${5:-90}
+	local result
+	local prev_result
+	local waited=0
+	local begin=$SECONDS
 	local sleep=1
 	local print=10
 
-	PREV_RESULT=$(do_node $node "$TEST")
-	while [ true ]; do
-		RESULT=$(do_node $node "$TEST")
-		if [[ "$RESULT" == "$FINAL" ]]; then
-			[[ -z "$RESULT" || $WAIT -le $sleep ]] ||
-				echo "Updated after ${WAIT}s: wanted '$FINAL'"\
-				     "got '$RESULT'"
+	while (( $waited <= $max_wait )); do
+		result=$(do_node $node "$check")
+
+		eval [[ "'$result'" $cond "'$expect'" ]]
+		if [[ $? == 0 ]]; then
+			[[ -z "$result" || $waited -le $sleep ]] ||
+				echo "Updated after ${waited}s: want '$expect' got '$result'"
 			return 0
 		fi
-		if [[ $verbose && "$RESULT" != "$PREV_RESULT" ]]; then
-			echo "Changed after ${WAIT}s: from '$PREV_RESULT'"\
-			     "to '$RESULT'"
-			PREV_RESULT=$RESULT
+		if $verbose && [[ "$result" != "$prev_result" ]]; then
+			[[ -n "$prev_result" ]] &&
+				echo "Changed after ${waited}s: from '$prev_result' to '$result'"
+			prev_result="$result"
 		fi
-		[[ $WAIT -ge $MAX ]] && break
-		[[ $((WAIT % print)) -eq 0 ]] &&
-			echo "Waiting $((MAX - WAIT)) secs for update"
-		WAIT=$((WAIT + sleep))
+		(( $waited % $print == 0 )) &&
+			echo "Waiting $((max_wait - waited))s for '$expect'"
 		sleep $sleep
+		waited=$((SECONDS - begin))
 	done
-	echo "Update not seen after ${MAX}s: wanted '$FINAL' got '$RESULT'"
+	echo "Update not seen after ${max_wait}s: want '$expect' got '$result'"
 	return 3
 }
 
+# usage: wait_update [--verbose] node check expect [max_wait]
+wait_update() {
+	local verbose=
+	[ "$1" = "--verbose" ] && verbose="$1" && shift
+
+	local node="$1"
+	local check="$2"
+	local expect="$3"
+	local max_wait=$4
+
+	wait_update_cond $verbose $node "$check" "==" "$expect" $max_wait
+}
+
+# usage: wait_update_facet_cond [--verbose] facet check cond expect [max_wait]
+wait_update_facet_cond() {
+	local verbose=
+	[ "$1" = "--verbose" ] && verbose="$1" && shift
+
+	local node=$(facet_active_host $1)
+	local check="$2"
+	local cond="$3"
+	local expect="$4"
+	local max_wait=$5
+
+	wait_update_cond $verbose $node "$check" "$cond" "$expect" $max_wait
+}
+
+# usage: wait_update_facet [--verbose] facet check expect [max_wait]
 wait_update_facet() {
 	local verbose=
 	[ "$1" = "--verbose" ] && verbose="$1" && shift
 
-	local facet=$1
-	shift
-	wait_update $verbose $(facet_active_host $facet) "$@"
+	local node=$(facet_active_host $1)
+	local check="$2"
+	local expect="$3"
+	local max_wait=$4
+
+	wait_update_cond $verbose $node "$check" "==" "$expect" $max_wait
 }
 
 sync_all_data() {
@@ -10270,20 +10319,15 @@ mdts_set_param() {
 	return $rc
 }
 
-wait_result() {
-	local facet=$1
-	shift
-	wait_update --verbose $(facet_active_host $facet) "$@"
-}
-
 mdts_check_param() {
 	local key="$1"
 	local target="$2"
 	local timeout="$3"
 	local mdtno
+
 	for mdtno in $(seq 1 $MDSCOUNT); do
 		local idx=$(($mdtno - 1))
-		wait_result mds${mdtno} \
+		wait_update_facet --verbose mds${mdtno} \
 			"$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \
 			$timeout ||
 			error "$key state is not '$target' on mds${mdtno}"
@@ -10334,7 +10378,7 @@ wait_request_state() {
 	local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions"
 	cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d="
 
-	wait_result $mds "$cmd" "$state" 200 ||
+	wait_update_facet --verbose $mds "$cmd" "$state" 200 ||
 		error "request on $fid is not $state on $mds"
 }
 
-- 
1.8.3.1