From a2e38010077df3d95c9942d4f9cde829f75e7267 Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Thu, 28 Feb 2019 17:54:41 -0700 Subject: [PATCH] LU-5338 tests: sanity-lfsck 11b allow larger last_id In sanity-lfsck test_11b allow the OST to report the last_id larger than the previously-used OID on the MDS. This may happen if the OST preallocates additional objects or skips some objects during recovery. Add wait_update_cond() and wait_update_facet_cond() to allow checking an arbitrary condition against the expected result rather than only exactly equal to the expected value. Remove the wait_result() helper function, which is essentially the same thing as wait_update_facet(), and only has a few users. Test-Parameters: trivial mdscount=2 mdtcount=4 testlist=sanity-lfsck Test-Parameters: fstype=zfs mdscount=2 mdtcount=4 testlist=sanity-lfsck Signed-off-by: Andreas Dilger Change-Id: I1443d16699d8115fce664a331134ca6076ecab07 Reviewed-on: https://review.whamcloud.com/34349 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Hongchao Zhang Reviewed-by: Yang Sheng Reviewed-by: Oleg Drokin --- lustre/tests/sanity-hsm.sh | 2 +- lustre/tests/sanity-lfsck.sh | 35 ++++++------ lustre/tests/sanity.sh | 5 +- lustre/tests/test-framework.sh | 120 ++++++++++++++++++++++++++++------------- 4 files changed, 102 insertions(+), 60 deletions(-) diff --git a/lustre/tests/sanity-hsm.sh b/lustre/tests/sanity-hsm.sh index 6d377d9..499f96e 100755 --- a/lustre/tests/sanity-hsm.sh +++ b/lustre/tests/sanity-hsm.sh @@ -506,7 +506,7 @@ wait_all_done() { [[ -n $fid ]] && cmd+=" | grep '$fid'" cmd+=" | egrep 'WAITING|STARTED'" - wait_result $SINGLEMDS "$cmd" "" $timeout || + wait_update_facet --verbose mds1 "$cmd" "" $timeout || error "requests did not complete" } diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index f19d464..bddd2ba 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -1381,9 +1381,8 @@ test_11b() { local proc_path="${FSNAME}-OST0000-osc-MDT0000" local seq=$(do_facet mds1 $LCTL get_param -n \ osp.${proc_path}.prealloc_last_seq) - local lastid1=$(do_facet ost1 "lctl get_param -n \ - obdfilter.${ost1_svc}.last_id" | grep $seq | - awk -F: '{ print $2 }') + local id_used=$(do_facet mds1 $LCTL get_param -n \ + osp.${proc_path}.prealloc_last_id) umount_client $MOUNT stop ost1 || error "(1) Fail to stop ost1" @@ -1395,23 +1394,23 @@ test_11b() { error "(2) Fail to start ost1" for ((i = 0; i < 60; i++)); do - lastid2=$(do_facet ost1 "lctl get_param -n \ - obdfilter.${ost1_svc}.last_id" | grep $seq | - awk -F: '{ print $2 }') - [ ! -z $lastid2 ] && break; + id_ost1=$(do_facet ost1 \ + "$LCTL get_param -n obdfilter.$ost1_svc.last_id" | + awk -F: "/$seq/ { print \$2 }") + [ -n "$id_ost1" ] && break sleep 1 done echo "the on-disk LAST_ID should be smaller than the expected one" - [ $lastid1 -gt $lastid2 ] || - error "(4) expect lastid1 [ $lastid1 ] > lastid2 [ $lastid2 ]" + [ $id_used -gt $id_ost1 ] || + error "(4) expect id_used '$id_used' > id_ost1 '$id_ost1'" echo "trigger LFSCK for layout on ost1 to rebuild the on-disk LAST_ID" $START_LAYOUT_ON_OST -r || error "(5) Fail to start LFSCK on OST!" - wait_update_facet ost1 "$LCTL get_param -n \ - obdfilter.${OST_DEV}.lfsck_layout | - awk '/^status/ { print \\\$2 }'" "completed" 32 || { + wait_update_facet ost1 \ + "$LCTL get_param -n obdfilter.$ost1_svc.lfsck_layout | + awk '/^status/ { print \\\$2 }'" "completed" 32 || { $SHOW_LAYOUT_ON_OST error "(6) unexpected status" } @@ -1422,12 +1421,12 @@ test_11b() { error "(8) Fail to start ost1" echo "the on-disk LAST_ID should have been rebuilt" - wait_update_facet ost1 "$LCTL get_param -n \ - obdfilter.${ost1_svc}.last_id | grep $seq | - awk -F: '{ print \\\$2 }'" "$lastid1" 60 || { - do_facet ost1 $LCTL get_param -n \ - obdfilter.${ost1_svc}.last_id - error "(9) expect lastid1 $seq:$lastid1" + # last_id may be larger than $id_used if objects were created/skipped + wait_update_facet_cond ost1 \ + "$LCTL get_param -n obdfilter.$ost1_svc.last_id | + awk -F: '/$seq/ { print \\\$2 }'" "-ge" "$id_used" 60 || { + do_facet ost1 $LCTL get_param obdfilter.$ost1_svc.last_id + error "(9) expect last_id >= id_used $seq:$id_used" } do_facet ost1 $LCTL set_param fail_loc=0 diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 34c8753..31b36c7 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -8769,9 +8769,8 @@ test_77k() { # LU-10906 local i [ "$ORIG_CSUM" ] || ORIG_CSUM=$(eval $get_checksum) - stack_trap "wait_update $HOSTNAME '$get_checksum' $ORIG_CSUM" EXIT - stack_trap "do_facet mgs $LCTL set_param -P $cksum_param=$ORIG_CSUM" \ - EXIT + stack_trap "wait_update $HOSTNAME '$get_checksum' $ORIG_CSUM || true" + stack_trap "do_facet mgs $LCTL set_param -P $cksum_param=$ORIG_CSUM" for i in 0 1; do do_facet mgs $LCTL set_param -P $cksum_param=$i || diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index fcbd4d1..84d2671 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2983,54 +2983,103 @@ cleanup_check() { return 0 } -wait_update () { +## +# wait for a command to return the expected result +# +# This will run @check on @node repeatedly until the output matches @expect +# based on the supplied condition, or until @max_wait seconds have elapsed, +# whichever comes first. @cond may be one of the normal bash operators, +# "-gt", "-ge", "-eq", "-le", "-lt", "==", "!=", or "=~", and must be quoted +# in the caller to avoid unintentional evaluation by the shell in the caller. +# +# If @max_wait is not specified, the condition will be checked for up to 90s. +# +# If --verbose is passed as the first argument, the result is printed on each +# value change, otherwise it is only printed after every 10s interval. +# +# Using wait_update_cond() or related helper function is preferable to adding +# a "long enough" wait for some state to change in the background, since +# "long enough" may be too short due to tunables, system config, or running in +# a VM, and must by necessity wait too long for most cases or risk failure. +# +# usage: wait_update_cond [--verbose] node check cond expect [max_wait] +wait_update_cond() { local verbose=false - if [[ "$1" == "--verbose" ]]; then - shift - verbose=true - fi + [[ "$1" == "--verbose" ]] && verbose=true && shift local node=$1 - local TEST=$2 - local FINAL=$3 - local MAX=${4:-90} - local RESULT - local PREV_RESULT - local WAIT=0 + local check="$2" + local cond="$3" + local expect="$4" + local max_wait=${5:-90} + local result + local prev_result + local waited=0 + local begin=$SECONDS local sleep=1 local print=10 - PREV_RESULT=$(do_node $node "$TEST") - while [ true ]; do - RESULT=$(do_node $node "$TEST") - if [[ "$RESULT" == "$FINAL" ]]; then - [[ -z "$RESULT" || $WAIT -le $sleep ]] || - echo "Updated after ${WAIT}s: wanted '$FINAL'"\ - "got '$RESULT'" + while (( $waited <= $max_wait )); do + result=$(do_node $node "$check") + + eval [[ "'$result'" $cond "'$expect'" ]] + if [[ $? == 0 ]]; then + [[ -z "$result" || $waited -le $sleep ]] || + echo "Updated after ${waited}s: want '$expect' got '$result'" return 0 fi - if [[ $verbose && "$RESULT" != "$PREV_RESULT" ]]; then - echo "Changed after ${WAIT}s: from '$PREV_RESULT'"\ - "to '$RESULT'" - PREV_RESULT=$RESULT + if $verbose && [[ "$result" != "$prev_result" ]]; then + [[ -n "$prev_result" ]] && + echo "Changed after ${waited}s: from '$prev_result' to '$result'" + prev_result="$result" fi - [[ $WAIT -ge $MAX ]] && break - [[ $((WAIT % print)) -eq 0 ]] && - echo "Waiting $((MAX - WAIT)) secs for update" - WAIT=$((WAIT + sleep)) + (( $waited % $print == 0 )) && + echo "Waiting $((max_wait - waited))s for '$expect'" sleep $sleep + waited=$((SECONDS - begin)) done - echo "Update not seen after ${MAX}s: wanted '$FINAL' got '$RESULT'" + echo "Update not seen after ${max_wait}s: want '$expect' got '$result'" return 3 } +# usage: wait_update [--verbose] node check expect [max_wait] +wait_update() { + local verbose= + [ "$1" = "--verbose" ] && verbose="$1" && shift + + local node="$1" + local check="$2" + local expect="$3" + local max_wait=$4 + + wait_update_cond $verbose $node "$check" "==" "$expect" $max_wait +} + +# usage: wait_update_facet_cond [--verbose] facet check cond expect [max_wait] +wait_update_facet_cond() { + local verbose= + [ "$1" = "--verbose" ] && verbose="$1" && shift + + local node=$(facet_active_host $1) + local check="$2" + local cond="$3" + local expect="$4" + local max_wait=$5 + + wait_update_cond $verbose $node "$check" "$cond" "$expect" $max_wait +} + +# usage: wait_update_facet [--verbose] facet check expect [max_wait] wait_update_facet() { local verbose= [ "$1" = "--verbose" ] && verbose="$1" && shift - local facet=$1 - shift - wait_update $verbose $(facet_active_host $facet) "$@" + local node=$(facet_active_host $1) + local check="$2" + local expect="$3" + local max_wait=$4 + + wait_update_cond $verbose $node "$check" "==" "$expect" $max_wait } sync_all_data() { @@ -10270,20 +10319,15 @@ mdts_set_param() { return $rc } -wait_result() { - local facet=$1 - shift - wait_update --verbose $(facet_active_host $facet) "$@" -} - mdts_check_param() { local key="$1" local target="$2" local timeout="$3" local mdtno + for mdtno in $(seq 1 $MDSCOUNT); do local idx=$(($mdtno - 1)) - wait_result mds${mdtno} \ + wait_update_facet --verbose mds${mdtno} \ "$LCTL get_param -n $MDT_PREFIX${idx}.$key" "$target" \ $timeout || error "$key state is not '$target' on mds${mdtno}" @@ -10334,7 +10378,7 @@ wait_request_state() { local cmd="$LCTL get_param -n ${MDT_PREFIX}${mdtidx}.hsm.actions" cmd+=" | awk '/'$fid'.*action='$request'/ {print \\\$13}' | cut -f2 -d=" - wait_result $mds "$cmd" "$state" 200 || + wait_update_facet --verbose $mds "$cmd" "$state" 200 || error "request on $fid is not $state on $mds" } -- 1.8.3.1