From: hongchao.zhang Date: Thu, 23 Jun 2011 03:24:31 +0000 (+0800) Subject: LU-357 racer test cleanup X-Git-Tag: 2.1.51~6 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=dc64eb8e2f3e8e9abbaab2762babcdd5c8446c4c;hp=e296c2d9f5d8ff8a7efdcef75bc3e8d379307440 LU-357 racer test cleanup 1. Increase the test time to 300s (900s for SLOW). 2. Fix the problem of recursively calling racer.sh. Change-Id: I91ac7e5c42ed5bc98b3a647c30d7e37af0573f09 Signed-off-by: Hongchao Zhang Reviewed-on: http://review.whamcloud.com/905 Reviewed-by: Yu Jian Reviewed-by: Fan Yong Tested-by: Hudson Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index e93a437..be7b4ac 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -13,7 +13,7 @@ noinst_SCRIPTS += sanity.sh rundbench acceptance-small.sh compile.sh noinst_SCRIPTS += conf-sanity.sh insanity.sh lfsck.sh oos.sh oos2.sh noinst_SCRIPTS += llog-test.sh recovery-small.sh replay-dual.sh sanity-quota.sh noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityn.sh -noinst_SCRIPTS += large-scale.sh racer.sh runracer replay-vbr.sh +noinst_SCRIPTS += large-scale.sh racer.sh replay-vbr.sh noinst_SCRIPTS += performance-sanity.sh mdsrate-create-small.sh noinst_SCRIPTS += mdsrate-create-large.sh mdsrate-lookup-1dir.sh noinst_SCRIPTS += mdsrate-lookup-10dirs.sh sanity-benchmark.sh diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index 0709852..e76985c 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -9,7 +9,7 @@ export OSKIPPED=0 # This is the default set of tests to run. 
DEFAULT_SUITES="runtests sanity sanity-benchmark sanityn lfsck liblustre - runracer replay-single conf-sanity recovery-small + racer replay-single conf-sanity recovery-small replay-ost-single replay-dual replay-vbr insanity sanity-quota sanity-sec sanity-gss performance-sanity large-scale recovery-mds-scale recovery-double-scale recovery-random-scale diff --git a/lustre/tests/racer.sh b/lustre/tests/racer.sh index f862012..66596ae 100644 --- a/lustre/tests/racer.sh +++ b/lustre/tests/racer.sh @@ -1,3 +1,62 @@ #!/bin/bash +#set -vx +set -e -bash $(dirname $0)/runracer $@ +ONLY=${ONLY:-"$*"} +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ +. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging + +racer=$LUSTRE/tests/racer/racer.sh +echo racer: $racer + +CLIENTS=${CLIENTS:-$HOSTNAME} +RACERDIRS=${RACERDIRS:-$DIR} +echo RACERDIRS=$RACERDIRS +for d in ${RACERDIRS}; do + RDIRS="$RDIRS $d/racer" + mkdir -p $d/racer +# lfs setstripe $d/racer -c -1 +done + +DURATION=${DURATION:-900} +[ "$SLOW" = "no" ] && DURATION=300 + +build_test_filter +check_and_setup_lustre + +# run racer +test_1() { + local rrc=0 + local rc=0 + local clients=${CLIENTS:-$(hostname)} + + check_progs_installed $clients $racer || \ + { skip_env "$racer not found" && return 0; } + + local rpids="" + for rdir in $RDIRS; do + do_nodes $clients "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" & + pid=$! + rpids="$rpids $pid" + done + + echo racers pids: $rpids + for pid in $rpids; do + wait $pid + rc=$? 
+ echo "pid=$pid rc=$rc" + if [ $rc != 0 ]; then + rrc=$((rrc + 1)) + fi + done + + return $rrc +} +run_test 1 "racer on clients: ${CLIENTS:-$(hostname)} DURATION=$DURATION" + +complete $(basename $0) $SECONDS +check_and_cleanup_lustre +exit_status diff --git a/lustre/tests/racer/racer.sh b/lustre/tests/racer/racer.sh index 1274d02..efd3bcd 100755 --- a/lustre/tests/racer/racer.sh +++ b/lustre/tests/racer/racer.sh @@ -1,4 +1,5 @@ #!/bin/bash +#set -x MAX_FILES=${MAX_FILES:-20} DIR=${DIR:-$1} @@ -10,23 +11,52 @@ NUM_THREADS=${NUM_THREADS:-3} mkdir -p $DIR -RACER_PROGS="file_create dir_create file_rm file_rename file_link file_symlink +RACER_PROGS="file_create dir_create file_rm file_rename file_link file_symlink \ file_list file_concat" racer_cleanup() { + echo "racer cleanup" for P in $RACER_PROGS; do killall $P.sh done trap 0 + + local TOT_WAIT=0 + local MAX_WAIT=$DURATION + local SHORT_WAIT=5 + + local rc + while [[ $TOT_WAIT -le $MAX_WAIT ]]; do + rc=0 + echo sleeping $SHORT_WAIT sec ... + sleep $SHORT_WAIT + # this only checks whether processes exist + for P in $RACER_PROGS; do + killall -0 $P.sh + [[ $? -eq 0 ]] && (( rc+=1 )) + done + if [[ $rc -eq 0 ]]; then + echo there should be NO racer processes: + ps aux | grep -E "${RACER_PROGS// /|}" + return 0 + fi + echo -n "Waited $(( TOT_WAIT + SHORT_WAIT)), rc=$rc " + (( SHORT_WAIT+=SHORT_WAIT )) + (( TOT_WAIT+=SHORT_WAIT )) + done + ps aux | grep -E "${RACER_PROGS// /|}" + return 1 } +RC=0 + echo "Running $0 for $DURATION seconds. CTRL-C to exit" trap " echo \"Cleaning up\" racer_cleanup exit 0 -" 2 15 +" INT TERM cd `dirname $0` for N in `seq 1 $NUM_THREADS`; do @@ -36,11 +66,11 @@ for N in `seq 1 $NUM_THREADS`; do done sleep $DURATION -racer_cleanup +racer_cleanup || RC=$? # Check our to see whether our test DIR is still available. df $DIR -RC=$? +(( RC+=$? )) if [ $RC -eq 0 ]; then echo "We survived $0 for $DURATION seconds." 
fi diff --git a/lustre/tests/runracer b/lustre/tests/runracer deleted file mode 100644 index 344fed2..0000000 --- a/lustre/tests/runracer +++ /dev/null @@ -1,163 +0,0 @@ -#!/bin/bash -#set -vx -set -e - -ONLY=${ONLY:-"$*"} -LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} -. $LUSTRE/tests/test-framework.sh -init_test_env $@ -. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -init_logging - -racer=`which racer.sh 2> /dev/null` -echo racer: $racer -[ -z "$racer" ] && echo racer is not installed && exit 1 - -CLIENTS=${CLIENTS:-$HOSTNAME} -RACERDIRS=${RACERDIRS:-$DIR} -echo RACERDIRS=$RACERDIRS -for d in ${RACERDIRS}; do - RDIRS="$RDIRS $d/racer" - mkdir -p $d/racer -# lfs setstripe $d/racer -c -1 -done - -DURATION=${DURATION:-120} -PIDFILE=$TMP/racer.$$ - -assert_env CLIENTS - -timer_on () { - sleep $1 && kill -s ALRM $$ & - TIMERPID=$! - echo TIMERPID=$TIMERPID -} - -do_racer_cleanup () { - trap 0 - - local WAIT=0 - local INTERVAL=5 - local pids - local rc=0 - local TMAX - - local RDIR=$1 - - echo "DOING RACER CLEANUP ... " - - # Check if all processes are killed - - local clients=$CLIENTS - local num_clients=$(get_node_count ${clients//,/ }) - - if at_is_enabled; then - TMAX=$(at_max_get mds) - else - TMAX=$(lctl get_param -n timeout) - fi - - [ $TMAX -gt $((num_clients * 60)) ] || TMAX=$((num_clients * 60)) - # 1.Let chance to racer to kill all it's processes - # FIXME: not sure how long does it take for racer to kill all processes - # 80 is sometimes are enough for 2 clients; sometimes it takes more than 150 sec - while [ $WAIT -lt $TMAX ]; do - running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true) - [ -z "$running" ] && rc=0 && break - echo "clients $clients are still running the racer processes. Waited $WAIT secs" - echo $running - rc=1 - [ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL)) - sleep $INTERVAL - WAIT=$((WAIT + INTERVAL)) - done - - # 2. 
Kill the remaining processes - if [ $rc -ne 0 ]; then - for C in ${clients//,/ } ; do - pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true) - if [ ! -z "$pids" ]; then - echo "client $C still running racer processes after $WAIT seconds. Killing $pids" - do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" - do_node $C kill -TERM $pids || true - # let processes to be killed, there maybe many threads to be killed, so give 20 sec gap - sleep 20 - # 3. Check if the processes were killed - # exit error if the processes still exist - for pid in $pids; do - do_node $C "ps -P $pid" && RC=1 || true - done - else - echo "All processes on client $C exited after $WAIT seconds. OK." - fi - done - else - echo "No racer processes running after $WAIT seconds. OK." - wait_remote_prog $racer 10 - fi -} - -racer_cleanup () { - if [ "$timeout" == "timeout" ]; then - echo $timeout killing RACERPID=$RACERPID - kill $RACERPID || true - sleep 2 # give chance racer to kill it's processes - local dir - for dir in $RDIRS; do - do_racer_cleanup $dir - done - else - echo "Racer completed before DURATION=$DURATION expired. Cleaning up..." - kill $TIMERPID || true - for dir in $RDIRS; do - do_racer_cleanup $dir - done - fi -} - -racer_timeout () { - timeout="timeout" - RACERPID=$(cat $PIDFILE) - rm -f $PIDFILE - racer_cleanup - echo "$0: completed $RC" - return $RC -} - -build_test_filter -check_and_setup_lustre -trap racer_timeout ALRM - -# run racer -test_1() { - RC=0 - - timer_on $((DURATION + 5)) - - RACERPID="" - for rdir in $RDIRS; do - do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" & - pid=$! - RACERPID="$RACERPID $pid" - done - - echo RACERPID=$RACERPID - echo $RACERPID > $PIDFILE - for rpid in $RACERPID; do - wait $rpid - rc=$? 
- echo "rpid=$rpid rc=$rc" - if [ $rc != 0 ]; then - RC=$((RC + 1)) - fi - done - - racer_cleanup - - return $RC -} -run_test 1 "racer on clients: $CLIENTS DURATION=$DURATION" - -complete $(basename $0) $SECONDS -check_and_cleanup_lustre -exit_status diff --git a/lustre/tests/test-groups/regression b/lustre/tests/test-groups/regression index 849ee8a..53d9235 100644 --- a/lustre/tests/test-groups/regression +++ b/lustre/tests/test-groups/regression @@ -4,7 +4,7 @@ sanity-benchmark sanityn lfsck liblustre -runracer +racer replay-single conf-sanity recovery-small