Whamcloud - gitweb
LU-357 racer test cleanup
authorhongchao.zhang <hongchao.zhang@whamcloud.com>
Thu, 23 Jun 2011 03:24:31 +0000 (11:24 +0800)
committerOleg Drokin <green@whamcloud.com>
Sun, 23 Oct 2011 02:57:10 +0000 (22:57 -0400)
 1. Increase the test time to 300s (900s for SLOW).
 2. Fix the problem of racer.sh being called recursively.

Change-Id: I91ac7e5c42ed5bc98b3a647c30d7e37af0573f09
Signed-off-by: Hongchao Zhang <hongchao.zhang@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/905
Reviewed-by: Yu Jian <yujian@whamcloud.com>
Reviewed-by: Fan Yong <yong.fan@whamcloud.com>
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/Makefile.am
lustre/tests/acceptance-small.sh
lustre/tests/racer.sh
lustre/tests/racer/racer.sh
lustre/tests/runracer [deleted file]
lustre/tests/test-groups/regression

index e93a437..be7b4ac 100644 (file)
@@ -13,7 +13,7 @@ noinst_SCRIPTS += sanity.sh rundbench acceptance-small.sh compile.sh
 noinst_SCRIPTS += conf-sanity.sh insanity.sh lfsck.sh oos.sh oos2.sh
 noinst_SCRIPTS += llog-test.sh recovery-small.sh replay-dual.sh sanity-quota.sh
 noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityn.sh
-noinst_SCRIPTS += large-scale.sh racer.sh runracer replay-vbr.sh
+noinst_SCRIPTS += large-scale.sh racer.sh replay-vbr.sh
 noinst_SCRIPTS += performance-sanity.sh mdsrate-create-small.sh
 noinst_SCRIPTS += mdsrate-create-large.sh mdsrate-lookup-1dir.sh
 noinst_SCRIPTS += mdsrate-lookup-10dirs.sh sanity-benchmark.sh
index 0709852..e76985c 100755 (executable)
@@ -9,7 +9,7 @@ export OSKIPPED=0
 
 # This is the default set of tests to run.
 DEFAULT_SUITES="runtests sanity sanity-benchmark sanityn lfsck liblustre
-                runracer replay-single conf-sanity recovery-small
+                racer replay-single conf-sanity recovery-small
                 replay-ost-single replay-dual replay-vbr insanity sanity-quota
                 sanity-sec sanity-gss performance-sanity large-scale
                 recovery-mds-scale recovery-double-scale recovery-random-scale
index f862012..66596ae 100644 (file)
@@ -1,3 +1,62 @@
 #!/bin/bash
+#set -vx
+set -e
 
-bash $(dirname $0)/runracer $@
+ONLY=${ONLY:-"$*"}
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
+
+racer=$LUSTRE/tests/racer/racer.sh
+echo racer: $racer
+
+CLIENTS=${CLIENTS:-$HOSTNAME}
+RACERDIRS=${RACERDIRS:-$DIR}
+echo RACERDIRS=$RACERDIRS
+for d in ${RACERDIRS}; do
+       RDIRS="$RDIRS $d/racer"
+       mkdir -p $d/racer
+#      lfs setstripe $d/racer -c -1
+done
+
+DURATION=${DURATION:-900}
+[ "$SLOW" = "no" ] && DURATION=300
+
+build_test_filter
+check_and_setup_lustre
+
+# run racer
+test_1() {
+    # Launch one background racer run per directory listed in RDIRS (via
+    # do_nodes on $clients), wait for each of them, and return the number
+    # of runs that exited non-zero (0 means every run succeeded).
+    local rrc=0
+    local rc=0
+    local pid
+    local rdir
+    local clients=${CLIENTS:-$(hostname)}
+
+    check_progs_installed $clients $racer || \
+        { skip_env "$racer not found" && return 0; }
+
+    local rpids=""
+    for rdir in $RDIRS; do
+        do_nodes $clients "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" &
+        pid=$!
+        rpids="$rpids $pid"
+    done
+
+    echo racers pids: $rpids
+    for pid in $rpids; do
+        # This script enables "set -e"; if errexit is in effect here, a bare
+        # failing "wait" would abort before its status could be recorded.
+        # Capturing it through "||" keeps the failure countable either way.
+        rc=0
+        wait $pid || rc=$?
+        echo "pid=$pid rc=$rc"
+        if [ $rc != 0 ]; then
+            rrc=$((rrc + 1))
+        fi
+    done
+
+    return $rrc
+}
+run_test 1 "racer on clients: ${CLIENTS:-$(hostname)} DURATION=$DURATION"
+
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
index 1274d02..efd3bcd 100755 (executable)
@@ -1,4 +1,5 @@
 #!/bin/bash
+#set -x
 
 MAX_FILES=${MAX_FILES:-20}
 DIR=${DIR:-$1}
@@ -10,23 +11,52 @@ NUM_THREADS=${NUM_THREADS:-3}
 
 mkdir -p $DIR
 
-RACER_PROGS="file_create dir_create file_rm file_rename file_link file_symlink
+RACER_PROGS="file_create dir_create file_rm file_rename file_link file_symlink \
 file_list file_concat"
 
 racer_cleanup()
 {
+       echo "racer cleanup"
        for P in $RACER_PROGS; do
                killall $P.sh
        done
        trap 0
+
+       local TOT_WAIT=0
+       local MAX_WAIT=$DURATION
+       local SHORT_WAIT=5
+
+       local rc
+       while [[ $TOT_WAIT -le $MAX_WAIT ]]; do
+               rc=0
+               echo sleeping $SHORT_WAIT sec ...
+               sleep $SHORT_WAIT
+               # this only checks whether processes exist
+               for P in $RACER_PROGS; do
+                       killall -0 $P.sh
+                       [[ $? -eq 0 ]] && (( rc+=1 ))
+               done
+               if [[ $rc -eq 0 ]]; then
+                       echo there should be NO racer processes:
+                       ps aux | grep -E "${RACER_PROGS// /|}"
+                       return 0
+               fi
+               echo -n "Waited $(( TOT_WAIT + SHORT_WAIT)), rc=$rc "
+               (( SHORT_WAIT+=SHORT_WAIT ))
+               (( TOT_WAIT+=SHORT_WAIT ))
+       done
+       ps aux | grep -E "${RACER_PROGS// /|}"
+       return 1
 }
 
+RC=0
+
 echo "Running $0 for $DURATION seconds. CTRL-C to exit"
 trap "
        echo \"Cleaning up\" 
        racer_cleanup
        exit 0
-" 2 15
+" INT TERM
 
 cd `dirname $0`
 for N in `seq 1 $NUM_THREADS`; do
@@ -36,11 +66,11 @@ for N in `seq 1 $NUM_THREADS`; do
 done
 
 sleep $DURATION
-racer_cleanup
+racer_cleanup || RC=$?
 
 # Check our to see whether our test DIR is still available.
 df $DIR
-RC=$?
+(( RC+=$? ))
 if [ $RC -eq 0 ]; then
     echo "We survived $0 for $DURATION seconds."
 fi
diff --git a/lustre/tests/runracer b/lustre/tests/runracer
deleted file mode 100644 (file)
index 344fed2..0000000
+++ /dev/null
@@ -1,163 +0,0 @@
-#!/bin/bash
-#set -vx
-set -e
-
-ONLY=${ONLY:-"$*"}
-LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
-. $LUSTRE/tests/test-framework.sh
-init_test_env $@
-. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
-init_logging
-
-racer=`which racer.sh 2> /dev/null`
-echo racer: $racer
-[ -z "$racer" ] && echo racer is not installed && exit 1
-
-CLIENTS=${CLIENTS:-$HOSTNAME}
-RACERDIRS=${RACERDIRS:-$DIR}
-echo RACERDIRS=$RACERDIRS
-for d in ${RACERDIRS}; do
-       RDIRS="$RDIRS $d/racer"
-       mkdir -p $d/racer
-#      lfs setstripe $d/racer -c -1
-done
-
-DURATION=${DURATION:-120}
-PIDFILE=$TMP/racer.$$
-
-assert_env CLIENTS
-
-timer_on () {
-       sleep $1 && kill -s ALRM $$ &
-       TIMERPID=$!
-       echo TIMERPID=$TIMERPID
-}
-
-do_racer_cleanup () {
-       trap 0
-
-       local WAIT=0
-       local INTERVAL=5
-        local pids
-       local rc=0
-       local TMAX
-
-       local RDIR=$1
-
-       echo "DOING RACER CLEANUP ... "
-
-       # Check if all processes are killed
-
-       local clients=$CLIENTS
-       local num_clients=$(get_node_count ${clients//,/ })
-
-        if at_is_enabled; then
-            TMAX=$(at_max_get mds)
-        else
-            TMAX=$(lctl get_param -n timeout)
-        fi
-
-       [ $TMAX -gt $((num_clients * 60)) ] || TMAX=$((num_clients * 60))
-       # 1.Let chance to racer to kill all it's processes
-       # FIXME: not sure how long does it take for racer to kill all processes
-       # 80 is sometimes are enough for 2 clients; sometimes it takes more than 150 sec
-       while [ $WAIT -lt $TMAX ]; do
-               running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true)
-               [ -z "$running" ] && rc=0 && break
-               echo "clients $clients are still running the racer processes. Waited $WAIT secs"
-               echo $running
-               rc=1
-               [ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL))
-               sleep $INTERVAL
-               WAIT=$((WAIT + INTERVAL))
-       done
-
-       # 2. Kill the remaining processes
-       if [ $rc -ne 0 ]; then
-               for C in ${clients//,/ } ; do
-                       pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true)
-                       if [ ! -z "$pids" ]; then
-                               echo "client $C still running racer processes after $WAIT seconds. Killing $pids"
-                               do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)"
-                               do_node $C kill -TERM $pids || true
-                               # let processes to be killed, there maybe many threads to be killed, so give 20 sec gap
-                               sleep 20
-       # 3. Check if the processes were killed
-       # exit error if the processes still exist
-                               for pid in $pids; do
-                                       do_node $C "ps -P $pid" && RC=1 || true
-                               done
-                       else
-                               echo "All processes on client $C exited after $WAIT seconds. OK."
-                       fi
-               done
-       else
-               echo "No racer processes running after $WAIT seconds. OK."
-               wait_remote_prog $racer 10
-       fi
-}
-
-racer_cleanup () {
-       if [ "$timeout" == "timeout" ]; then
-               echo $timeout killing RACERPID=$RACERPID
-               kill $RACERPID || true
-               sleep 2 # give chance racer to kill it's processes
-               local dir
-               for dir in $RDIRS; do
-                       do_racer_cleanup $dir
-               done
-       else
-               echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
-               kill $TIMERPID || true
-               for dir in $RDIRS; do
-                       do_racer_cleanup $dir
-               done
-       fi
-}
-
-racer_timeout () {
-       timeout="timeout"
-       RACERPID=$(cat $PIDFILE)
-       rm -f $PIDFILE
-       racer_cleanup
-       echo "$0: completed $RC"
-       return $RC
-}
-
-build_test_filter
-check_and_setup_lustre
-trap racer_timeout ALRM
-
-# run racer
-test_1() {
-    RC=0
-
-    timer_on $((DURATION + 5))
-
-    RACERPID=""
-    for rdir in $RDIRS; do
-        do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" &
-        pid=$!
-        RACERPID="$RACERPID $pid"
-    done
-
-    echo RACERPID=$RACERPID
-    echo $RACERPID > $PIDFILE
-    for rpid in $RACERPID; do
-        wait $rpid
-        rc=$?
-        echo "rpid=$rpid rc=$rc"
-        if [ $rc != 0 ]; then
-                RC=$((RC + 1))
-        fi
-    done
-
-    racer_cleanup
-
-    return $RC
-}
-run_test 1 "racer on clients: $CLIENTS DURATION=$DURATION"
-
-complete $(basename $0) $SECONDS
-check_and_cleanup_lustre
-exit_status
index 849ee8a..53d9235 100644 (file)
@@ -4,7 +4,7 @@ sanity-benchmark
 sanityn
 lfsck
 liblustre
-runracer
+racer
 replay-single
 conf-sanity
 recovery-small