#!/bin/bash
#
# Run racer.sh on every node in CLIENTS for DURATION seconds, once per
# target directory, then make sure no racer processes are left behind.
#
# Usage: <this script> [dir ...]
#   dir ...  whitespace-separated directories to run racer in; a "racer"
#            subdirectory is created in each one.  Defaults to $DIR.
#            Paths containing whitespace are not supported.
#
# Environment read here:
#   LUSTRE             root of the tree holding tests/test-framework.sh
#                      (default: parent of this script's directory)
#   CONFIG             config file to source
#                      (default: $LUSTRE/tests/cfg/$NAME.sh)
#   CLIENTS            comma-separated client list (default: $HOSTNAME)
#   DURATION           seconds to let racer run (default: 120)
#   NUM_RACER_THREADS  passed through to racer.sh as its second argument
#
# Exit status: number of racer invocations that returned non-zero, or 1 if
# processes had to be killed and were still present afterwards.

#set -vx
set -e

LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
. $LUSTRE/tests/test-framework.sh
init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}

# Under "set -e" a failed lookup would abort the script right here, before
# the diagnostic below could be printed; "|| true" keeps it reachable.
racer=$(which racer.sh) || true
if [ -z "$racer" ]; then
	echo racer is not installed
	exit 1
fi

CLIENTS=${CLIENTS:-$HOSTNAME}
RACERDIRS=$@
RACERDIRS=${RACERDIRS:-$DIR}
echo RACERDIRS=$RACERDIRS

# Build the list of per-directory racer work dirs and create them.
for d in ${RACERDIRS}; do
	RDIRS="$RDIRS $d/racer"
	mkdir -p $d/racer
#	lfs setstripe $d/racer -c -1
done

DURATION=${DURATION:-120}

assert_env CLIENTS

# Start a background timer that sends SIGALRM to this script.
# Arguments: $1 - delay in seconds
# Globals:   TIMERPID (written) - PID of the background timer job
timer_on () {
	sleep $1 && kill -s ALRM $$ &
	TIMERPID=$!
	echo TIMERPID=$TIMERPID
}

# Wait for racer processes working in a directory to exit on all clients,
# then kill whatever is still left.
# Arguments: $1 - racer directory; used as the grep pattern against "ps uax"
# Globals:   CLIENTS, racer (read); RC (set to 1 if a killed process is
#            still present after the kill)
do_racer_cleanup () {
	# Reset the EXIT trap to its default disposition.
	trap 0

	local WAIT=0
	local INTERVAL=5
	local pids
	local running
	local rc=0
	local RDIR=$1

	echo "DOING RACER CLEANUP ... "

	# Check if all processes are killed

	local clients=$CLIENTS

	# 1. Give racer a chance to kill all of its own processes.
	# FIXME: it is not known how long racer needs to kill all processes;
	# 80 sec is sometimes enough for 2 clients, sometimes it takes more
	# than 150 sec.
	while [ $WAIT -lt 90 ]; do
		running=$(do_nodes $clients "ps uax | grep $RDIR " |
			  grep -E -v "(acceptance|grep|pdsh|bash)" || true)
		if [ -z "$running" ]; then
			rc=0
			break
		fi
		echo "clients $clients are still running the racer processes. Waited $WAIT secs"
		echo $running
		rc=1
		# Back off: double the polling interval until it reaches 40 sec.
		if [ $INTERVAL -lt 40 ]; then
			INTERVAL=$((INTERVAL + INTERVAL))
		fi
		sleep $INTERVAL
		WAIT=$((WAIT + INTERVAL))
	done

	# 2. Kill the remaining processes
	if [ $rc -ne 0 ]; then
		for C in ${clients//,/ } ; do
			pids=$(do_node $C "ps uax | grep $RDIR " |
			       grep -E -v "(acceptance|grep|PATH)" |
			       awk '{print $2}' || true)
			if [ -n "$pids" ]; then
				echo "client $C still running racer processes after $WAIT seconds. Killing $pids"
				# Diagnostic listing only: the processes may have
				# exited since the check above, which must not
				# abort the cleanup under "set -e".
				do_node $C "ps uax | grep $RDIR " |
					grep -E -v "(acceptance|grep|PATH)" || true
				do_node $C kill -TERM $pids || true
				# give the processes time to die
				sleep 2
				# 3. Check if the processes were killed;
				# report an error if any of them still exists.
				# NOTE(review): "-P" (capital) is kept from the
				# original; confirm it was not meant to be "-p".
				for pid in $pids; do
					if do_node $C "ps -P $pid"; then
						RC=1
					fi
				done
			else
				echo "All processes on client $C exited after $WAIT seconds. OK."
			fi
		done
	else
		echo "No racer processes running after $WAIT seconds. OK."
		wait_remote_prog $racer 10
	fi
}

# Stop whichever side is still running (the racer jobs on timeout, the timer
# otherwise) and clean up every racer directory.
# Globals: timeout, RACERPID, TIMERPID, DURATION, RDIRS (read)
racer_cleanup () {
	local dir

	if [ "$timeout" == "timeout" ]; then
		echo $timeout killing RACERPID=$RACERPID
		kill $RACERPID || true
		sleep 2	# give racer a chance to kill its processes
	else
		echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
		# The timer job may already be gone; that must not abort the
		# cleanup under "set -e".
		kill $TIMERPID || true
	fi

	for dir in $RDIRS; do
		do_racer_cleanup $dir
	done
}

# SIGALRM handler: the timer expired, so clean up and exit with the
# accumulated status.
racer_timeout () {
	timeout="timeout"
	racer_cleanup
	echo "$0: completed $RC"
	exit $RC
}

# run racer
log "Start racer on clients: $CLIENTS DURATION=$DURATION"
RC=0

trap racer_timeout ALRM

timer_on $((DURATION + 5))

# Launch one background racer run per directory and remember the job PIDs.
RACERPID=""
for rdir in $RDIRS; do
	do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" &
	pid=$!
	RACERPID="$RACERPID $pid"
done

echo RACERPID=$RACERPID
for rpid in $RACERPID; do
	# Capture the status without tripping "set -e": a failing racer must be
	# counted in RC and still be followed by racer_cleanup.
	rc=0
	wait $rpid || rc=$?
	echo "rpid=$rpid rc=$rc"
	if [ $rc != 0 ]; then
		RC=$((RC + 1))
	fi
done

racer_cleanup
echo "$0: completed $RC"
exit $RC