#!/bin/bash
#
# Runs the external racer.sh stress script on a set of client nodes for a
# bounded duration, then makes sure every racer process has gone away.
#
# Environment read by this script (defaults shown where the script sets one):
#   ONLY               - defaults to the command-line arguments
#   LUSTRE             - defaults to the parent of this script's directory
#   CONFIG             - defaults to $LUSTRE/tests/cfg/$NAME.sh
#   CLIENTS            - comma-separated client list, defaults to $HOSTNAME
#   RACERDIRS          - space-separated directories, defaults to $DIR
#   DURATION           - seconds passed to racer.sh, defaults to 120
#   NUM_RACER_THREADS  - passed through to racer.sh as its second argument
#   TMP, TESTSUITELOG  - used for the pid file and the final FAIL check
#
# Helpers such as init_test_env, do_nodes, do_node, run_test, etc. come from
# the sourced test-framework.sh and are not defined here.

#set -vx
set -e

ONLY=${ONLY:-"$*"}
LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
. $LUSTRE/tests/test-framework.sh
init_test_env "$@"
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging

# "|| true" keeps errexit from aborting on a failed lookup so that the
# explicit diagnostic below is actually printed.
racer=$(which racer.sh 2> /dev/null) || true
echo racer: $racer
if [ -z "$racer" ]; then
	echo racer is not installed
	exit 1
fi

CLIENTS=${CLIENTS:-$HOSTNAME}
RACERDIRS=${RACERDIRS:-$DIR}
echo RACERDIRS=$RACERDIRS
# Build the list of per-filesystem racer working directories.
for d in ${RACERDIRS}; do
	RDIRS="$RDIRS $d/racer"
	mkdir -p "$d/racer"
#	lfs setstripe $d/racer -c -1
done

DURATION=${DURATION:-120}
PIDFILE=$TMP/racer.$$

assert_env CLIENTS

# Start a background timer that sends SIGALRM to this script after $1 seconds.
# Globals: TIMERPID (written) - pid of the background timer job.
timer_on () {
	sleep $1 && kill -s ALRM $$ &
	TIMERPID=$!
	echo TIMERPID=$TIMERPID
}

# Wait for processes whose "ps" line mentions directory $1 to disappear from
# all clients; kill whatever is left once the wait budget is exhausted.
# Arguments: $1 - racer working directory to look for in the process list
# Globals:   CLIENTS (read), racer (read), RC (set to 1 if a killed pid is
#            still reported by ps afterwards)
do_racer_cleanup () {
	# Reset the EXIT trap to its default disposition.
	trap 0

	local WAIT=0
	local INTERVAL=5
	local pids
	local rc=0
	local TMAX
	local running
	local C
	local pid

	local RDIR=$1

	echo "DOING RACER CLEANUP ... "

	# Check if all processes are killed

	local clients=$CLIENTS
	local num_clients=$(get_node_count ${clients//,/ })

	if at_is_enabled; then
		TMAX=$(at_max_get mds)
	else
		TMAX=$(lctl get_param -n timeout)
	fi

	# Wait at least 60 seconds per client, or longer if the timeout is larger.
	[ $TMAX -gt $((num_clients * 60)) ] || TMAX=$((num_clients * 60))

	# 1. Give racer a chance to kill all of its processes.
	# FIXME: not sure how long it takes for racer to kill all processes;
	# 80 sec is sometimes enough for 2 clients, sometimes it takes more
	# than 150 sec.
	while [ $WAIT -lt $TMAX ]; do
		running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true)
		[ -z "$running" ] && rc=0 && break
		echo "clients $clients are still running the racer processes. Waited $WAIT secs"
		echo $running
		rc=1
		# Back off: the interval doubles until it reaches 40 seconds.
		[ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL))
		sleep $INTERVAL
		WAIT=$((WAIT + INTERVAL))
	done

	# 2. Kill the remaining processes
	if [ $rc -ne 0 ]; then
		for C in ${clients//,/ } ; do
			pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true)
			if [ ! -z "$pids" ]; then
				# The message below contains an embedded newline; kept verbatim.
				echo "client $C still running racer processes after $WAIT seconds. 
Killing $pids"
				do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)"
				do_node $C kill -TERM $pids || true
				# Let the processes die; there may be many threads to
				# kill, so allow a 20 sec gap.
				sleep 20
				# 3. Check if the processes were killed;
				# flag an error if any of them still exists.
				# NOTE(review): "ps -P" may have been meant as "ps -p" - confirm.
				for pid in $pids; do
					do_node $C "ps -P $pid" && RC=1 || true
				done
			else
				echo "All processes on client $C exited after $WAIT seconds. OK."
			fi
		done
	else
		echo "No racer processes running after $WAIT seconds. OK."
		wait_remote_prog $racer 10
	fi
}

# Stop whichever side is still alive (racer jobs on timeout, the timer on
# normal completion) and run do_racer_cleanup for every racer directory.
# Globals: timeout, RACERPID, TIMERPID, RDIRS, DURATION, PIDFILE (read)
racer_cleanup () {
	local dir

	if [ "$timeout" == "timeout" ]; then
		echo $timeout killing RACERPID=$RACERPID
		kill $RACERPID || true
		sleep 2	# give racer a chance to kill its processes
		for dir in $RDIRS; do
			do_racer_cleanup $dir
		done
	else
		echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
		kill $TIMERPID || true
		# The timeout handler is the only other place that removes the pid
		# file; remove it here too so it is not left behind in $TMP.
		rm -f "$PIDFILE"
		for dir in $RDIRS; do
			do_racer_cleanup $dir
		done
	fi
}

# SIGALRM handler: marks the run as timed out, reloads the racer pids from
# the pid file, cleans up and returns the accumulated RC.
racer_timeout () {
	timeout="timeout"
	RACERPID=$(cat $PIDFILE)
	rm -f $PIDFILE
	racer_cleanup
	echo "$0: completed $RC"
	return $RC
}

build_test_filter
check_and_setup_lustre
trap racer_timeout ALRM

# run racer
# Launches one background racer job per directory in RDIRS, waits for each,
# and counts every job that finished with a non-zero status in RC.
test_1() {
	local rdir
	local pid
	local rpid
	local rc

	RC=0

	timer_on $((DURATION + 5))

	RACERPID=""
	for rdir in $RDIRS; do
		do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" &
		pid=$!
		RACERPID="$RACERPID $pid"
	done

	echo RACERPID=$RACERPID
	echo $RACERPID > $PIDFILE
	for rpid in $RACERPID; do
		# Capture the status without tripping errexit, so a failing racer
		# job is counted instead of aborting the script.
		rc=0
		wait $rpid || rc=$?
		echo "rpid=$rpid rc=$rc"
		if [ $rc != 0 ]; then
			RC=$((RC + 1))
		fi
	done

	racer_cleanup

	return $RC
}
run_test 1 "racer on clients: $CLIENTS DURATION=$DURATION"

equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
# Dump the suite log if present and exit non-zero when it records a FAIL.
if [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG; then
	exit 1
fi