#set -vx
set -e
+ONLY=${ONLY:-"$*"}
LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
. $LUSTRE/tests/test-framework.sh
init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+init_logging
-racer=`which racer.sh`
+racer=`which racer.sh 2> /dev/null`
+echo racer: $racer
[ -z "$racer" ] && echo racer is not installed && exit 1
CLIENTS=${CLIENTS:-$HOSTNAME}
-RDIR=$DIR/racer
-mkdir -p $RDIR
+RACERDIRS=${RACERDIRS:-$DIR}
+echo RACERDIRS=$RACERDIRS
+for d in ${RACERDIRS}; do
+ RDIRS="$RDIRS $d/racer"
+ mkdir -p $d/racer
+# lfs setstripe $d/racer -c -1
+done
+
DURATION=${DURATION:-120}
+PIDFILE=$TMP/racer.$$
assert_env CLIENTS
local INTERVAL=5
local pids
local rc=0
+ local TMAX
+
+ local RDIR=$1
echo "DOING RACER CLEANUP ... "
# Check if all processes are killed
local clients=$CLIENTS
+ local num_clients=$(get_node_count ${clients//,/ })
+
+ if at_is_enabled; then
+ TMAX=$(at_max_get mds)
+ else
+ TMAX=$(lctl get_param -n timeout)
+ fi
+ [ $TMAX -gt $((num_clients * 60)) ] || TMAX=$((num_clients * 60))
# 1.Let chance to racer to kill all it's processes
# FIXME: not sure how long does it take for racer to kill all processes
# 80 is sometimes are enough for 2 clients; sometimes it takes more than 150 sec
- while [ $WAIT -lt 90 ]; do
+ while [ $WAIT -lt $TMAX ]; do
running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true)
[ -z "$running" ] && rc=0 && break
echo "clients $clients are still running the racer processes. Waited $WAIT secs"
echo "client $C still running racer processes after $WAIT seconds. Killing $pids"
do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)"
do_node $C kill -TERM $pids || true
- # let processes to be killed
- sleep 2
+ # let processes to be killed, there maybe many threads to be killed, so give 20 sec gap
+ sleep 20
# 3. Check if the processes were killed
# exit error if the processes still exist
for pid in $pids; do
echo $timeout killing RACERPID=$RACERPID
kill $RACERPID || true
sleep 2 # give chance racer to kill it's processes
- do_racer_cleanup
+ local dir
+ for dir in $RDIRS; do
+ do_racer_cleanup $dir
+ done
else
echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
- kill $TIMERPID
- do_racer_cleanup
+ kill $TIMERPID || true
+ for dir in $RDIRS; do
+ do_racer_cleanup $dir
+ done
fi
}
racer_timeout () {
timeout="timeout"
+ RACERPID=$(cat $PIDFILE)
+ rm -f $PIDFILE
racer_cleanup
echo "$0: completed $RC"
- exit $RC
+ return $RC
}
-# run racer
-log "Start racer on clients: $CLIENTS DURATION=$DURATION"
-RC=0
-
+build_test_filter
+check_and_setup_lustre
trap racer_timeout ALRM
-timer_on $((DURATION + 5))
+# run racer
+test_1() {
+ RC=0
+
+ timer_on $((DURATION + 5))
+
+ RACERPID=""
+ for rdir in $RDIRS; do
+ do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" &
+ pid=$!
+ RACERPID="$RACERPID $pid"
+ done
+
+ echo RACERPID=$RACERPID
+ echo $RACERPID > $PIDFILE
+ for rpid in $RACERPID; do
+ wait $rpid
+ rc=$?
+ echo "rpid=$rpid rc=$rc"
+ if [ $rc != 0 ]; then
+ RC=$((RC + 1))
+ fi
+ done
+
+ racer_cleanup
+
+ return $RC
+}
+run_test 1 "racer on clients: $CLIENTS DURATION=$DURATION"
-do_nodes $CLIENTS "DURATION=$DURATION $racer $RDIR" &
-RACERPID=$!
-echo RACERPID=$RACERPID
-wait $RACERPID || RC=2
-racer_cleanup
-echo "$0: completed $RC"
-exit $RC
+equals_msg `basename $0`: test complete, cleaning up
+check_and_cleanup_lustre
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true