noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh
noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause
noinst_SCRIPTS += sanity-sec.sh sanity-gss.sh krb5_login.sh setup_kerberos.sh
+noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
+noinst_SCRIPTS += run_dbench.sh
nobase_noinst_SCRIPTS = cfg/local.sh
nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
nobase_noinst_SCRIPTS += racer/dir_create.sh racer/file_create.sh racer/file_list.sh
[ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\""
[ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484"
-export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY"
+export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY RECOVERY_MDS_SCALE"
if [ "$ACC_SM_ONLY" ]; then
for O in $TESTSUITE_LIST; do
PERFORMANCE_SANITY="done"
fi
+[ "$SLOW" = no ] && RECOVERY_MDS_SCALE="no"
+[ "$RECOVERY_MDS_SCALE" != "no" ] && skip_remmds recovery-mds-scale && RECOVERY_MDS_SCALE=no && MSKIPPED=1
+[ "$RECOVERY_MDS_SCALE" != "no" ] && skip_remost recovery-mds-scale && RECOVERY_MDS_SCALE=no && OSKIPPED=1
+if [ "$RECOVERY_MDS_SCALE" != "no" ]; then
+ title recovery-mds-scale
+ bash recovery-mds-scale.sh
+ RECOVERY_MDS_SCALE="done"
+fi
+
RC=$?
title FINISHED
echo "Finished at `date` in $((`date +%s` - $STARTTIME))s"
export PATH=:$PATH:$MPIBIN
MPIRUN=$(which mpirun) || true
MPI_USER=${MPI_USER:-mpiuser}
+
+# for recovery scale tests
+# default boulder cluster iozone location
+export PATH=/opt/iozone/bin:$PATH
+SHARED_DIRECTORY=${SHARED_DIRECTORY:-""} # bug 17839 comment 65
+LOADS="dd tar dbench iozone"
+CLIENT_LOADS=($LOADS)
--- /dev/null
+#!/bin/bash
+
+# Was Test 11 in cmd3.
+# For a duration of 24 hours, repeatedly fail over a random MDS at
+# 10-minute intervals and verify that no application errors occur.
+
+# The test runs one of the CLIENT_LOADS programs on each remote client.
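+#
+# A typical invocation (illustrative values, not defaults) looks like:
+#   SHARED_DIRECTORY=/nfs/shared CLIENTS=client1,client2,client3 \
+#       DURATION=3600 MDS_FAILOVER_PERIOD=600 bash recovery-mds-scale.sh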
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+DEBUGLOG=$TESTSUITELOG.debug
+exec 2>$DEBUGLOG
+echo "--- env ---" >&2
+env >&2
+echo "--- env ---" >&2
+set -x
+
+[ "$SHARED_DIRECTORY" ] || \
+ { skip "$0: Empty SHARED_DIRECTORY" && exit 0; }
+
+[ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; }
+[ $CLIENTCOUNT -ge 3 ] || \
+ { skip "$0 Need two or more clients, have $CLIENTCOUNT" && exit 0; }
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+# The test node needs to be insulated from Lustre failures as much as
+# possible, so ideally it would not even load the Lustre modules:
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $(hostname) $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_item_from_list $NODES_TO_USE $(hostname))
+
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+
+MDTS=""
+for ((i=1; i<=$MDSCOUNT; i++)) do
+ MDTS="$MDTS mds$i"
+done
+MDTS=$(comma_list $MDTS)
+
+OSTS=""
+for ((i=1; i<=$OSTCOUNT; i++)) do
+ OSTS="$OSTS ost$i"
+done
+OSTS=$(comma_list $OSTS)
+
+ERRORS_OK="" # No application failures should occur during this test.
+FLAVOR=${FLAVOR:-"MDS"}
+
+rm -f $END_RUN_FILE
+
+vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat
+
+server_numfailovers () {
+    local facet
+    local var
+    local val
+
+ for facet in $MDTS ${OSTS//,/ }; do
+ var=${facet}_nums
+ val=${!var}
+ if [ "$val" ] ; then
+ echo "$facet failed over $val times"
+ fi
+ done
+}
+
+summary_and_cleanup () {
+
+ local rc=$?
+ local var
+ trap 0
+
+    # A non-empty END_RUN_FILE means that at least one client load failed
+    if [ -s $END_RUN_FILE ]; then
+        echo "Found the end run file $END_RUN_FILE:"
+ cat $END_RUN_FILE
+ local END_RUN_NODE=
+ read END_RUN_NODE < $END_RUN_FILE
+
+        # A client load will stop (i.e. fail) when it finds the end run
+        # file. That does not necessarily mean the load on that client
+        # actually failed; the first node recorded in END_RUN_FILE is the
+        # one we are really interested in.
+ if [ -n "$END_RUN_NODE" ]; then
+ var=${END_RUN_NODE}_load
+ echo "Client load failed on node $END_RUN_NODE"
+ echo
+ echo "client $END_RUN_NODE load stdout and debug files :
+ ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}
+ ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug"
+ fi
+ rc=1
+ fi
+
+    echo $(date +'%F %H:%M:%S') Terminating client loads ...
+ echo "$0" >> $END_RUN_FILE
+ local result=PASS
+ [ $rc -eq 0 ] || result=FAIL
+
+ log "Duraion: $DURATION
+Server failover period: $SERVER_FAILOVER_PERIOD seconds
+Exited after: $ELAPSED seconds
+Number of failovers before exit:
+$(server_numfailovers)
+Status: $result: rc=$rc"
+
+ # stop the vmstats on the OSTs
+ if [ "$VMSTAT" ]; then
+ do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \
+ { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \
+ gzip -f9 $vmstatLOG-\$(hostname); }"
+ fi
+
+ # make sure the client loads die
+ do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \
+ { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }"
+
+ # and free up the pdshes that started them, if any are still around
+ if [ -n "$CLIENT_LOAD_PIDS" ]; then
+ kill $CLIENT_LOAD_PIDS || true
+ sleep 5
+ kill -9 $CLIENT_LOAD_PIDS || true
+ fi
+ [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
+
+ exit $rc
+}
+
+#
+# MAIN
+#
+log "-----============= $0 starting =============-----"
+
+trap summary_and_cleanup EXIT INT
+
+DURATION=${DURATION:-$((60*60*24))}
+ELAPSED=0
+NUM_FAILOVERS=0
+
+# vmstat the osts
+if [ "$VMSTAT" ]; then
+ do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null </dev/null & echo \$! > /tmp/vmstat.pid"
+fi
+
+# Start client loads.
+start_client_loads $NODES_TO_USE
+
+echo client load pids:
+if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $LOAD_PID_FILE"; then
+ if [ -e $DEBUGLOG ]; then
+ exec 2<&-
+ cat $DEBUGLOG
+ exit 3
+ fi
+fi
+
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
+
+if [ "$FLAVOR" == "MDS" ]; then
+ SERVER_FAILOVER_PERIOD=$MDS_FAILOVER_PERIOD
+ SERVERS=$MDTS
+else
+ SERVER_FAILOVER_PERIOD=$OSS_FAILOVER_PERIOD
+ SERVERS=$OSTS
+fi
+
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
+
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
+REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
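+# e.g. with the defaults DURATION=86400, SERVER_FAILOVER_PERIOD=600 and
+# REQFAIL_PERCENT=3: 86400 / 600 * 3 / 100 = 4 missed intervals allowed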
+reqfail=0
+sleep=0
+while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+
+    # To perform the expected number of failovers we need to account for:
+    # 1) the time elapsed while checking the client loads
+    # 2) the time a failover takes
+
+ it_time_start=$(date +%s)
+
+ SERVERFACET=$(get_random_entry $SERVERS)
+ var=${SERVERFACET}_nums
+
+ # Check that our client loads are still running. If any have died,
+ # that means they have died outside of recovery, which is unacceptable.
+
+ log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
+ ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+
+ if ! check_client_loads $NODES_TO_USE; then
+ exit 4
+ fi
+
+ log "Starting failover on $SERVERNODE"
+
+ facet_failover "$SERVERFACET" || exit 1
+
+ # Check that our client loads are still running during failover.
+ # No application failures should occur.
+
+ log "==== Checking the clients loads AFTER failover -- failure NOT OK"
+ if ! check_client_loads $NODES_TO_USE; then
+ log "Client load failed during failover. Exiting"
+ exit 5
+ fi
+
+ # Increment the number of failovers
+ NUM_FAILOVERS=$((NUM_FAILOVERS+1))
+ val=$((${!var} + 1))
+ eval $var=$val
+
+ CURRENT_TS=$(date +%s)
+ ELAPSED=$((CURRENT_TS - START_TS))
+
+ sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
+
+    # Count the iterations in which the failover plus the two client load
+    # checks took longer than (SERVER_FAILOVER_PERIOD - MINSLEEP).
+ if [ $sleep -lt $MINSLEEP ]; then
+ reqfail=$((reqfail +1))
+ log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
+Failed to meet interval $reqfail times ( REQFAIL=$REQFAIL ); have sleep=$sleep"
+ [ $reqfail -gt $REQFAIL ] && exit 6
+ fi
+
+ log "$SERVERFACET has failed over ${!var} times, and counting..."
+ if [ $sleep -gt 0 ]; then
+ echo "sleeping $sleep seconds ... "
+ sleep $sleep
+ fi
+done
+
+exit 0
--- /dev/null
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+ echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+ exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+ trap 0
+ echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+ kill $load_pid
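+    # a negative pid addresses the whole process group, taking down any
+    # children the load has spawned along with it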
+ kill -TERM -$PPID
+ sleep 5
+ kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/dbench-$(hostname)
+
+CONTINUE=true
+
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+ echoerr "$(date +'%F %H:%M:%S'): dbench run starting"
+
+ mkdir -p $TESTDIR
+ rundbench -D $TESTDIR 2 1>$LOG &
+ load_pid=$!
+
+ wait $load_pid
+ if [ ${PIPESTATUS[0]} -eq 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
+ else
+ echoerr "$(date +'%F %H:%M:%S'): dbench failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
+ CONTINUE=false
+ fi
+ fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): dbench run exiting"
--- /dev/null
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+ echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+ exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+ echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+ kill -TERM -$PPID
+ sleep 5
+ kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/dd-$(hostname)
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+ echoerr "$(date +'%F %H:%M:%S'): dd run starting"
+ mkdir -p $TESTDIR
+ cd $TESTDIR
+ dd bs=4k count=1000000 if=/dev/zero of=$TESTDIR/dd-file 1>$LOG &
+ load_pid=$!
+ wait $load_pid
+
+ if [ $? -eq 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): dd run finished"
+ else
+ echoerr "$(date +'%F %H:%M:%S'): dd failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
+ CONTINUE=false
+ fi
+ fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): dd run exiting"
--- /dev/null
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+ echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+ exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+ echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+ kill -TERM -$PPID
+ sleep 5
+ kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/iozone-$(hostname)
+
+# needed to debug oom problem
+#echo 1 > /proc/sys/vm/vm_gfp_debug
+#killpids=""
+#vmstat 1 1000000 >$TMP/iozone.vmstat.out &
+#killpids="$killpids $!"
+#$LUSTRE_TESTS/runvmstat > $TMP/iozone.runvmstat.out &
+#killpids="$killpids $!"
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+ echoerr "$(date +'%F %H:%M:%S'): iozone run starting"
+ mkdir -p $TESTDIR
+ cd $TESTDIR
+ iozone -a -M -R -V 0xab -g 100M -q 512k -i0 -i1 -f $TESTDIR/iozone-file 1>$LOG &
+ load_pid=$!
+ wait $load_pid
+ if [ ${PIPESTATUS[0]} -eq 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ if [ -d $TESTDIR ]; then
+ echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
+ CONTINUE=false
+ fi
+ echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
+ else
+ echoerr "$(date +'%F %H:%M:%S'): iozone failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
+ CONTINUE=false
+ fi
+ fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): iozone run exiting"
+#kill $killpids
+#sleep 5
+#kill -9 $killpids
--- /dev/null
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+ echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+ exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+ echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+ kill -TERM -$PPID
+ sleep 5
+ kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/tar-$(hostname)
+
+CONTINUE=true
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+ echoerr "$(date +'%F %H:%M:%S'): tar run starting"
+ mkdir -p $TESTDIR
+ cd $TESTDIR
+ tar cf - /etc | tar xf - 2>&1 | tee $LOG &
+ load_pid=$!
+    ps -e f -o "pid ppid pgrp comm" >$TMP/client-load.ps-list
+ wait $load_pid
+ RC=${PIPESTATUS[0]}
+ PREV_ERRORS=$(grep "exit delayed from previous errors" $LOG) || true
+ if [ $RC -ne 0 -a "$ERRORS_OK" -a "$PREV_ERRORS" ]; then
+ echoerr "$(date +'%F %H:%M:%S'): tar errors earlier, ignoring"
+ RC=0
+ fi
+ if [ $RC -eq 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): tar run finished"
+ else
+ echoerr "$(date +'%F %H:%M:%S'): tar failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
+ CONTINUE=false
+ fi
+ fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): tar run exiting"
TGT=$DIR/client.txt
CLIENT_PREFIX="${DBENCH_LIB} /usr/share/dbench /usr/local/share /usr/lib/dbench"
CLIENT_FILE="client.txt client_plain.txt dbench_client"
-which dbench > /dev/null 2>&1 || { skip "$0: dbench not installed" && exit 0; }
+if ! which dbench > /dev/null 2>&1 ; then
+ [ "$MISSING_DBENCH_OK" ] || { error "dbench is not installed !" && exit 3; }
+ skip "$0: dbench is not installed"
+ exit 0
+fi
CLIENT=""
for prefix in $CLIENT_PREFIX; do
shift $((OPTIND - 1))
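+# On TERM (sent, e.g., when the recovery-scale client load is torn down),
+# kill the backgrounded dbench and clean up its working files.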
+trap '
+echo kill dbench main pid=$DBENCHPID
+kill $DBENCHPID
+rm -rf dbench $LIBS71 client.txt
+exit 0
+' TERM
+
cd $DIR
echo "running 'dbench $@' $PREFIX $PWD at `date`"
-$RUN dbench -c client.txt $@
+
+$RUN dbench -c client.txt $@ &
+DBENCHPID=$!
+echo "dbench PID=$DBENCHPID"
+wait $DBENCHPID
RC=$?
[ $RC -ne 0 ] && killall -9 dbench
}
shutdown_facet() {
- facet=$1
+ local facet=$1
if [ "$FAILURE_MODE" = HARD ]; then
$POWER_DOWN `facet_active_host $facet`
sleep 2
fi
}
+# recovery-scale functions
+check_progs_installed () {
+ local clients=$1
+ shift
+ local progs=$@
+
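+    # the extra backslashes defer expansion of $prog, $status and
+    # $(hostname) to the remote node rather than the local shell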
+ do_nodes $clients "set -x ; PATH=:$PATH status=true; for prog in $progs; do
+ which \\\$prog || { echo \\\$prog missing on \\\$(hostname) && status=false; }
+ done;
+ eval \\\$status"
+}
+
+start_client_load() {
+ local list=(${1//,/ })
+ local nodenum=$2
+
+ local numloads=${#CLIENT_LOADS[@]}
+ local testnum=$((nodenum % numloads))
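+    # loads are handed out round-robin: with LOADS="dd tar dbench iozone",
+    # node 0 gets run_dd.sh, node 1 run_tar.sh, node 4 run_dd.sh again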
+
+ do_node ${list[nodenum]} "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
+ BREAK_ON_ERROR=$BREAK_ON_ERROR \
+ END_RUN_FILE=$END_RUN_FILE \
+ LOAD_PID_FILE=$LOAD_PID_FILE \
+ TESTSUITELOG=$TESTSUITELOG \
+ run_${CLIENT_LOADS[testnum]}.sh" &
+ CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!"
+ log "Started client load: ${CLIENT_LOADS[testnum]} on ${list[nodenum]}"
+
+ eval export ${list[nodenum]}_load=${CLIENT_LOADS[testnum]}
+ return 0
+}
+
+start_client_loads () {
+ local clients=(${1//,/ })
+
+ for ((num=0; num < ${#clients[@]}; num++ )); do
+ start_client_load $1 $num
+ done
+}
+
+# only for remote clients
+check_client_load () {
+ local client=$1
+ local var=${client}_load
+
+ local TESTLOAD=run_${!var}.sh
+
+ ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
+
+ check_catastrophe $client || return 2
+
+ # see if the load is still on the client
+ local tries=3
+ local RC=254
+ while [ $RC = 254 -a $tries -gt 0 ]; do
+ let tries=$tries-1
+ # assume success
+ RC=0
+ if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then
+ RC=${PIPESTATUS[0]}
+ sleep 30
+ fi
+ done
+ if [ $RC = 254 ]; then
+ echo "got a return status of $RC from do_node while checking (i.e. with 'ps') the client load on the remote system"
+ # see if we can diagnose a bit why this is
+ fi
+
+ return $RC
+}
+check_client_loads () {
+ local clients=${1//,/ }
+ local client=
+ local rc=0
+
+ for client in $clients; do
+ check_client_load $client
+ rc=$?
+ if [ "$rc" != 0 ]; then
+ log "Client load failed on node $client, rc=$rc"
+ return $rc
+ fi
+ done
+}
+# End recovery-scale functions
+
# verify that lustre actually cleaned up properly
cleanup_check() {
[ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \
echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g'
}
+# list is comma separated list
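+# e.g. exclude_item_from_list "c1,c2,c3" c2 prints "c1,c3"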
+exclude_item_from_list () {
+ local list=$1
+ local excluded=$2
+
+ list=${list//,/ }
+ list=$(echo " $list " | sed -re "s/\s+$excluded\s+/ /g")
+ echo $(comma_list $list)
+}
+
absolute_path() {
(cd `dirname $1`; echo $PWD/`basename $1`)
}
CLIENTCOUNT=$((${#remoteclients[@]} + 1))
}
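+# pick one random entry from a comma-separated list,
+# e.g. get_random_entry mds1,mds2,mds3 prints one of the three facets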
+get_random_entry () {
+ local rnodes=$1
+
+ rnodes=${rnodes//,/ }
+
+ local nodes=($rnodes)
+ local num=${#nodes[@]}
+    local i=$((RANDOM % num))   # RANDOM is 0..32767; modulo gives 0..num-1
+
+ echo ${nodes[i]}
+}
+
is_patchless ()
{
lctl get_param version | grep -q patchless
}
check_catastrophe () {
- local rnodes=$(comma_list $(remote_nodes_list))
+ local rnodes=${1:-$(comma_list $(remote_nodes_list))}
- [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && return 1
+ [ -f $CATASTROPHE ] && [ $(cat $CATASTROPHE) -ne 0 ] && return 1
if [ $rnodes ]; then
- do_nodes $rnodes "[ -f $CATASTROPHE ] && { [ \`cat $CATASTROPHE\` -eq 0 ] || false; } || true"
+ do_nodes $rnodes "set -x; [ -f $CATASTROPHE ] && { [ \`cat $CATASTROPHE\` -eq 0 ] || false; } || true"
fi
}