From f6796fea971503083308076ce78acfc385271ae4 Mon Sep 17 00:00:00 2001 From: grev Date: Tue, 16 Dec 2008 17:20:53 +0000 Subject: [PATCH] b=17839 i=Brian cmd3-11 ported to acc-sm t-f -- new RECOVERY_MDS_SCALE -- rundbench modified -- new t-f functions, run loads scripts --- lustre/tests/Makefile.am | 2 + lustre/tests/acceptance-small.sh | 11 +- lustre/tests/cfg/ncli.sh | 7 + lustre/tests/recovery-mds-scale.sh | 256 +++++++++++++++++++++++++++++++++++++ lustre/tests/run_dbench.sh | 65 ++++++++++ lustre/tests/run_dd.sh | 62 +++++++++ lustre/tests/run_iozone.sh | 77 +++++++++++ lustre/tests/run_tar.sh | 68 ++++++++++ lustre/tests/rundbench | 19 ++- lustre/tests/test-framework.sh | 116 ++++++++++++++++- 10 files changed, 676 insertions(+), 7 deletions(-) create mode 100644 lustre/tests/recovery-mds-scale.sh create mode 100755 lustre/tests/run_dbench.sh create mode 100755 lustre/tests/run_dd.sh create mode 100755 lustre/tests/run_iozone.sh create mode 100755 lustre/tests/run_tar.sh diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index ebda53f..c9c8800 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -17,6 +17,8 @@ noinst_SCRIPTS += mdsrate-create-large.sh mdsrate-lookup-1dir.sh noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause noinst_SCRIPTS += sanity-sec.sh sanity-gss.sh krb5_login.sh setup_kerberos.sh +noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh +noinst_SCRIPTS += run_dbench.sh nobase_noinst_SCRIPTS = cfg/local.sh nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh nobase_noinst_SCRIPTS += racer/dir_create.sh racer/file_create.sh racer/file_list.sh diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index 7b320d8..7edce6f 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -23,7 +23,7 @@ fi [ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\"" [ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484" -export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY" +export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY RECOVERY_MDS_SCALE" if [ "$ACC_SM_ONLY" ]; then for O in $TESTSUITE_LIST; do @@ -432,6 +432,15 @@ if [ "$PERFORMANCE_SANITY" != "no" ]; then PERFORMANCE_SANITY="done" fi +[ "$SLOW" = no ] && RECOVERY_MDS_SCALE="no" +[ "$RECOVERY_MDS_SCALE" != "no" ] && skip_remmds recovery-mds-scale && RECOVERY_MDS_SCALE=no && MSKIPPED=1 +[ "$RECOVERY_MDS_SCALE" != "no" ] && skip_remost recovery-mds-scale && RECOVERY_MDS_SCALE=no && OSKIPPED=1 +if [ "$RECOVERY_MDS_SCALE" != "no" ]; then + title recovery-mds-scale + bash recovery-mds-scale.sh + RECOVERY_MDS_SCALE="done" +fi + RC=$? 
 title FINISHED
 echo "Finished at `date` in $((`date +%s` - $STARTTIME))s"
diff --git a/lustre/tests/cfg/ncli.sh b/lustre/tests/cfg/ncli.sh
index c583c5f..a3fff9f 100644
--- a/lustre/tests/cfg/ncli.sh
+++ b/lustre/tests/cfg/ncli.sh
@@ -15,3 +15,10 @@ MPIBIN=${MPIBIN:-/testsuite/tests/`arch`/bin}
 export PATH=:$PATH:$MPIBIN
 MPIRUN=$(which mpirun) || true
 MPI_USER=${MPI_USER:-mpiuser}
+
+# for recovery scale tests
+# default boulder cluster iozone location
+export PATH=/opt/iozone/bin:$PATH
+SHARED_DIRECTORY=${SHARED_DIRECTORY:-""} # bug 17839 comment 65
+LOADS="dd tar dbench iozone"
+CLIENT_LOADS=($LOADS)
diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh
new file mode 100644
index 0000000..7440ed5
--- /dev/null
+++ b/lustre/tests/recovery-mds-scale.sh
@@ -0,0 +1,256 @@
+#!/bin/bash
+
+# Was Test 11 in cmd3.
+# For a duration of 24 hours, repeatedly fail over a random MDS at
+# 10-minute intervals and verify that no application errors occur.
+
+# The test runs one of the CLIENT_LOADS programs on each remote client.
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+DEBUGLOG=$TESTSUITELOG.debug
+exec 2>$DEBUGLOG
+echo "--- env ---" >&2
+env >&2
+echo "--- env ---" >&2
+set -x
+
+[ "$SHARED_DIRECTORY" ] || \
+    { skip "$0: Empty SHARED_DIRECTORY" && exit 0; }
+
+[ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; }
+[ $CLIENTCOUNT -ge 3 ] || \
+    { skip "$0 Need two or more clients, have $CLIENTCOUNT" && exit 0; }
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+# The test node needs to be insulated from a Lustre failure as much as
+# possible, so ideally not even the Lustre modules would be loaded here:
+# -- umount lustre
+# -- remove hostname from the clients list
+zconf_umount $(hostname) $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_item_from_list $NODES_TO_USE $(hostname))
+
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+
+MDTS=""
+for ((i=1; i<=$MDSCOUNT; i++)) do
+    MDTS="$MDTS mds$i"
+done
+MDTS=$(comma_list $MDTS)
+
+OSTS=""
+for ((i=1; i<=$OSTCOUNT; i++)) do
+    OSTS="$OSTS ost$i"
+done
+OSTS=$(comma_list $OSTS)
+
+ERRORS_OK="" # No application failures should occur during this test.
+FLAVOR=${FLAVOR:-"MDS"}
+
+rm -f $END_RUN_FILE
+
+vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat
+
+server_numfailovers () {
+    local facet
+    local var
+
+    for facet in $MDTS ${OSTS//,/ }; do
+        var=${facet}_nums
+        val=${!var}
+        if [ "$val" ] ; then
+            echo "$facet failed over $val times"
+        fi
+    done
+}
+
+summary_and_cleanup () {
+
+    local rc=$?
+    local var
+    trap 0
+
+    # A non-empty END_RUN_FILE means that at least one client load failed.
+    if [ -s $END_RUN_FILE ]; then
+        echo "Found the END_RUN_FILE file: $END_RUN_FILE"
+        cat $END_RUN_FILE
+        local END_RUN_NODE=
+        read END_RUN_NODE < $END_RUN_FILE
+
+        # A client load will stop (i.e. fail) when it finds the end run
+        # file.  That does not necessarily mean the load on that client
+        # actually failed; the first node recorded in the END_RUN_FILE
+        # is the one we are really interested in.
+        if [ -n "$END_RUN_NODE" ]; then
+            var=${END_RUN_NODE}_load
+            echo "Client load failed on node $END_RUN_NODE"
+            echo
+            echo "client $END_RUN_NODE load stdout and debug files :
+              ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}
+              ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug"
+        fi
+        rc=1
+    fi
+
+    echo $(date +'%F %H:%M:%S') Terminating client loads ...
+    echo "$0" >> $END_RUN_FILE
+    local result=PASS
+    [ $rc -eq 0 ] || result=FAIL
+
+    log "Duration: $DURATION
+Server failover period: $SERVER_FAILOVER_PERIOD seconds
+Exited after: $ELAPSED seconds
+Number of failovers before exit:
+$(server_numfailovers)
+Status: $result: rc=$rc"
+
+    # stop the vmstats on the OSTs
+    if [ "$VMSTAT" ]; then
+        do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \
+            { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \
+            gzip -f9 $vmstatLOG-\$(hostname); }"
+    fi
+
+    # make sure the client loads die
+    do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \
+        { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }"
+
+    # and free up the pdshes that started them, if any are still around
+    if [ -n "$CLIENT_LOAD_PIDS" ]; then
+        kill $CLIENT_LOAD_PIDS || true
+        sleep 5
+        kill -9 $CLIENT_LOAD_PIDS || true
+    fi
+    [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
+
+    exit $rc
+}
+
+#
+# MAIN
+#
+log "-----============= $0 starting =============-----"
+
+trap summary_and_cleanup EXIT INT
+
+DURATION=${DURATION:-$((60*60*24))}
+ELAPSED=0
+NUM_FAILOVERS=0
+
+# vmstat the OSTs
+if [ "$VMSTAT" ]; then
+    do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null </dev/null & echo \$! > /tmp/vmstat.pid"
+fi
+
+# Start client loads.
+start_client_loads $NODES_TO_USE
+
+echo client load pids:
+if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $LOAD_PID_FILE"; then
+    if [ -e $DEBUGLOG ]; then
+        exec 2<&-
+        cat $DEBUGLOG
+        exit 3
+    fi
+fi
+
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
+
+if [ "$FLAVOR" == "MDS" ]; then
+    SERVER_FAILOVER_PERIOD=$MDS_FAILOVER_PERIOD
+    SERVERS=$MDTS
+else
+    SERVER_FAILOVER_PERIOD=$OSS_FAILOVER_PERIOD
+    SERVERS=$OSTS
+fi
+
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
+
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug 17839 comment 62
+REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
+reqfail=0
+sleep=0
+while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+
+    # In order to perform the expected number of failovers, we need to
+    # account for the following:
+    # 1) the time that has elapsed during the client load checking
+    # 2) the time it takes to perform a failover
+
+    it_time_start=$(date +%s)
+
+    SERVERFACET=$(get_random_entry $SERVERS)
+    var=${SERVERFACET}_nums
+
+    # Check that our client loads are still running. If any have died,
+    # that means they have died outside of recovery, which is unacceptable.
+
+    log "==== Checking the client loads BEFORE failover -- failure NOT OK \
+         ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+
+    if ! check_client_loads $NODES_TO_USE; then
+        exit 4
+    fi
+
+    log "Starting failover on $SERVERFACET"
+
+    facet_failover "$SERVERFACET" || exit 1
+
+    # Check that our client loads are still running during failover.
+    # No application failures should occur.
+
+    log "==== Checking the client loads AFTER failover -- failure NOT OK"
+    if ! check_client_loads $NODES_TO_USE; then
+        log "Client load failed during failover. Exiting"
+        exit 5
+    fi
+
+    # Increment the number of failovers
+    NUM_FAILOVERS=$((NUM_FAILOVERS+1))
+    val=$((${!var} + 1))
+    eval $var=$val
+
+    CURRENT_TS=$(date +%s)
+    ELAPSED=$((CURRENT_TS - START_TS))
+
+    sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
+
+    # Keep count of the number of iterations when the time spent on
+    # failover plus the two client load checks exceeded
+    # ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
+    if [ $sleep -lt $MINSLEEP ]; then
+        reqfail=$((reqfail +1))
+        log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP!
+Failed to meet interval $reqfail times ( REQFAIL=$REQFAIL ); have sleep=$sleep"
+        [ $reqfail -gt $REQFAIL ] && exit 6
+    fi
+
+    log "$SERVERFACET has failed over ${!var} times, and counting..."
+    if [ $sleep -gt 0 ]; then
+        echo "sleeping $sleep seconds ... "
+        sleep $sleep
+    fi
+done
+
+exit 0
diff --git a/lustre/tests/run_dbench.sh b/lustre/tests/run_dbench.sh
new file mode 100755
index 0000000..f82d9dd
--- /dev/null
+++ b/lustre/tests/run_dbench.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+    echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+    exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+    trap 0
+    echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+    kill $load_pid
+    kill -TERM -$PPID
+    sleep 5
+    kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/dbench-$(hostname)
+
+CONTINUE=true
+
+while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+    echoerr "$(date +'%F %H:%M:%S'): dbench run starting"
+
+    mkdir -p $TESTDIR
+    rundbench -D $TESTDIR 2 1>$LOG &
+    load_pid=$!
+
+    wait $load_pid
+    if [ ${PIPESTATUS[0]} -eq 0 ]; then
+        echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
+        cd $TMP
+        rm -rf $TESTDIR
+        echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
+    else
+        echoerr "$(date +'%F %H:%M:%S'): dbench failed"
+        if [ -z "$ERRORS_OK" ]; then
+            echo $(hostname) >> $END_RUN_FILE
+        fi
+        if [ $BREAK_ON_ERROR ]; then
+            # break
+            CONTINUE=false
+        fi
+    fi
+done
+
+echoerr "$(date +'%F %H:%M:%S'): dbench run exiting"
diff --git a/lustre/tests/run_dd.sh b/lustre/tests/run_dd.sh
new file mode 100755
index 0000000..96a4950
--- /dev/null
+++ b/lustre/tests/run_dd.sh
@@ -0,0 +1,62 @@
+#!/bin/bash
+set -x
+
+TMP=${TMP:-/tmp}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale}
+LOG=${TESTSUITELOG}_$(basename $0)-$(hostname)
+DEBUGLOG=${LOG}.debug
+
+mkdir -p ${LOG%/*}
+
+rm -f $LOG $DEBUGLOG
+exec 2>$DEBUGLOG
+
+if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then
+    echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE"
+    exit 1
+fi
+
+echoerr () { echo "$@" 1>&2 ; }
+
+signaled() {
+    echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
+    kill -TERM -$PPID
+    sleep 5
+    kill -KILL -$PPID
+}
+
+trap signaled TERM
+
+# recovery-mds-scale uses this to signal the client loads to die
+echo $$ >$LOAD_PID_FILE
+
+TESTDIR=$MOUNT/dd-$(hostname)
+
+CONTINUE=true
+while [ !
-e "$END_RUN_FILE" ] && $CONTINUE; do + echoerr "$(date +'%F %H:%M:%S'): dd run starting" + mkdir -p $TESTDIR + cd $TESTDIR + dd bs=4k count=1000000 if=/dev/zero of=$TESTDIR/dd-file 1>$LOG & + load_pid=$! + wait $load_pid + + if [ $? -eq 0 ]; then + echoerr "$(date +'%F %H:%M:%S'): dd succeeded" + cd $TMP + rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): dd run finished" + else + echoerr "$(date +'%F %H:%M:%S'): dd failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break + CONTINUE=false + fi + fi +done + +echoerr "$(date +'%F %H:%M:%S'): dd run exiting" diff --git a/lustre/tests/run_iozone.sh b/lustre/tests/run_iozone.sh new file mode 100755 index 0000000..2b71118 --- /dev/null +++ b/lustre/tests/run_iozone.sh @@ -0,0 +1,77 @@ +#!/bin/bash +set -x + +TMP=${TMP:-/tmp} + +TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} +LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) +DEBUGLOG=${LOG}.debug + +mkdir -p ${LOG%/*} + +rm -f $LOG $DEBUGLOG +exec 2>$DEBUGLOG + +if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then + echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE" + exit 1 +fi + +echoerr () { echo "$@" 1>&2 ; } + +signaled() { + echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate" + kill -TERM -$PPID + sleep 5 + kill -KILL -$PPID +} + +trap signaled TERM + +# recovery-mds-scale uses this to signal the client loads to die +echo $$ >$LOAD_PID_FILE + +TESTDIR=$MOUNT/iozone-$(hostname) + +# needed to debug oom problem +#echo 1 > /proc/sys/vm/vm_gfp_debug +#killpids="" +#vmstat 1 1000000 >$TMP/iozone.vmstat.out & +#killpids="$killpids $!" +#$LUSTRE_TESTS/runvmstat > $TMP/iozone.runvmstat.out & +#killpids="$killpids $!" + +CONTINUE=true +while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do + echoerr "$(date +'%F %H:%M:%S'): iozone run starting" + mkdir -p $TESTDIR + cd $TESTDIR + iozone -a -M -R -V 0xab -g 100M -q 512k -i0 -i1 -f $TESTDIR/iozone-file 1>$LOG & + load_pid=$! 
+ wait $load_pid + if [ ${PIPESTATUS[0]} -eq 0 ]; then + echoerr "$(date +'%F %H:%M:%S'): iozone succeeded" + cd $TMP + rm -rf $TESTDIR + if [ -d $TESTDIR ]; then + echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR" + echo $(hostname) >> $END_RUN_FILE + CONTINUE=false + fi + echoerr "$(date +'%F %H:%M:%S'): iozone run finished" + else + echoerr "$(date +'%F %H:%M:%S'): iozone failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break + CONTINUE=false + fi + fi +done + +echoerr "$(date +'%F %H:%M:%S'): iozone run exiting" +#kill $killpids +#sleep 5 +#kill -9 $killpids diff --git a/lustre/tests/run_tar.sh b/lustre/tests/run_tar.sh new file mode 100755 index 0000000..7502c241 --- /dev/null +++ b/lustre/tests/run_tar.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -x + +TMP=${TMP:-/tmp} + +TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} +LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) +DEBUGLOG=${LOG}.debug + +mkdir -p ${LOG%/*} + +rm -f $LOG $DEBUGLOG +exec 2>$DEBUGLOG + +if [ -z "$MOUNT" -o -z "$END_RUN_FILE" -o -z "$LOAD_PID_FILE" ]; then + echo "The following must be set: MOUNT END_RUN_FILE LOAD_PID_FILE" + exit 1 +fi + +echoerr () { echo "$@" 1>&2 ; } + +signaled() { + echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate" + kill -TERM -$PPID + sleep 5 + kill -KILL -$PPID +} + +trap signaled TERM + +# recovery-mds-scale uses this to signal the client loads to die +echo $$ >$LOAD_PID_FILE + +TESTDIR=$MOUNT/tar-$(hostname) + +CONTINUE=true +while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do + echoerr "$(date +'%F %H:%M:%S'): tar run starting" + mkdir -p $TESTDIR + cd $TESTDIR + tar cf - /etc | tar xf - 2>&1 | tee $LOG & + load_pid=$! +ps -e f -o "pid ppid pgrp comm" >$TMP/client-load.ps-list + wait $load_pid + RC=${PIPESTATUS[0]} + PREV_ERRORS=$(grep "exit delayed from previous errors" $LOG) || true + if [ $RC -ne 0 -a "$ERRORS_OK" -a "$PREV_ERRORS" ]; then + echoerr "$(date +'%F %H:%M:%S'): tar errors earlier, ignoring" + RC=0 + fi + if [ $RC -eq 0 ]; then + echoerr "$(date +'%F %H:%M:%S'): tar succeeded" + cd $TMP + rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): tar run finished" + else + echoerr "$(date +'%F %H:%M:%S'): tar failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break + CONTINUE=false + fi + fi +done + +echoerr "$(date +'%F %H:%M:%S'): tar run exiting" diff --git a/lustre/tests/rundbench b/lustre/tests/rundbench index fb21863..c3fa9cb 100755 --- a/lustre/tests/rundbench +++ b/lustre/tests/rundbench @@ -27,7 +27,11 @@ mkdir -p $DIR TGT=$DIR/client.txt CLIENT_PREFIX="${DBENCH_LIB} /usr/share/dbench /usr/local/share /usr/lib/dbench" CLIENT_FILE="client.txt client_plain.txt dbench_client" -which dbench > /dev/null 2>&1 || { skip "$0: dbench not installed" && exit 0; } +if ! which dbench > /dev/null 2>&1 ; then + [ "$MISSING_DBENCH_OK" ] || { error "dbench is not installed !" && exit 3; } + skip "$0: dbench is not installed" + exit 0 +fi CLIENT="" for prefix in $CLIENT_PREFIX; do @@ -65,9 +69,20 @@ fi shift $((OPTIND - 1)) +trap ' +echo kill dbench main pid=$DBENCHPID +kill $DBENCHPID +rm -rf dbench $LIBS71 client.txt +exit 0 +' TERM + cd $DIR echo "running 'dbench $@' $PREFIX $PWD at `date`" -$RUN dbench -c client.txt $@ + +$RUN dbench -c client.txt $@ & +DBENCHPID=$! +echo "dbench PID=$DBENCHPID" +wait $DBENCHPID RC=$? 
[ $RC -ne 0 ] && killall -9 dbench diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 440887e..4f19d4f 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -580,7 +580,7 @@ zconf_umount_clients() { } shutdown_facet() { - facet=$1 + local facet=$1 if [ "$FAILURE_MODE" = HARD ]; then $POWER_DOWN `facet_active_host $facet` sleep 2 @@ -605,6 +605,92 @@ boot_node() { fi } +# recovery-scale functions +check_progs_installed () { + local clients=$1 + shift + local progs=$@ + + do_nodes $clients "set -x ; PATH=:$PATH status=true; for prog in $progs; do + which \\\$prog || { echo \\\$prog missing on \\\$(hostname) && status=false; } + done; + eval \\\$status" +} + +start_client_load() { + local list=(${1//,/ }) + local nodenum=$2 + + local numloads=${#CLIENT_LOADS[@]} + local testnum=$((nodenum % numloads)) + + do_node ${list[nodenum]} "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \ + BREAK_ON_ERROR=$BREAK_ON_ERROR \ + END_RUN_FILE=$END_RUN_FILE \ + LOAD_PID_FILE=$LOAD_PID_FILE \ + TESTSUITELOG=$TESTSUITELOG \ + run_${CLIENT_LOADS[testnum]}.sh" & + CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!" + log "Started client load: ${CLIENT_LOADS[testnum]} on ${list[nodenum]}" + + eval export ${list[nodenum]}_load=${CLIENT_LOADS[testnum]} + return 0 +} + +start_client_loads () { + local clients=(${1//,/ }) + + for ((num=0; num < ${#clients[@]}; num++ )); do + start_client_load $1 $num + done +} + +# only for remote client +check_client_load () { + local client=$1 + local var=${client}_load + + local TESTLOAD=run_${!var}.sh + + ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1 + + check_catastrophe $client || return 2 + + # see if the load is still on the client + local tries=3 + local RC=254 + while [ $RC = 254 -a $tries -gt 0 ]; do + let tries=$tries-1 + # assume success + RC=0 + if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then + RC=${PIPESTATUS[0]} + sleep 30 + fi + done + if [ $RC = 254 ]; then + echo "got a return status of $RC from do_node while checking (i.e. with 'ps') the client load on the remote system" + # see if we can diagnose a bit why this is + fi + + return $RC +} +check_client_loads () { + local clients=${1//,/ } + local client= + local rc=0 + + for client in $clients; do + check_client_load $client + rc=$? 
+ if [ "$rc" != 0 ]; then + log "Client load failed on node $client, rc=$rc" + return $rc + fi + done +} +# End recovery-scale functions + # verify that lustre actually cleaned up properly cleanup_check() { [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \ @@ -1403,6 +1489,16 @@ comma_list() { echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g' } +# list is comma separated list +exclude_item_from_list () { + local list=$1 + local excluded=$2 + + list=${list//,/ } + list=$(echo " $list " | sed -re "s/\s+$excluded\s+/ /g") + echo $(comma_list $list) +} + absolute_path() { (cd `dirname $1`; echo $PWD/`basename $1`) } @@ -1982,6 +2078,18 @@ init_clients_lists () { CLIENTCOUNT=$((${#remoteclients[@]} + 1)) } +get_random_entry () { + local rnodes=$1 + + rnodes=${rnodes//,/ } + + local nodes=($rnodes) + local num=${#nodes[@]} + local i=$((RANDOM * num / 65536)) + + echo ${nodes[i]} +} + is_patchless () { lctl get_param version | grep -q patchless @@ -2156,11 +2264,11 @@ restore_lustre_params() { } check_catastrophe () { - local rnodes=$(comma_list $(remote_nodes_list)) + local rnodes=${1:-$(comma_list $(remote_nodes_list))} - [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && return 1 + [ -f $CATASTROPHE ] && [ $(cat $CATASTROPHE) -ne 0 ] && return 1 if [ $rnodes ]; then - do_nodes $rnodes "[ -f $CATASTROPHE ] && { [ \`cat $CATASTROPHE\` -eq 0 ] || false; } || true" + do_nodes $rnodes "set -x; [ -f $CATASTROPHE ] && { [ \`cat $CATASTROPHE\` -eq 0 ] || false; } || true" fi } -- 1.8.3.1
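A brief usage sketch for reviewers, not part of the patch itself: the invocation below is hypothetical and only illustrates the knobs the new suite reads. The shared directory path, the shortened DURATION and SERVER_FAILOVER_PERIOD values, and the choice of NAME=ncli are placeholders; the script's own defaults are 24 hours and 10 minutes, and the remote client list is expected to come from the environment or the multi-client config (cfg/ncli.sh).

    cd lustre/tests

    # Hypothetical direct run. SHARED_DIRECTORY must be visible to every
    # client node (bug 17839 comment 65), and the config must provide two
    # or more remote clients (CLIENTCOUNT >= 3 including the local node).
    NAME=ncli \
    SHARED_DIRECTORY=/export/shared \
    DURATION=$((60 * 60)) \
    SERVER_FAILOVER_PERIOD=300 \
    bash recovery-mds-scale.sh

    # Or let acceptance-small.sh drive it; the suite is skipped when SLOW=no.
    SLOW=yes ACC_SM_ONLY=RECOVERY_MDS_SCALE bash acceptance-small.sh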