From 36459a19d3688ae38847f36d896326be92e7dd38 Mon Sep 17 00:00:00 2001 From: Yu Jian Date: Wed, 11 Apr 2012 16:23:34 +0800 Subject: [PATCH] LU-734 tests: add sub-tests into recovery-*-scale tests This patch adds sub-tests into the recovery-*-scale tests so that test results and logs could be gathered properly and uploaded to Maloo. The patch also does some cleanup works on the test scripts and moves some common functions into test-framework.sh. Signed-off-by: Yu Jian Change-Id: Ife174285d182ad5a2d4823767ca59df5a10b4aa4 Reviewed-on: http://review.whamcloud.com/2509 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Minh Diep Reviewed-by: Cliff White Reviewed-by: Oleg Drokin --- lustre/tests/cfg/local.sh | 1 - lustre/tests/recovery-double-scale.sh | 339 ++++++++++++++--------------- lustre/tests/recovery-mds-scale.sh | 335 +++++++++++++---------------- lustre/tests/recovery-random-scale.sh | 386 +++++++++++++++------------------- lustre/tests/run_IOR.sh | 33 +-- lustre/tests/run_dbench.sh | 33 +-- lustre/tests/run_dd.sh | 33 +-- lustre/tests/run_iozone.sh | 37 ++-- lustre/tests/run_tar.sh | 35 +-- lustre/tests/rundbench | 10 +- lustre/tests/test-framework.sh | 138 +++++++++--- 11 files changed, 671 insertions(+), 709 deletions(-) diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index 2364b49..e376160 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -129,7 +129,6 @@ FAIL_ON_ERROR=${FAIL_ON_ERROR:-true} MPIRUN=$(which mpirun 2>/dev/null) || true MPI_USER=${MPI_USER:-mpiuser} -SHARED_DIR_LOGS=${SHARED_DIR_LOGS:-""} # This is used by a small number of tests to share state between the client # running the tests, or in some cases between the servers (e.g. lfsck.sh). diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index 100b054..8a1cdd9 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -1,4 +1,5 @@ #!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: # All pairwise combinations of node failures. # Was cmd3-17 @@ -8,98 +9,73 @@ # Script fails pair of nodes: # -- in parallel by default # -- in series if SERIAL is set +set -e -LUSTRE=${LUSTRE:-`dirname $0`/..} -SETUP=${SETUP:-""} -CLEANUP=${CLEANUP:-""} -. $LUSTRE/tests/test-framework.sh +ONLY=${ONLY:-"$*"} -init_test_env $@ +# bug number for skipped test: +ALWAYS_EXCEPT="$RECOVERY_DOUBLE_SCALE_EXCEPT" +# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} -DEBUGLOG=$TESTSUITELOG.debug - -cleanup_logs -exec 2>$DEBUGLOG -echo "--- env ---" >&2 -env >&2 -echo "--- env ---" >&2 -set -x +remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0 +remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0 -[ "$SHARED_DIRECTORY" ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; } +[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] && + skip_env "need three or more clients" && exit 0 -check_shared_dir $SHARED_DIRECTORY || - error "$SHARED_DIRECTORY isn't a shared directory" - -[ -n "$CLIENTS" ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; } - -[ $CLIENTCOUNT -ge 3 ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; } - -END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} -LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} - -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 - -check_timeout || exit 1 +if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then + skip_env "SHARED_DIRECTORY should be specified with a shared directory \ +which is accessable on all of the nodes" + exit 0 +fi [[ $FAILURE_MODE = SOFT ]] && \ log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" -build_test_filter - -check_and_setup_lustre -rm -rf $DIR/[df][0-9]* - -# the test node needs to be insulated from a lustre failure as much as possible, -# so not even loading the lustre modules is ideal. -# -- umount lustre -# -- remove hostname from clients list -zconf_umount $(hostname) $MOUNT -NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} -NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname)) +# Set SERIAL to serialize the failure through a recovery of the first failure. +SERIAL=${SERIAL:-""} +ERRORS_OK="yes" -check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} +[ "$SERIAL" ] && ERRORS_OK="" -MDTS=$(get_facets MDS) -OSTS=$(get_facets OST) +FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60 * 5))} # 5 minutes -rm -f $END_RUN_FILE +END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} +LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} reboot_recover_node () { # item var contains a pair of clients if nodetype=clients # I would prefer to have a list here local item=$1 - local nodetype=$2 - local timeout=$($LCTL get_param -n timeout) + local nodetype=$2 + local c # MDS, OST item contains the facet case $nodetype in - MDS|OST ) facet_failover $item - [ "$SERIAL" ] && wait_recovery_complete $item || true - ;; - clients) for c in ${item//,/ }; do - # make sure the client loads die - do_nodes $c "set -x; test -f $TMP/client-load.pid && \ - { kill -s TERM \$(cat $TMP/client-load.pid) || true; }" - shutdown_client $c - boot_node $c - echo "Reintegrating $c" - # one client fails; need dk logs from this client only - zconf_mount $c $MOUNT || NODES="$c $(mdts_nodes) $(osts_nodes)" error_exit "zconf_mount failed" - done - start_client_loads $item - ;; - # script failure: - # don't use error (), the logs from all nodes not needed - * ) echo "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'." 
- exit 1;; + MDS|OST ) facet_failover $item + [ "$SERIAL" ] && wait_recovery_complete $item || true + ;; + clients) for c in ${item//,/ }; do + # make sure the client loads die + stop_process $c $LOAD_PID_FILE + shutdown_client $c + boot_node $c + echo "Reintegrating $c" + zconf_mount $c $MOUNT || + error "mount $MOUNT on $c failed" + client_up $c || error "start client on $c failed" + done + start_client_loads $item + ;; + * ) echo "ERROR: invalid nodetype=$nodetype." \ + "Must be one of 'MDS', 'OST', or 'clients'." + exit 1;; esac } @@ -111,11 +87,9 @@ get_item_type () { case $type in MDS ) list=$MDTS;; OST ) list=$OSTS;; - clients) list=$NODES_TO_USE - ;; - # script failure: - # don't use error (), the logs from all nodes not needed - * ) echo "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'." + clients) list=$NODES_TO_USE;; + * ) echo "ERROR: invalid type=$type." \ + "Must be one of 'MDS', 'OST', or 'clients'." exit 1;; esac @@ -126,8 +100,8 @@ get_item_type () { return fi - item=$(get_random_entry $list) - if [ "$type" = clients ] ; then + local item=$(get_random_entry $list) + if [ "$type" = "clients" ]; then item="$item $(get_random_entry $(exclude_items_from_list $list $item))" item=$(comma_list $item) fi @@ -151,29 +125,26 @@ failover_pair() { local client2= log " -==== START === $title " +==== START === $title" item1=$(get_item_type $type1) [ "$item1" ] || \ { echo "type1=$type1 item1 is empty" && return 0; } item2=$(get_item_type $type2 $item1) [ "$item2" ] || \ - { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" && return 0; } + { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" \ + && return 0; } # Check that our client loads are still running. If any have died, # that means they have died outside of recovery, which is unacceptable. log "==== Checking the clients loads BEFORE failover -- failure NOT OK" - # FIXME. need print summary on exit - if ! check_client_loads $NODES_TO_USE; then - exit 4 - fi + check_client_loads $NODES_TO_USE || exit $? log "Done checking client loads. Failing type1=$type1 item1=$item1 ... " + reboot_recover_node $item1 $type1 || exit $? - reboot_recover_node $item1 $type1 - - # Hendrix test17 description: + # Hendrix test17 description: # Introduce a failure, wait at # least 5 minutes (for recovery), # introduce a 2nd @@ -185,14 +156,14 @@ failover_pair() { # We have a "double failures" if SERIAL is not set, # do not need a sleep between failures for "double failures" - log " Failing type2=$type2 item2=$item2 ... " - reboot_recover_node $item2 $type2 + log " Failing type2=$type2 item2=$item2 ... " + reboot_recover_node $item2 $type2 || exit $? # Client loads are allowed to die while in recovery, so we just # restart them. - log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK" - restart_client_loads $NODES_TO_USE $ERRORS_OK || return $? - log "Done checking / re-Starting client loads. PASS" + log "==== Checking the clients loads AFTER failover -- ERRORS_OK=$ERRORS_OK" + restart_client_loads $NODES_TO_USE $ERRORS_OK || exit $? + log "Done checking / re-starting client loads. PASS" return 0 } @@ -200,25 +171,12 @@ summary_and_cleanup () { local rc=$? trap 0 + CURRENT_TS=$(date +%s) + ELAPSED=$((CURRENT_TS - START_TS)) + # Having not empty END_RUN_FILE means the failed loads only if [ -s $END_RUN_FILE ]; then - echo "Found the END_RUN_FILE file: $END_RUN_FILE" - cat $END_RUN_FILE - local END_RUN_NODE= - read END_RUN_NODE < $END_RUN_FILE - - # a client load will end (i.e. 
fail) if it finds - # the end run file. that does not mean that that client load - # actually failed though. the first node in the END_RUN_NODE is - # the one we are really interested in. - if [ -n "$END_RUN_NODE" ]; then - var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" - echo - echo "client $END_RUN_NODE load debug output :" - local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug - do_node ${END_RUN_NODE} "set -x; [ -e $logfile ] && cat $logfile " || true - fi + print_end_run_file $END_RUN_FILE rc=1 fi @@ -232,114 +190,123 @@ Server failover period: $FAILOVER_PERIOD seconds Exited after: $ELAPSED seconds Status: $result: rc=$rc" - # make sure the client loads die - do_nodes $NODES_TO_USE "set -x; test -f $TMP/client-load.pid && \ - { kill -s TERM \$(cat $TMP/client-load.pid) || true; }" - - # and free up the pdshes that started them, if any are still around - if [ -n "$CLIENT_LOAD_PIDS" ]; then - kill $CLIENT_LOAD_PIDS || true - sleep 5 - kill -9 $CLIENT_LOAD_PIDS || true - fi + # stop the client loads + stop_client_loads $NODES_TO_USE $LOAD_PID_FILE if [ $rc -ne 0 ]; then # we are interested in only on failed clients and servers local failedclients=$(cat $END_RUN_FILE | grep -v $0) # FIXME: need ostfailover-s nodes also for FLAVOR=OST - local product=$(gather_logs $(comma_list $(osts_nodes) \ - $(mdts_nodes) $mdsfailover_HOST $failedclients)) - echo logs files $product + gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \ + $mdsfailover_HOST $failedclients) fi - [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT exit $rc } -trap summary_and_cleanup EXIT TERM INT +################################## Main Flow ################################### +build_test_filter -# -# MAIN -# -log "-----============= $0 starting =============-----" +check_and_setup_lustre +rm -rf $DIR/[Rdfs][0-9]* -START_TS=$(date +%s) -CURRENT_TS=$START_TS -ELAPSED=0 +check_timeout || exit 1 -# Set SERIAL to serialize the failure through a recovery of the first failure. -SERIAL=${SERIAL:-""} -ERRORS_OK="yes" +# The test node needs to be insulated from a lustre failure as much as possible, +# so not even loading the lustre modules is ideal. +# -- umount lustre +# -- remove hostname from clients list +zconf_umount $HOSTNAME $MOUNT +NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} +NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME) -[ "$SERIAL" ] && ERRORS_OK="" +check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} -FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes +MDTS=$(get_facets MDS) +OSTS=$(get_facets OST) -# Start client loads. -start_client_loads $NODES_TO_USE -echo clients load pids: -if ! do_nodesv $NODES_TO_USE "cat $TMP/client-load.pid"; then - exit 3 -fi +ELAPSED=0 +START_TS=$(date +%s) +CURRENT_TS=$START_TS -# FIXME: Do we want to have an initial sleep period where the clients -# just run before introducing a failure? -sleep $FAILOVER_PERIOD +# Every pairwise combination of client failures (2 clients), +# MDS failure, and OST failure will be tested. +test_pairwise_fail() { + trap summary_and_cleanup EXIT TERM INT -#CMD_TEST_NUM=17.1 -failover_pair MDS OST "test 1: failover MDS, then OST ==========" -sleep $FAILOVER_PERIOD + # Start client loads. 
+ rm -f $END_RUN_FILE + start_client_loads $NODES_TO_USE -#CMD_TEST_NUM=17.2 -failover_pair MDS clients "test 2: failover MDS, then 2 clients ====" -sleep $FAILOVER_PERIOD + echo clients load pids: + do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 -#CMD_TEST_NUM=17.3 -if [ $MDSCOUNT -gt 1 ]; then - failover_pair MDS MDS "test 3: failover MDS, then another MDS ==" + # FIXME: Do we want to have an initial sleep period where the clients + # just run before introducing a failure? sleep $FAILOVER_PERIOD -else - skip "$0 : $MDSCOUNT < 2 MDTs, test 3 skipped" -fi -#CMD_TEST_NUM=17.4 -if [ $OSTCOUNT -gt 1 ]; then - failover_pair OST OST "test 4: failover OST, then another OST ==" + # CMD_TEST_NUM=17.1 + failover_pair MDS OST "test 1: failover MDS, then OST ==========" sleep $FAILOVER_PERIOD -else - skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped" -fi -#CMD_TEST_NUM=17.5 -failover_pair OST clients "test 5: failover OST, then 2 clients ====" -sleep $FAILOVER_PERIOD + # CMD_TEST_NUM=17.2 + failover_pair MDS clients "test 2: failover MDS, then 2 clients ====" + sleep $FAILOVER_PERIOD -#CMD_TEST_NUM=17.6 -failover_pair OST MDS "test 6: failover OST, then MDS ==========" -sleep $FAILOVER_PERIOD + # CMD_TEST_NUM=17.3 + if [ $MDSCOUNT -gt 1 ]; then + failover_pair MDS MDS "test 3: failover MDS, then another MDS ==" + sleep $FAILOVER_PERIOD + else + skip_env "has less than 2 MDTs, test 3 skipped" + fi -#CMD_TEST_NUM=17.7 -failover_pair clients MDS "test 7: failover 2 clients, then MDS ====" -sleep $FAILOVER_PERIOD + # CMD_TEST_NUM=17.4 + if [ $OSTCOUNT -gt 1 ]; then + failover_pair OST OST "test 4: failover OST, then another OST ==" + sleep $FAILOVER_PERIOD + else + skip_env "has less than 2 OSTs, test 4 skipped" + fi -#CMD_TEST_NUM=17.8 -#failover_pair clients OST "test 8: failover 2 clients, then OST ====" -sleep $FAILOVER_PERIOD + # CMD_TEST_NUM=17.5 + failover_pair OST clients "test 5: failover OST, then 2 clients ====" + sleep $FAILOVER_PERIOD -#CMD_TEST_NUM=17.9 -if [ $CLIENTCOUNT -ge 5 ]; then - failover_pair clients clients "test 9: failover 2 clients, then 2 different clients ==" + # CMD_TEST_NUM=17.6 + failover_pair OST MDS "test 6: failover OST, then MDS ==========" sleep $FAILOVER_PERIOD -fi -log "==== Checking the clients loads AFTER all failovers -- failure NOT OK" -if ! check_client_loads $NODES_TO_USE; then - log "Client load failed after failover. Exiting" - exit 5 -fi -CURRENT_TS=$(date +%s) -ELAPSED=$((CURRENT_TS - START_TS)) + # CMD_TEST_NUM=17.7 + failover_pair clients MDS "test 7: failover 2 clients, then MDS ====" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.8 + failover_pair clients OST "test 8: failover 2 clients, then OST ====" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.9 + if [ $CLIENTCOUNT -gt 4 ]; then + failover_pair clients clients \ + "test 9: failover 2 clients, then 2 different clients ==" + sleep $FAILOVER_PERIOD + else + skip_env "has less than 5 Clients, test 9 skipped" + fi + + log "==== Checking the clients loads AFTER all failovers -- failure NOT OK" + if ! check_client_loads $NODES_TO_USE; then + log "Client load failed after failover. Exiting..." 
+ exit 5 + fi + + exit 0 +} +run_test pairwise_fail "pairwise combination of clients, MDS, and OST failures" -log "Completed successfully in $ELAPSED seconds" +zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed" +client_up || error "start client on $HOSTNAME failed" -exit 0 +complete $(basename $0) $SECONDS +check_and_cleanup_lustre +exit_status diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh index 8b91489..6a914b3 100644 --- a/lustre/tests/recovery-mds-scale.sh +++ b/lustre/tests/recovery-mds-scale.sh @@ -1,90 +1,54 @@ #!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: # Was Test 11 in cmd3. # For duration of 24 hours repeatedly failover a random MDS at # 10 minute intervals and verify that no application errors occur. # Test runs one of CLIENT_LOAD progs on remote clients. +set -e -LUSTRE=${LUSTRE:-`dirname $0`/..} -SETUP=${SETUP:-""} -CLEANUP=${CLEANUP:-""} -. $LUSTRE/tests/test-framework.sh +ONLY=${ONLY:-"$*"} -init_test_env $@ +# bug number for skipped test: +ALWAYS_EXCEPT="$RECOVERY_MDS_SCALE_EXCEPT" +# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} -DEBUGLOG=$TESTSUITELOG.debug - -cleanup_logs - -exec 2>$DEBUGLOG -echo "--- env ---" >&2 -env >&2 -echo "--- env ---" >&2 -set -x - -[ "$SHARED_DIRECTORY" ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; } - -check_shared_dir $SHARED_DIRECTORY || - error "$SHARED_DIRECTORY isn't a shared directory" - -[ -n "$CLIENTS" ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; } - -[ $CLIENTCOUNT -ge 3 ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; } - -END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} -LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} - -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 - -build_test_filter - -check_and_setup_lustre -rm -rf $DIR/[df][0-9]* - -max_recov_time=$(max_recovery_time) +remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0 +remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0 -# the test node needs to be insulated from a lustre failure as much as possible, -# so not even loading the lustre modules is ideal. -# -- umount lustre -# -- remove hostname from clients list -zconf_umount $(hostname) $MOUNT -NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} -NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname)) +[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] && + skip_env "need three or more clients" && exit 0 -check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} - -MDTS=$(get_facets MDS) -OSTS=$(get_facets OST) +if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then + skip_env "SHARED_DIRECTORY should be specified with a shared directory \ +which is accessable on all of the nodes" + exit 0 +fi ERRORS_OK="" # No application failures should occur during this test. 
-FLAVOR=${FLAVOR:-"MDS"} -if [ "$FLAVOR" == "MDS" ]; then - SERVERS=$MDTS -else - SERVERS=$OSTS -fi - if [ "$SLOW" = "no" ]; then DURATION=${DURATION:-$((60 * 30))} - SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))} else DURATION=${DURATION:-$((60 * 60 * 24))} - SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes fi +SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes -rm -f $END_RUN_FILE +MINSLEEP=${MINSLEEP:-120} +REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62 +REQFAIL=${REQFAIL:-$((DURATION / SERVER_FAILOVER_PERIOD * + REQFAIL_PERCENT / 100))} -vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat +END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} +LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} +VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid} server_numfailovers () { local facet=$1 @@ -105,30 +69,12 @@ servers_numfailovers () { } summary_and_cleanup () { - local rc=$? - local var trap 0 # Having not empty END_RUN_FILE means the failed loads only if [ -s $END_RUN_FILE ]; then - echo "Found the END_RUN_FILE file: $END_RUN_FILE" - cat $END_RUN_FILE - local END_RUN_NODE= - read END_RUN_NODE < $END_RUN_FILE - - # a client load will end (i.e. fail) if it finds - # the end run file. that does not mean that that client load - # actually failed though. the first node in the END_RUN_NODE is - # the one we are really interested in. - if [ -n "$END_RUN_NODE" ]; then - var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" - echo - echo "client $END_RUN_NODE load stdout and debug files : - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug" - fi + print_end_run_file $END_RUN_FILE rc=1 fi @@ -137,155 +83,170 @@ summary_and_cleanup () { local result=PASS [ $rc -eq 0 ] || result=FAIL - log "Duration: $DURATION + log "Duration: $DURATION Server failover period: $SERVER_FAILOVER_PERIOD seconds Exited after: $ELAPSED seconds Number of failovers before exit: $(servers_numfailovers) Status: $result: rc=$rc" - # stop the vmstats on the OSTs - if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \ - { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \ - gzip -f9 $vmstatLOG-\$(hostname); }" - fi + # stop vmstat on OSS nodes + [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE - # make sure the client loads die - do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \ - { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }" + # stop the client loads + stop_client_loads $NODES_TO_USE $LOAD_PID_FILE - # and free up the pdshes that started them, if any are still around - if [ -n "$CLIENT_LOAD_PIDS" ]; then - kill $CLIENT_LOAD_PIDS || true - sleep 5 - kill -9 $CLIENT_LOAD_PIDS || true - fi if [ $rc -ne 0 ]; then # we are interested in only on failed clients and servers local failedclients=$(cat $END_RUN_FILE | grep -v $0) # FIXME: need ostfailover-s nodes also for FLAVOR=OST - local product=$(gather_logs $(comma_list $(osts_nodes) \ - $(mdts_nodes) $mdsfailover_HOST $failedclients)) - echo logs files $product + gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \ + $mdsfailover_HOST $failedclients) fi - [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT - exit $rc } -# -# MAIN -# -log "-----============= $0 starting =============-----" +failover_target() { + local flavor=${1:-"MDS"} + local servers + local serverfacet + local var -trap summary_and_cleanup EXIT INT + [ 
"$flavor" = "MDS" ] && servers=$MDTS || servers=$OSTS -ELAPSED=0 + trap summary_and_cleanup EXIT INT -# vmstat the osts -if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null /tmp/vmstat.pid" -fi + # start vmstat on OSS nodes + [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE -# Start client loads. -start_client_loads $NODES_TO_USE + # start client loads + rm -f $END_RUN_FILE + start_client_loads $NODES_TO_USE -echo clients load pids: -if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then - exit 3 -fi + echo client loads pids: + do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 -MINSLEEP=${MINSLEEP:-120} -REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62 -REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))} -reqfail=0 -sleep=0 + ELAPSED=0 + local sleep=0 + local reqfail=0 + local it_time_start + local start_ts=$(date +%s) + local current_ts=$start_ts -START_TS=$(date +%s) -CURRENT_TS=$START_TS + while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do + # In order to perform the + # expected number of failovers, we need to account the following: + # 1) the time that has elapsed during the client load checking + # 2) time takes for failover + it_time_start=$(date +%s) -while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do + serverfacet=$(get_random_entry $servers) + var=${serverfacet}_numfailovers - # In order to perform the - # expected number of failovers, we need to account the following : - # 1) the time that has elapsed during the client load checking - # 2) time takes for failover + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. + log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ + ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + check_client_loads $NODES_TO_USE || exit 4 - it_time_start=$(date +%s) + log "Wait $serverfacet recovery complete before doing next failover..." + if ! wait_recovery_complete $serverfacet; then + echo "$serverfacet recovery is not completed!" + exit 7 + fi - SERVERFACET=$(get_random_entry $SERVERS) - var=${SERVERFACET}_numfailovers + log "Checking clients are in FULL state before doing next failover..." + if ! wait_clients_import_state $NODES_TO_USE $serverfacet FULL; then + echo "Clients import not FULL, please consider to increase \ +SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD!" + fi - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. + log "Starting failover on $serverfacet" + facet_failover "$serverfacet" || exit 1 - log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + # Check that our client loads are still running during failover. + # No application failures should occur. + log "==== Checking the clients loads AFTER failover -- failure NOT OK" + if ! check_client_loads $NODES_TO_USE; then + log "Client load failed during failover. Exiting..." + exit 5 + fi - if ! check_client_loads $NODES_TO_USE; then - exit 4 - fi + # Increment the number of failovers. + val=$((${!var} + 1)) + eval $var=$val - log "Wait $SERVERFACET recovery complete before doing next failover ...." + current_ts=$(date +%s) + ELAPSED=$((current_ts - start_ts)) - if ! 
wait_recovery_complete $SERVERFACET ; then - echo "$SERVERFACET recovery is not completed!" - exit 7 - fi + sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start))) - log "Checking clients are in FULL state before doing next failover" - if ! wait_clients_import_state $NODES_TO_USE $SERVERFACET FULL; then - echo "Clients import not FULL, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD !" + # Keep counting the number of iterations when + # time spent to failover and two client loads check exceeded + # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ). + if [ $sleep -lt $MINSLEEP ]; then + reqfail=$((reqfail + 1)) + log "WARNING: failover and two check_client_loads time exceeded \ +SERVER_FAILOVER_PERIOD - MINSLEEP! +Failed to load the filesystem with I/O for a minimum period of \ +$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ). +This iteration, the load was only applied for sleep=$sleep seconds. +Estimated max recovery time: $MAX_RECOV_TIME +Probably the hardware is taking excessively long time to boot. +Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \ +bug 20918" + [ $reqfail -gt $REQFAIL ] && exit 6 + fi - fi - log "Starting failover on $SERVERFACET" + log "$serverfacet has failed over ${!var} times, and counting..." - facet_failover "$SERVERFACET" || exit 1 + [ $((ELAPSED + sleep)) -ge $DURATION ] && break - # Check that our client loads are still running during failover. - # No application failures should occur. + if [ $sleep -gt 0 ]; then + echo "sleeping $sleep seconds... " + sleep $sleep + fi + done + exit 0 +} - log "==== Checking the clients loads AFTER failover -- failure NOT OK" - if ! check_client_loads $NODES_TO_USE; then - log "Client load failed during failover. Exiting" - exit 5 - fi +################################## Main Flow ################################### +build_test_filter - # Increment the number of failovers - val=$((${!var} + 1)) - eval $var=$val +check_and_setup_lustre +rm -rf $DIR/[Rdfs][0-9]* - CURRENT_TS=$(date +%s) - ELAPSED=$((CURRENT_TS - START_TS)) +MAX_RECOV_TIME=$(max_recovery_time) - sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start))) +# The test node needs to be insulated from a lustre failure as much as possible, +# so not even loading the lustre modules is ideal. +# -- umount lustre +# -- remove hostname from clients list +zconf_umount $HOSTNAME $MOUNT +NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} +NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME) - # keep count the number of itterations when - # time spend to failover and two client loads check exceeded - # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) - if [ $sleep -lt $MINSLEEP ]; then - reqfail=$((reqfail +1)) - log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP ! -Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ). -This iteration, the load was only applied for sleep=$sleep seconds. -Estimated max recovery time : $max_recov_time -Probably the hardware is taking excessively long to boot. -Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi +check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} - log "$SERVERFACET has failed over ${!var} times, and counting..." 
+MDTS=$(get_facets MDS) +OSTS=$(get_facets OST) - if [ $((ELAPSED + sleep)) -ge $DURATION ]; then - break - fi +test_failover_mds() { + # failover a random MDS + failover_target MDS +} +run_test failover_mds "failover MDS" - if [ $sleep -gt 0 ]; then - echo "sleeping $sleep seconds ... " - sleep $sleep - fi -done +test_failover_ost() { + # failover a random OST + failover_target OST +} +run_test failover_ost "failover OST" + +zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed" +client_up || error "start client on $HOSTNAME failed" -exit 0 +complete $(basename $0) $SECONDS +check_and_cleanup_lustre +exit_status diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh index 6b3f758..9de55c0 100644 --- a/lustre/tests/recovery-random-scale.sh +++ b/lustre/tests/recovery-random-scale.sh @@ -1,4 +1,5 @@ #!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: # client failure does not affect other clients @@ -9,314 +10,261 @@ # 10 minute intervals and verify that no application errors occur. # Test runs one of CLIENT_LOAD progs on remote clients. +set -e -LUSTRE=${LUSTRE:-`dirname $0`/..} -SETUP=${SETUP:-""} -CLEANUP=${CLEANUP:-""} -. $LUSTRE/tests/test-framework.sh +ONLY=${ONLY:-"$*"} -init_test_env $@ +# bug number for skipped test: +ALWAYS_EXCEPT="$RECOVERY_RANDOM_SCALE_EXCEPT" +# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} +. $LUSTRE/tests/test-framework.sh +init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} -DEBUGLOG=$TESTSUITELOG.debug - -cleanup_logs - -exec 2>$DEBUGLOG -echo "--- env ---" >&2 -env >&2 -echo "--- env ---" >&2 -set -x - -[ "$SHARED_DIRECTORY" ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; } - -check_shared_dir $SHARED_DIRECTORY || - error "$SHARED_DIRECTORY isn't a shared directory" - -[ -n "$CLIENTS" ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; } +remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0 +remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0 -[ $CLIENTCOUNT -ge 3 ] || \ - { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; } +[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] && + skip_env "need three or more clients" && exit 0 -END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} -LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} - -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 +if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then + skip_env "SHARED_DIRECTORY should be specified with a shared directory \ +which is accessable on all of the nodes" + exit 0 +fi [[ $FAILURE_MODE = SOFT ]] && \ log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" -build_test_filter - -check_and_setup_lustre -rm -rf $DIR/[df][0-9]* - -max_recov_time=$(max_recovery_time) - -# the test node needs to be insulated from a lustre failure as much as possible, -# so not even loading the lustre modules is ideal. -# -- umount lustre -# -- remove hostname from clients list -zconf_umount $(hostname) $MOUNT -NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} -NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname)) - -check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} - -MDTS=$(get_facets MDS) +# Application failures are allowed for the failed client +# but not for other clients. 
+ERRORS_OK="yes" if [ "$SLOW" = "no" ]; then DURATION=${DURATION:-$((60 * 30))} - SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))} else DURATION=${DURATION:-$((60 * 60 * 24))} - SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes fi +SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes -rm -f $END_RUN_FILE +MINSLEEP=${MINSLEEP:-120} +REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62 +REQFAIL=${REQFAIL:-$((DURATION / SERVER_FAILOVER_PERIOD * + REQFAIL_PERCENT / 100))} -vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat +END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} +LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} +VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid} numfailovers () { local facet local var - for facet in $MDTS ${failed_clients//,/ }; do + for facet in $MDTS ${FAILED_CLIENTS//,/ }; do var=${facet}_nums val=${!var} if [ "$val" ] ; then - echo "$facet failed over $val times" + echo "$facet failed over $val times" fi done } -# list is comma separated -print_logs () { - local list=$1 - - do_nodes $list "node=\\\$(hostname) -var=\\\${node}_load -log=${TESTSUITELOG}_run_${!var}.sh-\\\$node.debug -if [ -e \\\$log ] ; then -echo Node \\\$node debug log: -cat \\\$log -fi" -} - summary_and_cleanup () { - local rc=$? - local var trap 0 # Having not empty END_RUN_FILE means the failed loads only if [ -s $END_RUN_FILE ]; then - echo "Found the END_RUN_FILE file: $END_RUN_FILE" - cat $END_RUN_FILE - local END_RUN_NODE= - read END_RUN_NODE < $END_RUN_FILE - - # a client load will end (i.e. fail) if it finds - # the end run file. that does not mean that that client load - # actually failed though. the first node in the END_RUN_NODE is - # the one we are really interested in. - if [ -n "$END_RUN_NODE" ]; then - var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" - echo - echo "client $END_RUN_NODE load stdout and debug files : - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug" - fi + print_end_run_file $END_RUN_FILE rc=1 fi - echo $(date +'%F %H:%M:%S') Terminating clients loads ... 
echo "$0" >> $END_RUN_FILE local result=PASS [ $rc -eq 0 ] || result=FAIL - log "Duration: $DURATION + log "Duration: $DURATION Server failover period: $SERVER_FAILOVER_PERIOD seconds Exited after: $ELAPSED seconds Number of failovers before exit: $(numfailovers) Status: $result: rc=$rc" - # stop the vmstats on the OSTs - if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \ - { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \ - gzip -f9 $vmstatLOG-\$(hostname); }" - fi + # stop vmstat on OSS nodes + [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE - # make sure the client loads die - do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \ - { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }" - - # and free up the pdshes that started them, if any are still around - if [ -n "$CLIENT_LOAD_PIDS" ]; then - kill $CLIENT_LOAD_PIDS || true - sleep 5 - kill -9 $CLIENT_LOAD_PIDS || true - fi + # stop the client loads + stop_client_loads $NODES_TO_USE $LOAD_PID_FILE if [ $rc -ne 0 ]; then - print_logs $NODES_TO_USE # we are interested in only on failed clients and servers local failedclients=$(cat $END_RUN_FILE | grep -v $0) # FIXME: need ostfailover-s nodes also for FLAVOR=OST - local product=$(gather_logs $(comma_list $(osts_nodes) \ - $(mdts_nodes) $mdsfailover_HOST $failedclients)) - echo logs files $product + gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \ + $mdsfailover_HOST $failedclients) fi - [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT - exit $rc } -# -# MAIN -# -log "-----============= $0 starting =============-----" +################################## Main Flow ################################### +build_test_filter -trap summary_and_cleanup EXIT # INT +check_and_setup_lustre +rm -rf $DIR/[Rdfs][0-9]* -ELAPSED=0 +MAX_RECOV_TIME=$(max_recovery_time) -# vmstat the osts -if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null /tmp/vmstat.pid" -fi +# The test node needs to be insulated from a lustre failure as much as possible, +# so not even loading the lustre modules is ideal. +# -- umount lustre +# -- remove hostname from clients list +zconf_umount $HOSTNAME $MOUNT +NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} +NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME) -# Start client loads. -start_client_loads $NODES_TO_USE +check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} -echo clients load pids: -if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then - exit 3 -fi +MDTS=$(get_facets MDS) -START_TS=$(date +%s) -CURRENT_TS=$START_TS +# Fail a random client and then failover a random MDS. +test_fail_client_mds() { + local fail_client + local serverfacet + local client_var + local var -MINSLEEP=${MINSLEEP:-120} -REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62 -REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))} -reqfail=0 -sleep=0 + trap summary_and_cleanup EXIT INT -# This is used for FAIL_CLIENT only -ERRORS_OK="yes" -while [ $ELAPSED -lt $DURATION -a ! 
-e $END_RUN_FILE ]; do + # start vmstat on OSS nodes + [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE - # In order to perform the - # expected number of failovers, we need to account the following : - # 1) the time that has elapsed during the client load checking - # 2) time takes for failover + # start client loads + rm -f $END_RUN_FILE + start_client_loads $NODES_TO_USE - it_time_start=$(date +%s) - - FAIL_CLIENT=$(get_random_entry $NODES_TO_USE) - client_var=$(node_var_name $FAIL_CLIENT)_nums + echo client loads pids: + do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 - # store the list of failed clients - # lists are comma separated - failed_clients=$(expand_list $failed_clients $FAIL_CLIENT) + ELAPSED=0 + local sleep=0 + local reqfail=0 + local it_time_start + local start_ts=$(date +%s) + local current_ts=$start_ts - SERVERFACET=$(get_random_entry $MDTS) - var=${SERVERFACET}_nums + while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do + # In order to perform the + # expected number of failovers, we need to account the following: + # 1) the time that has elapsed during the client load checking + # 2) time takes for failover + it_time_start=$(date +%s) - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. + fail_client=$(get_random_entry $NODES_TO_USE) + client_var=$(node_var_name $fail_client)_nums - log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + # store the list of failed clients + # lists are comma separated + FAILED_CLIENTS=$(expand_list $FAILED_CLIENTS $fail_client) - if ! check_client_loads $NODES_TO_USE; then - exit 4 - fi + serverfacet=$(get_random_entry $MDTS) + var=${serverfacet}_nums - log "FAIL CLIENT $FAIL_CLIENT ... " - shutdown_client $FAIL_CLIENT + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. + log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ + ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + check_client_loads $NODES_TO_USE || exit 4 - log "Starting failover on $SERVERFACET" + log "FAIL CLIENT $fail_client..." + shutdown_client $fail_client - facet_failover "$SERVERFACET" || exit 1 - if ! wait_recovery_complete $SERVERFACET ; then - echo "$SERVERFACET recovery is not completed!" - exit 7 - fi + log "Starting failover on $serverfacet" + facet_failover "$serverfacet" || exit 1 - boot_node $FAIL_CLIENT - echo "Reintegrating $FAIL_CLIENT" - zconf_mount $FAIL_CLIENT $MOUNT || exit $? - - # Increment the number of failovers - val=$((${!var} + 1)) - eval $var=$val - val=$((${!client_var} + 1)) - eval $client_var=$val - - # load script on failed clients could create END_RUN_FILE - # We shuold remove it and ignore the failure if this - # file contains the failed client only. - # We can not use ERRORS_OK when start all loads at the start of this script - # because the application errors allowed for random failed client only, but - # not for all clients. - if [ -e $END_RUN_FILE ]; then - read END_RUN_NODE < $END_RUN_FILE - [[ $END_RUN_NODE = $FAIL_CLIENT ]] && - rm -f $END_RUN_FILE || exit 13 - fi + if ! wait_recovery_complete $serverfacet; then + echo "$serverfacet recovery is not completed!" + exit 7 + fi - restart_client_loads $FAIL_CLIENT $ERRORS_OK || exit $? 
+ boot_node $fail_client + echo "Reintegrating $fail_client" + zconf_mount $fail_client $MOUNT || exit $? + client_up $fail_client || exit $? + + # Increment the number of failovers + val=$((${!var} + 1)) + eval $var=$val + val=$((${!client_var} + 1)) + eval $client_var=$val + + # load script on failed clients could create END_RUN_FILE + # We shuold remove it and ignore the failure if this + # file contains the failed client only. + # We can not use ERRORS_OK when start all loads at the start of + # this script because the application errors allowed for random + # failed client only, but not for all clients. + if [ -e $END_RUN_FILE ]; then + local end_run_node + read end_run_node < $END_RUN_FILE + [[ $end_run_node = $fail_client ]] && + rm -f $END_RUN_FILE || exit 13 + fi - # Check that not failed clients loads are still running. - # No application failures should occur on clients that was not failed. + restart_client_loads $fail_client $ERRORS_OK || exit $? - log "==== Checking the clients loads AFTER failed client reintegrated -- failure NOT OK" - if ! ERRORS_OK= check_client_loads $(exclude_items_from_list $NODES_TO_USE $FAIL_CLIENT); then - log "Client load failed. Exiting" - exit 5 - fi + # Check that not failed clients loads are still running. + # No application failures should occur on clients that were not failed. + log "==== Checking the clients loads AFTER failed client reintegrated \ +-- failure NOT OK" + if ! ERRORS_OK= check_client_loads \ + $(exclude_items_from_list $NODES_TO_USE $fail_client); then + log "Client load failed. Exiting..." + exit 5 + fi - CURRENT_TS=$(date +%s) - ELAPSED=$((CURRENT_TS - START_TS)) - sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start))) - - # keep count the number of itterations when - # time spend to failover and two client loads check exceeded - # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) - if [ $sleep -lt $MINSLEEP ]; then - reqfail=$((reqfail +1)) - log "WARNING: failover, client reintegration and check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP ! -Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ). + current_ts=$(date +%s) + ELAPSED=$((current_ts - start_ts)) + sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start))) + + # Keep counting the number of iterations when + # time spent to failover and two client loads check exceeded + # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ). + if [ $sleep -lt $MINSLEEP ]; then + reqfail=$((reqfail + 1)) + log "WARNING: failover, client reintegration and \ +check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP! +Failed to load the filesystem with I/O for a minimum period of \ +$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ). This iteration, the load was only applied for sleep=$sleep seconds. -Estimated max recovery time : $max_recov_time -Probably the hardware is taking excessively long to boot. -Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi +Estimated max recovery time : $MAX_RECOV_TIME +Probably the hardware is taking excessively long time to boot. +Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \ +bug 20918" + [ $reqfail -gt $REQFAIL ] && exit 6 + fi - log " Number of failovers: + log "Number of failovers: $(numfailovers) and counting..." 
- if [ $((ELAPSED + sleep)) -ge $DURATION ]; then - break - fi + [ $((ELAPSED + sleep)) -ge $DURATION ] && break - if [ $sleep -gt 0 ]; then - echo "sleeping $sleep seconds ... " - sleep $sleep - fi -done + if [ $sleep -gt 0 ]; then + echo "sleeping $sleep seconds... " + sleep $sleep + fi + done + exit 0 +} +run_test fail_client_mds "fail client, then failover MDS" + +zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed" +client_up || error "start client on $HOSTNAME failed" -exit 0 +complete $(basename $0) $SECONDS +check_and_cleanup_lustre +exit_status diff --git a/lustre/tests/run_IOR.sh b/lustre/tests/run_IOR.sh index 6da7f54..9f8f816 100755 --- a/lustre/tests/run_IOR.sh +++ b/lustre/tests/run_IOR.sh @@ -1,16 +1,19 @@ #!/bin/bash -set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +TESTNAME=${TESTNAME:-""} +[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME + +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} rm -f $LOG $DEBUGLOG exec 2>$DEBUGLOG +set -x . $(dirname $0)/functions.sh @@ -46,19 +49,19 @@ while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do load_pid=$! wait $load_pid if [ ${PIPESTATUS[0]} -eq 0 ]; then - echoerr "$(date +'%F %H:%M:%S'): IOR succeeded" - cd $TMP - rm -rf $TESTDIR - echoerr "$(date +'%F %H:%M:%S'): IOR run finished" + echoerr "$(date +'%F %H:%M:%S'): IOR succeeded" + cd $TMP + rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): IOR run finished" else - echoerr "$(date +'%F %H:%M:%S'): IOR failed" - if [ -z "$ERRORS_OK" ]; then - echo $(hostname) >> $END_RUN_FILE - fi - if [ $BREAK_ON_ERROR ]; then - # break + echoerr "$(date +'%F %H:%M:%S'): IOR failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break CONTINUE=false - fi + fi fi done diff --git a/lustre/tests/run_dbench.sh b/lustre/tests/run_dbench.sh index f1520e8..b6c2ac1 100755 --- a/lustre/tests/run_dbench.sh +++ b/lustre/tests/run_dbench.sh @@ -1,16 +1,19 @@ #!/bin/bash -set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +TESTNAME=${TESTNAME:-""} +[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME + +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} rm -f $LOG $DEBUGLOG exec 2>$DEBUGLOG +set -x . $(dirname $0)/functions.sh @@ -34,19 +37,19 @@ while [ ! 
-e "$END_RUN_FILE" ] && $CONTINUE; do wait $load_pid if [ ${PIPESTATUS[0]} -eq 0 ]; then - echoerr "$(date +'%F %H:%M:%S'): dbench succeeded" - cd $TMP - rm -rf $TESTDIR - echoerr "$(date +'%F %H:%M:%S'): dbench run finished" + echoerr "$(date +'%F %H:%M:%S'): dbench succeeded" + cd $TMP + rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): dbench run finished" else - echoerr "$(date +'%F %H:%M:%S'): dbench failed" - if [ -z "$ERRORS_OK" ]; then - echo $(hostname) >> $END_RUN_FILE - fi - if [ $BREAK_ON_ERROR ]; then - # break + echoerr "$(date +'%F %H:%M:%S'): dbench failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break CONTINUE=false - fi + fi fi done diff --git a/lustre/tests/run_dd.sh b/lustre/tests/run_dd.sh index d8151d8..36af6ae 100755 --- a/lustre/tests/run_dd.sh +++ b/lustre/tests/run_dd.sh @@ -1,16 +1,19 @@ #!/bin/bash -set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +TESTNAME=${TESTNAME:-""} +[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME + +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} rm -f $LOG $DEBUGLOG exec 2>$DEBUGLOG +set -x . $(dirname $0)/functions.sh @@ -35,19 +38,19 @@ while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do wait $load_pid if [ $? -eq 0 ]; then - echoerr "$(date +'%F %H:%M:%S'): dd succeeded" - cd $TMP - rm -rf $TESTDIR - echoerr "$(date +'%F %H:%M:%S'): dd run finished" + echoerr "$(date +'%F %H:%M:%S'): dd succeeded" + cd $TMP + rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): dd run finished" else - echoerr "$(date +'%F %H:%M:%S'): dd failed" - if [ -z "$ERRORS_OK" ]; then - echo $(hostname) >> $END_RUN_FILE - fi - if [ $BREAK_ON_ERROR ]; then - # break + echoerr "$(date +'%F %H:%M:%S'): dd failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break CONTINUE=false - fi + fi fi done diff --git a/lustre/tests/run_iozone.sh b/lustre/tests/run_iozone.sh index 297142d..642303c 100755 --- a/lustre/tests/run_iozone.sh +++ b/lustre/tests/run_iozone.sh @@ -1,16 +1,19 @@ #!/bin/bash -set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +TESTNAME=${TESTNAME:-""} +[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME + +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} rm -f $LOG $DEBUGLOG exec 2>$DEBUGLOG +set -x . $(dirname $0)/functions.sh @@ -32,24 +35,24 @@ while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do load_pid=$! 
wait $load_pid if [ ${PIPESTATUS[0]} -eq 0 ]; then - echoerr "$(date +'%F %H:%M:%S'): iozone succeeded" - cd $TMP - rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): iozone succeeded" + cd $TMP + rm -rf $TESTDIR if [ -d $TESTDIR ]; then - echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR" - echo $(hostname) >> $END_RUN_FILE + echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR" + echo $(hostname) >> $END_RUN_FILE CONTINUE=false fi - echoerr "$(date +'%F %H:%M:%S'): iozone run finished" + echoerr "$(date +'%F %H:%M:%S'): iozone run finished" else - echoerr "$(date +'%F %H:%M:%S'): iozone failed" - if [ -z "$ERRORS_OK" ]; then - echo $(hostname) >> $END_RUN_FILE - fi - if [ $BREAK_ON_ERROR ]; then - # break + echoerr "$(date +'%F %H:%M:%S'): iozone failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break CONTINUE=false - fi + fi fi done diff --git a/lustre/tests/run_tar.sh b/lustre/tests/run_tar.sh index 1bc47e9..f46b874 100755 --- a/lustre/tests/run_tar.sh +++ b/lustre/tests/run_tar.sh @@ -1,16 +1,19 @@ #!/bin/bash -set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +TESTNAME=${TESTNAME:-""} +[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME + +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} rm -f $LOG $DEBUGLOG exec 2>$DEBUGLOG +set -x . $(dirname $0)/functions.sh @@ -24,7 +27,7 @@ echo $$ >$LOAD_PID_FILE TESTDIR=$MOUNT/d0.tar-$(hostname) do_tar() { - tar cf - /etc | tar xf - 2>&1 | tee $LOG + tar cf - /etc | tar xf - >$LOG 2>&1 return ${PIPESTATUS[1]} } @@ -42,19 +45,19 @@ while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do RC=0 fi if [ $RC -eq 0 ]; then - echoerr "$(date +'%F %H:%M:%S'): tar succeeded" - cd $TMP - rm -rf $TESTDIR - echoerr "$(date +'%F %H:%M:%S'): tar run finished" + echoerr "$(date +'%F %H:%M:%S'): tar succeeded" + cd $TMP + rm -rf $TESTDIR + echoerr "$(date +'%F %H:%M:%S'): tar run finished" else - echoerr "$(date +'%F %H:%M:%S'): tar failed" - if [ -z "$ERRORS_OK" ]; then - echo $(hostname) >> $END_RUN_FILE - fi - if [ $BREAK_ON_ERROR ]; then - # break + echoerr "$(date +'%F %H:%M:%S'): tar failed" + if [ -z "$ERRORS_OK" ]; then + echo $(hostname) >> $END_RUN_FILE + fi + if [ $BREAK_ON_ERROR ]; then + # break CONTINUE=false - fi + fi fi done diff --git a/lustre/tests/rundbench b/lustre/tests/rundbench index ab9f236..2b17ebd 100755 --- a/lustre/tests/rundbench +++ b/lustre/tests/rundbench @@ -44,13 +44,13 @@ for prefix in $CLIENT_PREFIX; do [ "x$CLIENT" != "x" ] && break; done -if [ -n "$SRC" -a -s "$SRC" ]; then - CLIENT=${SRC} +if [ -n "$DBENCH_SRC" -a -s "$DBENCH_SRC" ]; then + CLIENT=$DBENCH_SRC fi -[ ! -s "$CLIENT" ] && \ - skip_env "$0 : $(hostname) no client file found for dbench DBENCH_LIB=$DBENCH_LIB SRC=$SRC" && \ - exit 0 +[ ! -s "$CLIENT" ] && + skip_env "$0: no client file found for dbench on $(hostname): "\ + "DBENCH_LIB=$DBENCH_LIB DBENCH_SRC=$DBENCH_SRC" && exit 0 [ ! -s "$TGT" ] && echo "copying $CLIENT to $TGT" && cp $CLIENT $TGT [ ! 
-s "$TGT" ] && \ diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 6f86ad0..e9e02e1 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -128,11 +128,10 @@ init_test_env() { #[ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export TESTSUITELOG=${TMP}/${TESTSUITE}.log - if [[ -z $LOGDIRSET ]]; then - export LOGDIR=${LOGDIR:-${TMP}/test_logs/}/$(date +%s) - export LOGDIRSET=true - fi - export HOSTNAME=${HOSTNAME:-`hostname`} + export LOGDIR=${LOGDIR:-${TMP}/test_logs/$(date +%s)} + export TESTLOG_PREFIX=$LOGDIR/$TESTSUITE + + export HOSTNAME=${HOSTNAME:-$(hostname -s)} if ! echo $PATH | grep -q $LUSTRE/utils; then export PATH=$LUSTRE/utils:$PATH fi @@ -1094,14 +1093,20 @@ start_client_load() { eval export ${var}=$load do_node $client "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \ - BREAK_ON_ERROR=$BREAK_ON_ERROR \ - END_RUN_FILE=$END_RUN_FILE \ - LOAD_PID_FILE=$LOAD_PID_FILE \ - TESTSUITELOG=$TESTSUITELOG \ - run_${load}.sh" & - CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!" +BREAK_ON_ERROR=$BREAK_ON_ERROR \ +END_RUN_FILE=$END_RUN_FILE \ +LOAD_PID_FILE=$LOAD_PID_FILE \ +TESTLOG_PREFIX=$TESTLOG_PREFIX \ +TESTNAME=$TESTNAME \ +DBENCH_LIB=$DBENCH_LIB \ +DBENCH_SRC=$DBENCH_SRC \ +run_${load}.sh" & + local ppid=$! log "Started client load: ${load} on $client" + # get the children process IDs + local pids=$(ps --ppid $ppid -o pid= | xargs) + CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $ppid $pids" return 0 } @@ -1118,14 +1123,14 @@ start_client_loads () { sleep 2 } -# only for remote client +# only for remote client check_client_load () { local client=$1 local var=$(node_var_name $client)_load local TESTLOAD=run_${!var}.sh ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1 - + # bug 18914: try to connect several times not only when # check ps, but while check_catastrophe also local tries=3 @@ -1203,7 +1208,7 @@ restart_client_loads () { if [ "$rc" != 0 ]; then log "Client load failed to restart on node $client, rc=$rc" # failure one client load means test fail - # we do not need to check other + # we do not need to check other return $rc fi else @@ -1211,6 +1216,70 @@ restart_client_loads () { fi done } + +# Start vmstat and save its process ID in a file. +start_vmstat() { + local nodes=$1 + local pid_file=$2 + + [ -z "$nodes" -o -z "$pid_file" ] && return 0 + + do_nodes $nodes \ + "vmstat 1 > $TESTLOG_PREFIX.$TESTNAME.vmstat.\\\$(hostname -s).log \ + 2>/dev/null $pid_file" +} + +# Display the nodes on which client loads failed. +print_end_run_file() { + local file=$1 + local node + + [ -s $file ] || return 0 + + echo "Found the END_RUN_FILE file: $file" + cat $file + + # A client load will stop if it finds the END_RUN_FILE file. + # That does not mean the client load actually failed though. + # The first node in END_RUN_FILE is the one we are interested in. + read node < $file + + if [ -n "$node" ]; then + local var=$(node_var_name $node)_load + + local prefix=$TESTLOG_PREFIX + [ -n "$TESTNAME" ] && prefix=$prefix.$TESTNAME + local stdout_log=$prefix.run_${!var}_stdout.$node.log + local debug_log=$(echo $stdout_log | sed 's/\(.*\)stdout/\1debug/') + + echo "Client load ${!var} failed on node $node:" + echo "$stdout_log" + echo "$debug_log" + fi +} + +# Stop the process which had its PID saved in a file. 
+stop_process() { + local nodes=$1 + local pid_file=$2 + + [ -z "$nodes" -o -z "$pid_file" ] && return 0 + + do_nodes $nodes "test -f $pid_file && + { kill -s TERM \\\$(cat $pid_file); rm -f $pid_file; }" || true +} + +# Stop all client loads. +stop_client_loads() { + local nodes=${1:-$CLIENTS} + local pid_file=$2 + + # stop the client loads + stop_process $nodes $pid_file + + # clean up the processes that started them + [ -n "$CLIENT_LOAD_PIDS" ] && kill -9 $CLIENT_LOAD_PIDS 2>/dev/null || true +} # End recovery-scale functions # verify that lustre actually cleaned up properly @@ -2135,6 +2204,7 @@ setupall() { [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE mount_client $MOUNT [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT + clients_up if [ "$MOUNT_2" ]; then mount_client $MOUNT2 @@ -2578,7 +2648,8 @@ check_and_cleanup_lustre() { fi if is_mounted $MOUNT; then - [ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]* + [ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]* || + error "remove sub-test dirs failed" [ "$ENABLE_QUOTA" ] && restore_quota_type || true fi @@ -2940,6 +3011,7 @@ error_noexit() { log " ${TESTSUITE} ${TESTNAME}: @@@@@@ ${TYPE}: $@ " + mkdir -p $LOGDIR # We need to dump the logs on all nodes if $dump; then gather_logs $(comma_list $(nodes_list)) @@ -4096,7 +4168,7 @@ destroy_pools () { echo destroy the created pools: ${!listvar} for poolname in ${!listvar//,/ }; do - destroy_pool $fsname.$poolname + destroy_pool $fsname.$poolname done } @@ -4111,8 +4183,15 @@ gather_logs () { local ts=$(date +%s) local docp=true + + if [[ ! -f "$YAML_LOG" ]]; then + # init_logging is not performed before gather_logs, + # so the $LOGDIR needs to be checked here + check_shared_dir $LOGDIR && touch $LOGDIR/shared + fi + [ -f $LOGDIR/shared ] && docp=false - + # dump lustre logs, dmesg prefix="$LOGDIR/${TESTSUITE}.${TESTNAME}" @@ -4129,20 +4208,9 @@ gather_logs () { do_nodesv $list \ "$LCTL dk > ${prefix}.debug_log.\\\$(hostname -s).${suffix}; dmesg > ${prefix}.dmesg.\\\$(hostname -s).${suffix}" - if [ ! -f $LOGDIR/shared ]; then + if $docp; then do_nodes $list rsync -az "${prefix}.*.${suffix}" $HOSTNAME:$LOGDIR - fi - - local archive=$LOGDIR/${TESTSUITE}-$ts.tar.bz2 - tar -jcf $archive $LOGDIR/*$ts* $LOGDIR/*${TESTSUITE}* - - echo $archive -} - -cleanup_logs () { - local list=${1:-$(comma_list $(nodes_list))} - - [ -n ${TESTSUITE} ] && do_nodes $list "rm -f $TMP/*${TESTSUITE}*" || true + fi } do_ls () { @@ -4488,20 +4556,24 @@ check_logdir() { # Not found. Create local logdir mkdir -p $dir else - touch $dir/node.$(hostname -s).yml + touch $dir/check_file.$(hostname -s) fi return 0 } check_write_access() { local dir=$1 + local node + local file + for node in $(nodes_list); do - if [ ! -f "$dir/node.$(short_hostname ${node}).yml" ]; then + file=$dir/check_file.$(short_hostname $node) + if [[ ! -f "$file" ]]; then # Logdir not accessible/writable from this node. return 1 fi + rm -f $file || return 1 done - rm -f $dir/node.*.yml return 0 } -- 1.8.3.1
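
The sub-test convention introduced above wraps each failure scenario in a test_*() function registered with run_test(), so that test-framework.sh can record per-test results and logs for upload to Maloo. Below is a minimal, hypothetical sketch of a new script following that layout; the test name and the single failover it performs are illustrative only and not part of this patch, and it assumes the helpers this patch adds to test-framework.sh (start_client_loads, stop_client_loads, print_end_run_file).

    #!/bin/bash
    # Hypothetical sketch -- not part of this patch.  It shows the sub-test
    # layout the recovery-*-scale scripts adopt here, using only functions
    # that already exist in (or are added to) test-framework.sh.
    set -e

    ONLY=${ONLY:-"$*"}

    LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
    . $LUSTRE/tests/test-framework.sh
    init_test_env $@
    . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
    init_logging

    END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
    LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}

    build_test_filter
    check_and_setup_lustre

    # Insulate the test node: unmount it and drop it from the load clients.
    zconf_umount $HOSTNAME $MOUNT
    NODES_TO_USE=$(exclude_items_from_list ${NODES_TO_USE:-$CLIENTS} $HOSTNAME)
    check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}

    MDTS=$(get_facets MDS)

    test_single_mds_failover() {
        # Start the client loads, fail over one random MDS, then verify
        # that the loads survived recovery.
        rm -f $END_RUN_FILE
        start_client_loads $NODES_TO_USE

        local serverfacet=$(get_random_entry $MDTS)
        facet_failover $serverfacet || exit 1
        wait_recovery_complete $serverfacet || exit 7

        if ! check_client_loads $NODES_TO_USE; then
            print_end_run_file $END_RUN_FILE
            stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
            exit 5
        fi
        stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
    }
    run_test single_mds_failover "single MDS failover with client loads (example)"

    zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
    client_up || error "start client on $HOSTNAME failed"

    complete $(basename $0) $SECONDS
    check_and_cleanup_lustre
    exit_status

Because the scenario is registered through build_test_filter/run_test, running the script with ONLY=single_mds_failover would execute just that sub-test, which is the per-test filtering and result gathering this patch enables for the recovery-*-scale suites.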