#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
# All pairwise combinations of node failures.
# Was cmd3-17
# Script fails pair of nodes:
# -- in parallel by default
# -- in series if SERIAL is set
+set -e
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-SETUP=${SETUP:-""}
-CLEANUP=${CLEANUP:-""}
-. $LUSTRE/tests/test-framework.sh
+ONLY=${ONLY:-"$*"}
-init_test_env $@
+# bug number for skipped test:
+ALWAYS_EXCEPT="$RECOVERY_DOUBLE_SCALE_EXCEPT"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
-DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log
-
-exec 2>$DEBUGLOG
-echo "--- env ---" >&2
-env >&2
-echo "--- env ---" >&2
-set -x
+remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0
-[ "$SHARED_DIRECTORY" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; }
+[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] &&
+ skip_env "need three or more clients" && exit 0
-check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
-[ -n "$CLIENTS" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; }
-
-[ $CLIENTCOUNT -ge 3 ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; }
-
-END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
-LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
-remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
-
-check_timeout || exit 1
+if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then
+ skip_env "SHARED_DIRECTORY should be specified with a shared directory \
+which is accessible on all of the nodes"
+ exit 0
+fi
[[ $FAILURE_MODE = SOFT ]] && \
log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797"
-build_test_filter
-
-check_and_setup_lustre
-rm -rf $DIR/[df][0-9]*
-
-# the test node needs to be insulated from a lustre failure as much as possible,
-# so not even loading the lustre modules is ideal.
-# -- umount lustre
-# -- remove hostname from clients list
-zconf_umount $(hostname) $MOUNT
-NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
+# Set SERIAL to serialize the failure through a recovery of the first failure.
+SERIAL=${SERIAL:-""}
+ERRORS_OK="yes"
-check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+[ "$SERIAL" ] && ERRORS_OK=""
-MDTS=$(get_facets MDS)
-OSTS=$(get_facets OST)
+FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60 * 5))} # 5 minutes
-rm -f $END_RUN_FILE
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
reboot_recover_node () {
# item var contains a pair of clients if nodetype=clients
# I would prefer to have a list here
local item=$1
- local nodetype=$2
- local timeout=$($LCTL get_param -n timeout)
+ local nodetype=$2
+ local c
# MDS, OST item contains the facet
case $nodetype in
- MDS|OST ) facet_failover $item
- [ "$SERIAL" ] && wait_recovery_complete $item || true
- ;;
- clients) for c in ${item//,/ }; do
- # make sure the client loads die
- do_nodes $c "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
- shutdown_client $c
- boot_node $c
- echo "Reintegrating $c"
- # one client fails; need dk logs from this client only
- zconf_mount $c $MOUNT || NODES="$c $(mdts_nodes) $(osts_nodes)" error_exit "zconf_mount failed"
- done
- start_client_loads $item
- ;;
- # script failure:
- # don't use error (), the logs from all nodes not needed
- * ) echo "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'."
- exit 1;;
+ MDS|OST ) facet_failover $item
+ [ "$SERIAL" ] && wait_recovery_complete $item || true
+ ;;
+ clients) for c in ${item//,/ }; do
+ # make sure the client loads die
+ stop_process $c $LOAD_PID_FILE
+ shutdown_client $c
+ boot_node $c
+ echo "Reintegrating $c"
+ zconf_mount $c $MOUNT ||
+ error "mount $MOUNT on $c failed"
+ client_up $c || error "start client on $c failed"
+ done
+ start_client_loads $item
+ ;;
+ * ) echo "ERROR: invalid nodetype=$nodetype." \
+ "Must be one of 'MDS', 'OST', or 'clients'."
+ exit 1;;
esac
}
case $type in
MDS ) list=$MDTS;;
OST ) list=$OSTS;;
- clients) list=$NODES_TO_USE
- ;;
- # script failure:
- # don't use error (), the logs from all nodes not needed
- * ) echo "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'."
+ clients) list=$NODES_TO_USE;;
+ * ) echo "ERROR: invalid type=$type." \
+ "Must be one of 'MDS', 'OST', or 'clients'."
exit 1;;
esac
return
fi
- item=$(get_random_entry $list)
- if [ "$type" = clients ] ; then
+ local item=$(get_random_entry $list)
+ if [ "$type" = "clients" ]; then
item="$item $(get_random_entry $(exclude_items_from_list $list $item))"
item=$(comma_list $item)
fi
local client2=
log "
-==== START === $title "
+==== START === $title"
item1=$(get_item_type $type1)
[ "$item1" ] || \
{ echo "type1=$type1 item1 is empty" && return 0; }
item2=$(get_item_type $type2 $item1)
[ "$item2" ] || \
- { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" && return 0; }
+ { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" \
+ && return 0; }
# Check that our client loads are still running. If any have died,
# that means they have died outside of recovery, which is unacceptable.
log "==== Checking the clients loads BEFORE failover -- failure NOT OK"
-
# FIXME. need print summary on exit
- if ! check_client_loads $NODES_TO_USE; then
- exit 4
- fi
+ check_client_loads $NODES_TO_USE || exit $?
log "Done checking client loads. Failing type1=$type1 item1=$item1 ... "
+ reboot_recover_node $item1 $type1 || exit $?
- reboot_recover_node $item1 $type1
-
- # Hendrix test17 description:
+ # Hendrix test17 description:
# Introduce a failure, wait at
# least 5 minutes (for recovery),
# introduce a 2nd
# We have a "double failures" if SERIAL is not set,
# do not need a sleep between failures for "double failures"
- log " Failing type2=$type2 item2=$item2 ... "
- reboot_recover_node $item2 $type2
+ log " Failing type2=$type2 item2=$item2 ... "
+ reboot_recover_node $item2 $type2 || exit $?
# Client loads are allowed to die while in recovery, so we just
# restart them.
- log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK"
- restart_client_loads $NODES_TO_USE $ERRORS_OK || return $?
- log "Done checking / re-Starting client loads. PASS"
+ log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK"
+ restart_client_loads $NODES_TO_USE $ERRORS_OK || exit $?
+ log "Done checking / re-starting client loads. PASS"
return 0
}
summary_and_cleanup () {
local rc=$?
- local var
trap 0
+ CURRENT_TS=$(date +%s)
+ ELAPSED=$((CURRENT_TS - START_TS))
+
# Having not empty END_RUN_FILE means the failed loads only
if [ -s $END_RUN_FILE ]; then
- echo "Found the END_RUN_FILE file: $END_RUN_FILE"
- cat $END_RUN_FILE
- local END_RUN_NODE=
- read END_RUN_NODE < $END_RUN_FILE
-
- # A client load will stop if it found the END_RUN_FILE file.
- # That does not mean the client load actually failed though.
- # The first node in END_RUN_FILE is the one we are interested in.
- if [ -n "$END_RUN_NODE" ]; then
- var=$(node_var_name $END_RUN_NODE)_load
- echo "Client load failed on node $END_RUN_NODE"
- echo
- echo "Client $END_RUN_NODE load stdout and debug files:
- $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log
- $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log"
- fi
+ print_end_run_file $END_RUN_FILE
rc=1
fi
Exited after: $ELAPSED seconds
Status: $result: rc=$rc"
- # make sure the client loads die
- do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
-
- # and free up the pdshes that started them, if any are still around
- if [ -n "$CLIENT_LOAD_PIDS" ]; then
- kill $CLIENT_LOAD_PIDS || true
- sleep 5
- kill -9 $CLIENT_LOAD_PIDS || true
- fi
+ # stop the client loads
+ stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
if [ $rc -ne 0 ]; then
# we are interested in only on failed clients and servers
local failedclients=$(cat $END_RUN_FILE | grep -v $0)
# FIXME: need ostfailover-s nodes also for FLAVOR=OST
- local product=$(gather_logs $(comma_list $(osts_nodes) \
- $(mdts_nodes) $mdsfailover_HOST $failedclients) 1)
- echo $product
+ gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \
+ $mdsfailover_HOST $failedclients)
fi
- [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
exit $rc
}
-trap summary_and_cleanup EXIT TERM INT
+################################## Main Flow ###################################
+build_test_filter
-#
-# MAIN
-#
-log "-----============= $0 starting =============-----"
+check_and_setup_lustre
+rm -rf $DIR/[Rdfs][0-9]*
-START_TS=$(date +%s)
-CURRENT_TS=$START_TS
-ELAPSED=0
+check_timeout || exit 1
-# Set SERIAL to serialize the failure through a recovery of the first failure.
-SERIAL=${SERIAL:-""}
-ERRORS_OK="yes"
+# The test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $HOSTNAME $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
-[ "$SERIAL" ] && ERRORS_OK=""
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
-# Start client loads.
-start_client_loads $NODES_TO_USE
+ELAPSED=0
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
-echo clients load pids:
-if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then
- exit 3
-fi
+# Every pairwise combination of client failures (2 clients),
+# MDS failure, and OST failure will be tested.
+test_pairwise_fail() {
+ trap summary_and_cleanup EXIT TERM INT
-# FIXME: Do we want to have an initial sleep period where the clients
-# just run before introducing a failure?
-sleep $FAILOVER_PERIOD
+ # Start client loads.
+ rm -f $END_RUN_FILE
+ start_client_loads $NODES_TO_USE
-#CMD_TEST_NUM=17.1
-failover_pair MDS OST "test 1: failover MDS, then OST =========="
-sleep $FAILOVER_PERIOD
+ echo clients load pids:
+ do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
-#CMD_TEST_NUM=17.2
-failover_pair MDS clients "test 2: failover MDS, then 2 clients ===="
-sleep $FAILOVER_PERIOD
+ # FIXME: Do we want to have an initial sleep period where the clients
+ # just run before introducing a failure?
+ sleep $FAILOVER_PERIOD
-#CMD_TEST_NUM=17.3
-if [ $MDSCOUNT -gt 1 ]; then
- failover_pair MDS MDS "test 3: failover MDS, then another MDS =="
+ # CMD_TEST_NUM=17.1
+ failover_pair MDS OST "test 1: failover MDS, then OST =========="
sleep $FAILOVER_PERIOD
-else
- skip "$0 : $MDSCOUNT < 2 MDTs, test 3 skipped"
-fi
-#CMD_TEST_NUM=17.4
-if [ $OSTCOUNT -gt 1 ]; then
- failover_pair OST OST "test 4: failover OST, then another OST =="
+ # CMD_TEST_NUM=17.2
+ failover_pair MDS clients "test 2: failover MDS, then 2 clients ===="
sleep $FAILOVER_PERIOD
-else
- skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped"
-fi
-#CMD_TEST_NUM=17.5
-failover_pair OST clients "test 5: failover OST, then 2 clients ===="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.3
+ if [ $MDSCOUNT -gt 1 ]; then
+ failover_pair MDS MDS "test 3: failover MDS, then another MDS =="
+ sleep $FAILOVER_PERIOD
+ else
+ skip_env "has less than 2 MDTs, test 3 skipped"
+ fi
-#CMD_TEST_NUM=17.6
-failover_pair OST MDS "test 6: failover OST, then MDS =========="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.4
+ if [ $OSTCOUNT -gt 1 ]; then
+ failover_pair OST OST "test 4: failover OST, then another OST =="
+ sleep $FAILOVER_PERIOD
+ else
+ skip_env "has less than 2 OSTs, test 4 skipped"
+ fi
-#CMD_TEST_NUM=17.7
-failover_pair clients MDS "test 7: failover 2 clients, then MDS ===="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.5
+ failover_pair OST clients "test 5: failover OST, then 2 clients ===="
+ sleep $FAILOVER_PERIOD
-#CMD_TEST_NUM=17.8
-#failover_pair clients OST "test 8: failover 2 clients, then OST ===="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.6
+ failover_pair OST MDS "test 6: failover OST, then MDS =========="
+ sleep $FAILOVER_PERIOD
-#CMD_TEST_NUM=17.9
-if [ $CLIENTCOUNT -ge 5 ]; then
- failover_pair clients clients "test 9: failover 2 clients, then 2 different clients =="
+ # CMD_TEST_NUM=17.7
+ failover_pair clients MDS "test 7: failover 2 clients, then MDS ===="
sleep $FAILOVER_PERIOD
-fi
-log "==== Checking the clients loads AFTER all failovers -- failure NOT OK"
-if ! check_client_loads $NODES_TO_USE; then
- log "Client load failed after failover. Exiting"
- exit 5
-fi
-CURRENT_TS=$(date +%s)
-ELAPSED=$((CURRENT_TS - START_TS))
+ # CMD_TEST_NUM=17.8
+ failover_pair clients OST "test 8: failover 2 clients, then OST ===="
+ sleep $FAILOVER_PERIOD
+
+ # CMD_TEST_NUM=17.9
+ if [ $CLIENTCOUNT -gt 4 ]; then
+ failover_pair clients clients \
+ "test 9: failover 2 clients, then 2 different clients =="
+ sleep $FAILOVER_PERIOD
+ else
+ skip_env "has less than 5 Clients, test 9 skipped"
+ fi
+
+ log "==== Checking the clients loads AFTER all failovers -- failure NOT OK"
+ if ! check_client_loads $NODES_TO_USE; then
+ log "Client load failed after failover. Exiting..."
+ exit 5
+ fi
+
+ exit 0
+}
+run_test pairwise_fail "pairwise combination of clients, MDS, and OST failures"
-log "Completed successfully in $ELAPSED seconds"
+zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"
-exit 0
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
# Was Test 11 in cmd3.
# For duration of 24 hours repeatedly failover a random MDS at
# 10 minute intervals and verify that no application errors occur.
# Test runs one of CLIENT_LOAD progs on remote clients.
+set -e
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-SETUP=${SETUP:-""}
-CLEANUP=${CLEANUP:-""}
-. $LUSTRE/tests/test-framework.sh
+ONLY=${ONLY:-"$*"}
-init_test_env $@
+# bug number for skipped test:
+ALWAYS_EXCEPT="$RECOVERY_MDS_SCALE_EXCEPT"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
-DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log
-
-exec 2>$DEBUGLOG
-echo "--- env ---" >&2
-env >&2
-echo "--- env ---" >&2
-set -x
+remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0
-[ "$SHARED_DIRECTORY" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; }
+[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] &&
+ skip_env "need three or more clients" && exit 0
-check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
-[ -n "$CLIENTS" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; }
-
-[ $CLIENTCOUNT -ge 3 ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; }
-
-END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
-LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
-VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
-remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
-
-build_test_filter
-
-check_and_setup_lustre
-rm -rf $DIR/[df][0-9]*
-
-max_recov_time=$(max_recovery_time)
-
-# the test node needs to be insulated from a lustre failure as much as possible,
-# so not even loading the lustre modules is ideal.
-# -- umount lustre
-# -- remove hostname from clients list
-zconf_umount $(hostname) $MOUNT
-NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
-
-check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-
-MDTS=$(get_facets MDS)
-OSTS=$(get_facets OST)
+if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then
+ skip_env "SHARED_DIRECTORY should be specified with a shared directory \
+which is accessible on all of the nodes"
+ exit 0
+fi
ERRORS_OK="" # No application failures should occur during this test.
-FLAVOR=${FLAVOR:-"MDS"}
-if [ "$FLAVOR" == "MDS" ]; then
- SERVERS=$MDTS
-else
- SERVERS=$OSTS
-fi
-
if [ "$SLOW" = "no" ]; then
DURATION=${DURATION:-$((60 * 30))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))}
else
DURATION=${DURATION:-$((60 * 60 * 24))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
fi
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
-rm -f $END_RUN_FILE
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
+REQFAIL=${REQFAIL:-$((DURATION / SERVER_FAILOVER_PERIOD *
+ REQFAIL_PERCENT / 100))}
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
server_numfailovers () {
local facet=$1
summary_and_cleanup () {
local rc=$?
- local var
trap 0
# Having not empty END_RUN_FILE means the failed loads only
if [ -s $END_RUN_FILE ]; then
- echo "Found the END_RUN_FILE file: $END_RUN_FILE"
- cat $END_RUN_FILE
- local END_RUN_NODE=
- read END_RUN_NODE < $END_RUN_FILE
-
- # A client load will stop if it found the END_RUN_FILE file.
- # That does not mean the client load actually failed though.
- # The first node in END_RUN_FILE is the one we are interested in.
- if [ -n "$END_RUN_NODE" ]; then
- var=$(node_var_name $END_RUN_NODE)_load
- echo "Client load failed on node $END_RUN_NODE"
- echo
- echo "Client $END_RUN_NODE load stdout and debug files:
- $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log
- $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log"
- fi
+ print_end_run_file $END_RUN_FILE
rc=1
fi
local result=PASS
[ $rc -eq 0 ] || result=FAIL
- log "Duration: $DURATION
+ log "Duration: $DURATION
Server failover period: $SERVER_FAILOVER_PERIOD seconds
Exited after: $ELAPSED seconds
Number of failovers before exit:
$(servers_numfailovers)
Status: $result: rc=$rc"
- # stop the vmstats on the OSTs
- if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) "test -f $VMSTAT_PID_FILE &&
- { kill -s TERM \\\$(cat $VMSTAT_PID_FILE);
- rm -f $VMSTAT_PID_FILE || true; }"
- fi
-
- # make sure the client loads die
- do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
+ # stop vmstat on OSS nodes
+ [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
- # and free up the pdshes that started them, if any are still around
- if [ -n "$CLIENT_LOAD_PIDS" ]; then
- kill $CLIENT_LOAD_PIDS || true
- sleep 5
- kill -9 $CLIENT_LOAD_PIDS || true
- fi
+ # stop the client loads
+ stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
if [ $rc -ne 0 ]; then
# we are interested in only on failed clients and servers
local failedclients=$(cat $END_RUN_FILE | grep -v $0)
# FIXME: need ostfailover-s nodes also for FLAVOR=OST
- local product=$(gather_logs $(comma_list $(osts_nodes) \
- $(mdts_nodes) $mdsfailover_HOST $failedclients) 1)
- echo $product
+ gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \
+ $mdsfailover_HOST $failedclients)
fi
- [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
-
exit $rc
}
-#
-# MAIN
-#
-log "-----============= $0 starting =============-----"
+failover_target() {
+ local flavor=${1:-"MDS"}
+ local servers
+ local serverfacet
+ local var
-trap summary_and_cleanup EXIT INT
+ [ "$flavor" = "MDS" ] && servers=$MDTS || servers=$OSTS
-ELAPSED=0
+ trap summary_and_cleanup EXIT INT
-# vmstat the osts
-if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) \
- "vmstat 1 > $TESTLOG_PREFIX.vmstat.\\\$(hostname -s).log \
- 2>/dev/null </dev/null & echo \\\$! > $VMSTAT_PID_FILE"
-fi
+ # start vmstat on OSS nodes
+ [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
-# Start client loads.
-start_client_loads $NODES_TO_USE
+ # start client loads
+ rm -f $END_RUN_FILE
+ start_client_loads $NODES_TO_USE
-echo clients load pids:
-if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then
- exit 3
-fi
+ echo client loads pids:
+ do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
-MINSLEEP=${MINSLEEP:-120}
-REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
-REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
-reqfail=0
-sleep=0
+ ELAPSED=0
+ local sleep=0
+ local reqfail=0
+ local it_time_start
+ local start_ts=$(date +%s)
+ local current_ts=$start_ts
+
+ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # In order to perform the
+ # expected number of failovers, we need to account for the following:
+ # 1) the time that has elapsed during the client load checking
+ # 2) the time it takes for failover
+ it_time_start=$(date +%s)
-START_TS=$(date +%s)
-CURRENT_TS=$START_TS
+ serverfacet=$(get_random_entry $servers)
+ var=${serverfacet}_numfailovers
-while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # Check that our client loads are still running. If any have died,
+ # that means they have died outside of recovery, which is unacceptable.
+ log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
+ ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ check_client_loads $NODES_TO_USE || exit 4
- # In order to perform the
- # expected number of failovers, we need to account the following :
- # 1) the time that has elapsed during the client load checking
- # 2) time takes for failover
+ log "Wait $serverfacet recovery complete before doing next failover..."
+ if ! wait_recovery_complete $serverfacet; then
+ echo "$serverfacet recovery is not completed!"
+ exit 7
+ fi
- it_time_start=$(date +%s)
+ log "Checking clients are in FULL state before doing next failover..."
+ if ! wait_clients_import_state $NODES_TO_USE $serverfacet FULL; then
+ echo "Clients import not FULL, please consider increasing \
+SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD!"
+ fi
- SERVERFACET=$(get_random_entry $SERVERS)
- var=${SERVERFACET}_numfailovers
+ log "Starting failover on $serverfacet"
+ facet_failover "$serverfacet" || exit 1
- # Check that our client loads are still running. If any have died,
- # that means they have died outside of recovery, which is unacceptable.
+ # Check that our client loads are still running during failover.
+ # No application failures should occur.
+ log "==== Checking the clients loads AFTER failover -- failure NOT OK"
+ if ! check_client_loads $NODES_TO_USE; then
+ log "Client load failed during failover. Exiting..."
+ exit 5
+ fi
- log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
- ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ # Increment the number of failovers.
+ val=$((${!var} + 1))
+ eval $var=$val
- if ! check_client_loads $NODES_TO_USE; then
- exit 4
- fi
+ current_ts=$(date +%s)
+ ELAPSED=$((current_ts - start_ts))
- log "Wait $SERVERFACET recovery complete before doing next failover ...."
+ sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
- if ! wait_recovery_complete $SERVERFACET ; then
- echo "$SERVERFACET recovery is not completed!"
- exit 7
- fi
+ # Keep counting the number of iterations in which the time
+ # spent on failover and the two client load checks exceeded
+ # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
+ if [ $sleep -lt $MINSLEEP ]; then
+ reqfail=$((reqfail + 1))
+ log "WARNING: failover and two check_client_loads time exceeded \
+SERVER_FAILOVER_PERIOD - MINSLEEP!
+Failed to load the filesystem with I/O for a minimum period of \
+$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
+This iteration, the load was only applied for sleep=$sleep seconds.
+Estimated max recovery time: $MAX_RECOV_TIME
+Probably the hardware is taking an excessively long time to boot.
+Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \
+bug 20918"
+ [ $reqfail -gt $REQFAIL ] && exit 6
+ fi
- log "Checking clients are in FULL state before doing next failover"
- if ! wait_clients_import_state $NODES_TO_USE $SERVERFACET FULL; then
- echo "Clients import not FULL, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD !"
+ log "$serverfacet has failed over ${!var} times, and counting..."
- fi
- log "Starting failover on $SERVERFACET"
+ [ $((ELAPSED + sleep)) -ge $DURATION ] && break
- facet_failover "$SERVERFACET" || exit 1
+ if [ $sleep -gt 0 ]; then
+ echo "sleeping $sleep seconds... "
+ sleep $sleep
+ fi
+ done
+ exit 0
+}
- # Check that our client loads are still running during failover.
- # No application failures should occur.
+################################## Main Flow ###################################
+build_test_filter
- log "==== Checking the clients loads AFTER failover -- failure NOT OK"
- if ! check_client_loads $NODES_TO_USE; then
- log "Client load failed during failover. Exiting"
- exit 5
- fi
+check_and_setup_lustre
+rm -rf $DIR/[Rdfs][0-9]*
- # Increment the number of failovers
- val=$((${!var} + 1))
- eval $var=$val
+MAX_RECOV_TIME=$(max_recovery_time)
- CURRENT_TS=$(date +%s)
- ELAPSED=$((CURRENT_TS - START_TS))
+# The test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $HOSTNAME $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
- sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
- # keep count the number of itterations when
- # time spend to failover and two client loads check exceeded
- # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP )
- if [ $sleep -lt $MINSLEEP ]; then
- reqfail=$((reqfail +1))
- log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
-Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
-This iteration, the load was only applied for sleep=$sleep seconds.
-Estimated max recovery time : $max_recov_time
-Probably the hardware is taking excessively long to boot.
-Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918"
- [ $reqfail -gt $REQFAIL ] && exit 6
- fi
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
- log "$SERVERFACET has failed over ${!var} times, and counting..."
+test_failover_mds() {
+ # failover a random MDS
+ failover_target MDS
+}
+run_test failover_mds "failover MDS"
- if [ $((ELAPSED + sleep)) -ge $DURATION ]; then
- break
- fi
+test_failover_ost() {
+ # failover a random OST
+ failover_target OST
+}
+run_test failover_ost "failover OST"
- if [ $sleep -gt 0 ]; then
- echo "sleeping $sleep seconds ... "
- sleep $sleep
- fi
-done
+zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"
-exit 0
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
# client failure does not affect other clients
# 10 minute intervals and verify that no application errors occur.
# Test runs one of CLIENT_LOAD progs on remote clients.
+set -e
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-SETUP=${SETUP:-""}
-CLEANUP=${CLEANUP:-""}
-. $LUSTRE/tests/test-framework.sh
+ONLY=${ONLY:-"$*"}
-init_test_env $@
+# bug number for skipped test:
+ALWAYS_EXCEPT="$RECOVERY_RANDOM_SCALE_EXCEPT"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
-DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log
-
-exec 2>$DEBUGLOG
-echo "--- env ---" >&2
-env >&2
-echo "--- env ---" >&2
-set -x
+remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0
-[ "$SHARED_DIRECTORY" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; }
+[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] &&
+ skip_env "need three or more clients" && exit 0
-check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
-[ -n "$CLIENTS" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; }
-
-[ $CLIENTCOUNT -ge 3 ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; }
-
-END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
-LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
-VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then
+ skip_env "SHARED_DIRECTORY should be specified with a shared directory \
+which is accessible on all of the nodes"
+ exit 0
+fi
[[ $FAILURE_MODE = SOFT ]] && \
log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797"
-build_test_filter
-
-check_and_setup_lustre
-rm -rf $DIR/[df][0-9]*
-
-max_recov_time=$(max_recovery_time)
-
-# the test node needs to be insulated from a lustre failure as much as possible,
-# so not even loading the lustre modules is ideal.
-# -- umount lustre
-# -- remove hostname from clients list
-zconf_umount $(hostname) $MOUNT
-NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
-
-check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-
-MDTS=$(get_facets MDS)
+# Application failures are allowed for the failed client
+# but not for other clients.
+ERRORS_OK="yes"
if [ "$SLOW" = "no" ]; then
DURATION=${DURATION:-$((60 * 30))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))}
else
DURATION=${DURATION:-$((60 * 60 * 24))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
fi
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
+
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
+REQFAIL=${REQFAIL:-$((DURATION / SERVER_FAILOVER_PERIOD *
+ REQFAIL_PERCENT / 100))}
-rm -f $END_RUN_FILE
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
numfailovers () {
local facet
local var
- for facet in $MDTS ${failed_clients//,/ }; do
+ for facet in $MDTS ${FAILED_CLIENTS//,/ }; do
var=${facet}_nums
val=${!var}
if [ "$val" ] ; then
- echo "$facet failed over $val times"
+ echo "$facet failed over $val times"
fi
done
}
summary_and_cleanup () {
local rc=$?
- local var
trap 0
# Having not empty END_RUN_FILE means the failed loads only
if [ -s $END_RUN_FILE ]; then
- echo "Found the END_RUN_FILE file: $END_RUN_FILE"
- cat $END_RUN_FILE
- local END_RUN_NODE=
- read END_RUN_NODE < $END_RUN_FILE
-
- # A client load will stop if it found the END_RUN_FILE file.
- # That does not mean the client load actually failed though.
- # The first node in END_RUN_FILE is the one we are interested in.
- if [ -n "$END_RUN_NODE" ]; then
- var=$(node_var_name $END_RUN_NODE)_load
- echo "Client load failed on node $END_RUN_NODE"
- echo
- echo "Client $END_RUN_NODE load stdout and debug files:
- $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log
- $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log"
- fi
+ print_end_run_file $END_RUN_FILE
rc=1
fi
local result=PASS
[ $rc -eq 0 ] || result=FAIL
- log "Duration: $DURATION
+ log "Duration: $DURATION
Server failover period: $SERVER_FAILOVER_PERIOD seconds
Exited after: $ELAPSED seconds
Number of failovers before exit:
$(numfailovers)
Status: $result: rc=$rc"
- # stop the vmstats on the OSTs
- if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) "test -f $VMSTAT_PID_FILE &&
- { kill -s TERM \\\$(cat $VMSTAT_PID_FILE);
- rm -f $VMSTAT_PID_FILE || true; }"
- fi
+ # stop vmstat on OSS nodes
+ [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
- # make sure the client loads die
- do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
-
- # and free up the pdshes that started them, if any are still around
- if [ -n "$CLIENT_LOAD_PIDS" ]; then
- kill $CLIENT_LOAD_PIDS || true
- sleep 5
- kill -9 $CLIENT_LOAD_PIDS || true
- fi
+ # stop the client loads
+ stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
if [ $rc -ne 0 ]; then
# we are interested in only on failed clients and servers
local failedclients=$(cat $END_RUN_FILE | grep -v $0)
# FIXME: need ostfailover-s nodes also for FLAVOR=OST
- local product=$(gather_logs $(comma_list $(osts_nodes) \
- $(mdts_nodes) $mdsfailover_HOST $failedclients) 1)
- echo $product
+ gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \
+ $mdsfailover_HOST $failedclients)
fi
- [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
-
exit $rc
}
-#
-# MAIN
-#
-log "-----============= $0 starting =============-----"
+################################## Main Flow ###################################
+build_test_filter
-trap summary_and_cleanup EXIT # INT
+check_and_setup_lustre
+rm -rf $DIR/[Rdfs][0-9]*
-ELAPSED=0
+MAX_RECOV_TIME=$(max_recovery_time)
-# vmstat the osts
-if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) \
- "vmstat 1 > $TESTLOG_PREFIX.vmstat.\\\$(hostname -s).log \
- 2>/dev/null </dev/null & echo \\\$! > $VMSTAT_PID_FILE"
-fi
+# The test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $HOSTNAME $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
-# Start client loads.
-start_client_loads $NODES_TO_USE
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-echo clients load pids:
-if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then
- exit 3
-fi
+MDTS=$(get_facets MDS)
-START_TS=$(date +%s)
-CURRENT_TS=$START_TS
+# Fail a random client and then failover a random MDS.
+test_fail_client_mds() {
+ local fail_client
+ local serverfacet
+ local client_var
+ local var
-MINSLEEP=${MINSLEEP:-120}
-REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
-REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
-reqfail=0
-sleep=0
+ trap summary_and_cleanup EXIT INT
-# This is used for FAIL_CLIENT only
-ERRORS_OK="yes"
-while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # start vmstat on OSS nodes
+ [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
- # In order to perform the
- # expected number of failovers, we need to account the following :
- # 1) the time that has elapsed during the client load checking
- # 2) time takes for failover
+ # start client loads
+ rm -f $END_RUN_FILE
+ start_client_loads $NODES_TO_USE
- it_time_start=$(date +%s)
-
- FAIL_CLIENT=$(get_random_entry $NODES_TO_USE)
- client_var=$(node_var_name $FAIL_CLIENT)_nums
+ echo client loads pids:
+ do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
- # store the list of failed clients
- # lists are comma separated
- failed_clients=$(expand_list $failed_clients $FAIL_CLIENT)
+ ELAPSED=0
+ local sleep=0
+ local reqfail=0
+ local it_time_start
+ local start_ts=$(date +%s)
+ local current_ts=$start_ts
- SERVERFACET=$(get_random_entry $MDTS)
- var=${SERVERFACET}_nums
+ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # In order to perform the
+        # expected number of failovers, we need to account for the following:
+ # 1) the time that has elapsed during the client load checking
+ # 2) time takes for failover
+ it_time_start=$(date +%s)
- # Check that our client loads are still running. If any have died,
- # that means they have died outside of recovery, which is unacceptable.
+ fail_client=$(get_random_entry $NODES_TO_USE)
+ client_var=$(node_var_name $fail_client)_nums
- log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
- ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ # store the list of failed clients
+ # lists are comma separated
+ FAILED_CLIENTS=$(expand_list $FAILED_CLIENTS $fail_client)
- if ! check_client_loads $NODES_TO_USE; then
- exit 4
- fi
+ serverfacet=$(get_random_entry $MDTS)
+ var=${serverfacet}_nums
- log "FAIL CLIENT $FAIL_CLIENT ... "
- shutdown_client $FAIL_CLIENT
+ # Check that our client loads are still running. If any have died,
+ # that means they have died outside of recovery, which is unacceptable.
+ log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
+ ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ check_client_loads $NODES_TO_USE || exit 4
- log "Starting failover on $SERVERFACET"
+ log "FAIL CLIENT $fail_client..."
+ shutdown_client $fail_client
- facet_failover "$SERVERFACET" || exit 1
- if ! wait_recovery_complete $SERVERFACET ; then
- echo "$SERVERFACET recovery is not completed!"
- exit 7
- fi
+ log "Starting failover on $serverfacet"
+ facet_failover "$serverfacet" || exit 1
- boot_node $FAIL_CLIENT
- echo "Reintegrating $FAIL_CLIENT"
- zconf_mount $FAIL_CLIENT $MOUNT || exit $?
-
- # Increment the number of failovers
- val=$((${!var} + 1))
- eval $var=$val
- val=$((${!client_var} + 1))
- eval $client_var=$val
-
- # load script on failed clients could create END_RUN_FILE
- # We shuold remove it and ignore the failure if this
- # file contains the failed client only.
- # We can not use ERRORS_OK when start all loads at the start of this script
- # because the application errors allowed for random failed client only, but
- # not for all clients.
- if [ -e $END_RUN_FILE ]; then
- read END_RUN_NODE < $END_RUN_FILE
- [[ $END_RUN_NODE = $FAIL_CLIENT ]] &&
- rm -f $END_RUN_FILE || exit 13
- fi
+ if ! wait_recovery_complete $serverfacet; then
+ echo "$serverfacet recovery is not completed!"
+ exit 7
+ fi
- restart_client_loads $FAIL_CLIENT $ERRORS_OK || exit $?
+ boot_node $fail_client
+ echo "Reintegrating $fail_client"
+ zconf_mount $fail_client $MOUNT || exit $?
+ client_up $fail_client || exit $?
+
+ # Increment the number of failovers
+ val=$((${!var} + 1))
+ eval $var=$val
+ val=$((${!client_var} + 1))
+ eval $client_var=$val
+
+ # load script on failed clients could create END_RUN_FILE
+        # We should remove it and ignore the failure if this
+ # file contains the failed client only.
+        # We cannot use ERRORS_OK when starting all of the loads at the
+        # beginning of this script because application errors are allowed
+        # only for the randomly failed client, not for all clients.
+ if [ -e $END_RUN_FILE ]; then
+ local end_run_node
+ read end_run_node < $END_RUN_FILE
+ [[ $end_run_node = $fail_client ]] &&
+ rm -f $END_RUN_FILE || exit 13
+ fi
- # Check that not failed clients loads are still running.
- # No application failures should occur on clients that was not failed.
+ restart_client_loads $fail_client $ERRORS_OK || exit $?
- log "==== Checking the clients loads AFTER failed client reintegrated -- failure NOT OK"
- if ! ERRORS_OK= check_client_loads $(exclude_items_from_list $NODES_TO_USE $FAIL_CLIENT); then
- log "Client load failed. Exiting"
- exit 5
- fi
+ # Check that not failed clients loads are still running.
+ # No application failures should occur on clients that were not failed.
+ log "==== Checking the clients loads AFTER failed client reintegrated \
+-- failure NOT OK"
+ if ! ERRORS_OK= check_client_loads \
+ $(exclude_items_from_list $NODES_TO_USE $fail_client); then
+ log "Client load failed. Exiting..."
+ exit 5
+ fi
- CURRENT_TS=$(date +%s)
- ELAPSED=$((CURRENT_TS - START_TS))
- sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
-
- # keep count the number of itterations when
- # time spend to failover and two client loads check exceeded
- # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP )
- if [ $sleep -lt $MINSLEEP ]; then
- reqfail=$((reqfail +1))
- log "WARNING: failover, client reintegration and check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
-Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
+ current_ts=$(date +%s)
+ ELAPSED=$((current_ts - start_ts))
+ sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
+
+ # Keep counting the number of iterations when
+ # time spent to failover and two client loads check exceeded
+ # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
+ if [ $sleep -lt $MINSLEEP ]; then
+ reqfail=$((reqfail + 1))
+ log "WARNING: failover, client reintegration and \
+check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP!
+Failed to load the filesystem with I/O for a minimum period of \
+$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
This iteration, the load was only applied for sleep=$sleep seconds.
-Estimated max recovery time : $max_recov_time
-Probably the hardware is taking excessively long to boot.
-Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918"
- [ $reqfail -gt $REQFAIL ] && exit 6
- fi
+Estimated max recovery time : $MAX_RECOV_TIME
+Probably the hardware is taking excessively long time to boot.
+Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \
+bug 20918"
+ [ $reqfail -gt $REQFAIL ] && exit 6
+ fi
- log " Number of failovers:
+ log "Number of failovers:
$(numfailovers) and counting..."
- if [ $((ELAPSED + sleep)) -ge $DURATION ]; then
- break
- fi
+ [ $((ELAPSED + sleep)) -ge $DURATION ] && break
- if [ $sleep -gt 0 ]; then
- echo "sleeping $sleep seconds ... "
- sleep $sleep
- fi
-done
+ if [ $sleep -gt 0 ]; then
+ echo "sleeping $sleep seconds... "
+ sleep $sleep
+ fi
+ done
+ exit 0
+}
+run_test fail_client_mds "fail client, then failover MDS"
+
+zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"
-exit 0
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
load_pid=$!
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): IOR succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): IOR run finished"
+ echoerr "$(date +'%F %H:%M:%S'): IOR succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): IOR run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): IOR failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): IOR failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
+ echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): dbench failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): dbench failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
wait $load_pid
if [ $? -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): dd run finished"
+ echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): dd run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): dd failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): dd failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
load_pid=$!
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
- cd $TMP
- rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
if [ -d $TESTDIR ]; then
- echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR"
- echo $(hostname) >> $END_RUN_FILE
+ echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
CONTINUE=false
fi
- echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
+ echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): iozone failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): iozone failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
RC=0
fi
if [ $RC -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): tar run finished"
+ echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): tar run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): tar failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): tar failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
END_RUN_FILE=$END_RUN_FILE \
LOAD_PID_FILE=$LOAD_PID_FILE \
TESTLOG_PREFIX=$TESTLOG_PREFIX \
+ TESTNAME=$TESTNAME \
run_${load}.sh" &
local ppid=$!
log "Started client load: ${load} on $client"
sleep 2
}
-# only for remote client
+# only for remote client
check_client_load () {
local client=$1
local var=$(node_var_name $client)_load
local TESTLOAD=run_${!var}.sh
ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-
+
# bug 18914: try to connect several times not only when
# check ps, but while check_catastrophe also
local tries=3
if [ "$rc" != 0 ]; then
log "Client load failed to restart on node $client, rc=$rc"
# failure one client load means test fail
- # we do not need to check other
+ # we do not need to check other
return $rc
fi
else
fi
done
}
+
+# Start vmstat and save its process ID in a file.
+start_vmstat() {
+ local nodes=$1
+ local pid_file=$2
+
+ [ -z "$nodes" -o -z "$pid_file" ] && return 0
+
+ do_nodes $nodes \
+ "vmstat 1 > $TESTLOG_PREFIX.$TESTNAME.vmstat.\\\$(hostname -s).log \
+ 2>/dev/null </dev/null & echo \\\$! > $pid_file"
+}
+
+# Display the nodes on which client loads failed.
+print_end_run_file() {
+ local file=$1
+ local node
+
+ [ -s $file ] || return 0
+
+ echo "Found the END_RUN_FILE file: $file"
+ cat $file
+
+ # A client load will stop if it finds the END_RUN_FILE file.
+ # That does not mean the client load actually failed though.
+ # The first node in END_RUN_FILE is the one we are interested in.
+ read node < $file
+
+ if [ -n "$node" ]; then
+ local var=$(node_var_name $node)_load
+
+ local prefix=$TESTLOG_PREFIX
+ [ -n "$TESTNAME" ] && prefix=$prefix.$TESTNAME
+ local stdout_log=$prefix.run_${!var}_stdout.$node.log
+ local debug_log=$(echo $stdout_log | sed 's/\(.*\)stdout/\1debug/')
+
+ echo "Client load ${!var} failed on node $node:"
+ echo "$stdout_log"
+ echo "$debug_log"
+ fi
+}
+
+# Stop the process which had its PID saved in a file.
+stop_process() {
+ local nodes=$1
+ local pid_file=$2
+
+ [ -z "$nodes" -o -z "$pid_file" ] && return 0
+
+ do_nodes $nodes "test -f $pid_file &&
+ { kill -s TERM \\\$(cat $pid_file); rm -f $pid_file; }" || true
+}
+
+# Stop all client loads.
+stop_client_loads() {
+ local nodes=${1:-$CLIENTS}
+ local pid_file=$2
+
+ # stop the client loads
+ stop_process $nodes $pid_file
+
+ # clean up the processes that started them
+ [ -n "$CLIENT_LOAD_PIDS" ] && kill -9 $CLIENT_LOAD_PIDS 2>/dev/null || true
+}
# End recovery-scale functions
# verify that lustre actually cleaned up properly
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
mount_client $MOUNT
[ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT
+ clients_up
if [ "$MOUNT_2" ]; then
mount_client $MOUNT2
fi
if is_mounted $MOUNT; then
- [ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]*
+ [ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]* ||
+ error "remove sub-test dirs failed"
[ "$ENABLE_QUOTA" ] && restore_quota_type || true
fi
log " ${TESTSUITE} ${TESTNAME}: @@@@@@ ${TYPE}: $@ "
+ mkdir -p $LOGDIR
# We need to dump the logs on all nodes
if $dump; then
- gather_logs $(comma_list $(nodes_list)) 0
+ gather_logs $(comma_list $(nodes_list))
fi
debugrestore
gather_logs () {
local list=$1
- local tar_logs=$2
local ts=$(date +%s)
local docp=true
[ -f $LOGDIR/shared ] && docp=false
-
+
# dump lustre logs, dmesg
prefix="$TESTLOG_PREFIX.$TESTNAME"
dmesg > ${prefix}.dmesg.\\\$(hostname -s).${suffix}"
if [ ! -f $LOGDIR/shared ]; then
do_nodes $list rsync -az "${prefix}.*.${suffix}" $HOSTNAME:$LOGDIR
- fi
-
- if [ $tar_logs == 1 ]; then
- local archive=$LOGDIR/${TESTSUITE}-$ts.tar.bz2
- tar -jcf $archive $LOGDIR/*$ts* $LOGDIR/*${TESTSUITE}*
-
- echo $archive
fi
}