noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh replay-vbr.sh
noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause
noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
-noinst_SCRIPTS += run_dbench.sh
+noinst_SCRIPTS += run_dbench.sh recovery-double-scale.sh
nobase_noinst_SCRIPTS = cfg/local.sh
nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
nobase_noinst_SCRIPTS += racer/dir_create.sh racer/file_create.sh racer/file_list.sh
[ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\""
[ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484"
-export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE"
+export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE RECOVERY_DOUBLE_SCALE"
if [ "$ACC_SM_ONLY" ]; then
for O in $TESTSUITE_LIST; do
RECOVERY_MDS_SCALE="done"
fi
+[ "$RECOVERY_DOUBLE_SCALE" != "no" ] && skip_remmds recovery-double-scale && RECOVERY_DOUBLE_SCALE=no && MSKIPPED=1
+[ "$RECOVERY_DOUBLE_SCALE" != "no" ] && skip_remost recovery-double-scale && RECOVERY_DOUBLE_SCALE=no && OSKIPPED=1
+if [ "$RECOVERY_DOUBLE_SCALE" != "no" ]; then
+ title recovery-double-scale
+ bash recovery-double-scale.sh
+ RECOVERY_DOUBLE_SCALE="done"
+fi
+
RC=$?
title FINISHED
echo "Finished at `date` in $((`date +%s` - $STARTTIME))s"
echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
}
-shutdown_client() {
- client=$1
- if [ "$FAILURE_MODE" = HARD ]; then
- $POWER_DOWN $client
- while ping -w 3 -c 1 $client > /dev/null 2>&1; do
- echo "waiting for node $client to fail"
- sleep 1
- done
- elif [ "$FAILURE_MODE" = SOFT ]; then
- zconf_umount $client $MOUNT -f
- fi
-}
-
fail_clients() {
num=$1
--- /dev/null
+#!/bin/bash
+
+# All pairwise combinations of node failures.
+# Was cmd3-17
+#
+# Author: Chris Cooper <ccooper@clusterfs.com>
+#
+# The script fails pairs of nodes:
+# -- in parallel by default
+# -- in series if SERIAL is set
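+#
+# Illustrative usage (all names and values below are examples, not defaults);
+# run from a node that is not itself one of the test clients, e.g.:
+#   CLIENTS=client1,client2,client3 SHARED_DIRECTORY=/shared/dir \
+#       SERIAL=1 bash recovery-double-scale.sh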
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-double-scale}
+DEBUGLOG=$TESTSUITELOG.debug
+exec 2>$DEBUGLOG
+echo "--- env ---" >&2
+env >&2
+echo "--- env ---" >&2
+set -x
+
+[ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; }
+[ $CLIENTCOUNT -ge 3 ] || \
+ { skip "$0 Need two or more remote clients, have $CLIENTCOUNT" && exit 0; }
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+check_timeout || exit 1
+
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+# The test node needs to be insulated from Lustre failures as much as possible,
+# so ideally it should not even load the Lustre modules. At minimum:
+# -- umount lustre
+# -- remove the local hostname from the clients list
+zconf_umount $(hostname) $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
+
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
+
+rm -f $END_RUN_FILE
+
+reboot_recover_node () {
+ # item var contains a pair of clients if nodetype=clients
+ # I would prefer to have a list here
+ local item=$1
+ local nodetype=$2
+ local timeout=$($LCTL get_param -n timeout)
+
+ # MDS, OST item contains the facet
+ case $nodetype in
+ MDS|OST ) facet_failover $item
+ [ "$SERIAL" ] && wait_recovery_complete $item $((timeout * 4)) || true
+ ;;
+ clients) for c in ${item//,/ }; do
+ shutdown_client $c
+ boot_node $c
+ done
+             start_client_loads $item || return $?
+ ;;
+ * ) error "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'."
+ exit 1;;
+ esac
+}
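+
+# Illustrative calls of reboot_recover_node (facet and client names are
+# hypothetical):
+#   reboot_recover_node mds MDS                  # fail over the MDS facet
+#   reboot_recover_node ost2 OST                 # fail over one OST facet
+#   reboot_recover_node client2,client5 clients  # fail a client pair and restart its loads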
+
+get_item_type () {
+ local type=$1
+ local excluded=${2:-""}
+
+ local list
+ case $type in
+ MDS ) list=$MDTS;;
+ OST ) list=$OSTS;;
+ clients) list=$NODES_TO_USE
+ ;;
+ * ) error "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'."
+ exit 1;;
+ esac
+
+ [ "$excluded" ] && list=$(exclude_items_from_list $list $excluded)
+ # empty list
+ if [ ! "$(echo $list)" ]; then
+ echo
+ return
+ fi
+
+ item=$(get_random_entry $list)
+ if [ "$type" = clients ] ; then
+ item="$item $(get_random_entry $(exclude_items_from_list $list $item))"
+ item=$(comma_list $item)
+ fi
+ echo $item
+}
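+
+# Illustrative results of get_item_type (node names are hypothetical):
+#   get_item_type MDS             -> mds
+#   get_item_type OST             -> one random OST facet, e.g. ost3
+#   get_item_type clients         -> a random client pair, e.g. client2,client5
+#   get_item_type clients client2 -> a pair chosen with client2 excluded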
+
+# failover_pair
+#
+# For the two node types specified, choose a random node (or pair of client
+# nodes) from each class, fail them sequentially, and then restart Lustre
+# and the client loads on the affected nodes.
+failover_pair() {
+ local type1=$1
+ local type2=$2
+ local title=$3
+
+ local client_nodes=""
+ local item1=
+ local item2=
+ local client1=
+ local client2=
+
+ log "
+==== START === $title "
+
+ item1=$(get_item_type $type1)
+ [ "$item1" ] || \
+ { echo "type1=$type1 item1 is empty" && return 0; }
+ item2=$(get_item_type $type2 $item1)
+ [ "$item2" ] || \
+ { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" && return 0; }
+
+ # Check that our client loads are still running. If any have died,
+ # that means they have died outside of recovery, which is unacceptable.
+    log "==== Checking the client loads BEFORE failover -- failure NOT OK"
+
+ # FIXME. need print summary on exit
+ if ! check_client_loads $NODES_TO_USE; then
+ exit 4
+ fi
+
+ log "Done checking client loads. Failing type1=$type1 item1=$item1 ... "
+
+ reboot_recover_node $item1 $type1 || return $?
+
+ # Hendrix test17 description:
+ # Introduce a failure, wait at
+ # least 5 minutes (for recovery),
+ # introduce a 2nd
+ # failure, and wait another 5
+ # minutes
+
+    # reboot_recover_node waits for recovery according to the SERIAL setting.
+    # If SERIAL is not set we get a true "double failure", and no sleep is
+    # needed between the two failures.
+
+ log " Failing type2=$type2 item2=$item2 ... "
+ reboot_recover_node $item2 $type2 || return $?
+
+ # Client loads are allowed to die while in recovery, so we just
+ # restart them.
+    log "==== Checking the client loads AFTER failovers -- ERRORS_OK=$ERRORS_OK"
+    restart_client_loads $NODES_TO_USE $ERRORS_OK || return $?
+    log "Done checking / restarting client loads. PASS"
+ return 0
+}
+
+summary_and_cleanup () {
+ local rc=$?
+ trap 0
+
+    # A non-empty END_RUN_FILE means that some client load failed.
+ if [ -s $END_RUN_FILE ]; then
+ echo "Found the END_RUN_FILE file: $END_RUN_FILE"
+ cat $END_RUN_FILE
+ local END_RUN_NODE=
+ read END_RUN_NODE < $END_RUN_FILE
+
+        # A client load will end (i.e. fail) if it finds the end run file.
+        # That does not necessarily mean the load on that client failed,
+        # though; the first node recorded in END_RUN_FILE is the one we are
+        # really interested in.
+ if [ -n "$END_RUN_NODE" ]; then
+ var=${END_RUN_NODE}_load
+ echo "Client load failed on node $END_RUN_NODE"
+ echo
+ echo "client $END_RUN_NODE load debug output :"
+ local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug
+ do_node ${END_RUN_NODE} "set -x; [ -e $logfile ] && cat $logfile " || true
+ fi
+ rc=1
+ fi
+
+    echo $(date +'%F %H:%M:%S') Terminating client loads ...
+ echo "$0" >> $END_RUN_FILE
+ local result=PASS
+ [ $rc -eq 0 ] || result=FAIL
+
+ log "
+Server failover period: $FAILOVER_PERIOD seconds
+Exited after: $ELAPSED seconds
+Status: $result: rc=$rc"
+
+ # make sure the client loads die
+ do_nodes $NODES_TO_USE "set -x; test -f $TMP/client-load.pid && \
+ { kill -s TERM \$(cat $TMP/client-load.pid) || true; }"
+
+ # and free up the pdshes that started them, if any are still around
+ if [ -n "$CLIENT_LOAD_PIDS" ]; then
+ kill $CLIENT_LOAD_PIDS || true
+ sleep 5
+ kill -9 $CLIENT_LOAD_PIDS || true
+ fi
+ [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
+ exit $rc
+}
+
+trap summary_and_cleanup EXIT TERM INT
+
+#
+# MAIN
+#
+log "-----============= $0 starting =============-----"
+
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
+ELAPSED=0
+
+# Set SERIAL to serialize the failovers, i.e. wait for recovery of the first
+# failure before introducing the second one.
+SERIAL=${SERIAL:-""}
+ERRORS_OK="yes"
+
+[ "$SERIAL" ] && ERRORS_OK=""
+
+FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes
+
+# Start client loads.
+start_client_loads $NODES_TO_USE
+echo client load pids:
+if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $TMP/client-load.pid"; then
+ if [ -e $DEBUGLOG ]; then
+ exec 2<&-
+ cat $DEBUGLOG
+ exit 3
+ fi
+fi
+
+# FIXME: Do we want to have an initial sleep period where the clients
+# just run before introducing a failure?
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.1
+failover_pair MDS OST "test 1: failover MDS, then OST =========="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.2
+failover_pair MDS clients "test 2: failover MDS, then 2 clients ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.3
+# No test 3 for 1.8.x lustre version
+
+#CMD_TEST_NUM=17.4
+if [ $OSTCOUNT -gt 1 ]; then
+ failover_pair OST OST "test 4: failover OST, then another OST =="
+ sleep $FAILOVER_PERIOD
+else
+ skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped"
+fi
+
+#CMD_TEST_NUM=17.5
+failover_pair OST clients "test 5: failover OST, then 2 clients ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.6
+failover_pair OST MDS "test 6: failover OST, then MDS =========="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.7
+failover_pair clients MDS "test 7: failover 2 clients, then MDS ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.8
+#failover_pair clients OST "test 8: failover 2 clients, then OST ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.9
+if [ $CLIENTCOUNT -ge 5 ]; then
+ failover_pair clients clients "test 9: failover 2 clients, then 2 different clients =="
+ sleep $FAILOVER_PERIOD
+fi
+log "==== Checking the clients loads AFTER all failovers -- failure NOT OK"
+if ! check_client_loads $NODES_TO_USE; then
+ log "Client load failed after failover. Exiting"
+ exit 5
+fi
+
+CURRENT_TS=$(date +%s)
+ELAPSED=$((CURRENT_TS - START_TS))
+
+log "Completed successfully in $ELAPSED seconds"
+
+exit 0
# -- remove hostname from clients list
zconf_umount $(hostname) $MOUNT
NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_item_from_list $NODES_TO_USE $(hostname))
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-MDTS=mds
-
-OSTS=""
-for ((i=1; i<=$OSTCOUNT; i++)) do
- OSTS="$OSTS ost$i"
-done
-OSTS=$(comma_list $OSTS)
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
ERRORS_OK="" # No application failures should occur during this test.
FLAVOR=${FLAVOR:-"MDS"}
fail mds # start orphan recovery
df -P $DIR || df -P $DIR || true # reconnect
- wait_mds_recovery_done || error "MDS recovery not done"
+ wait_recovery_complete mds || error "MDS recovery not done"
AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
log "before $BEFOREUSED, after $AFTERUSED"
# recovery-mds-scale uses this to signal the client loads to die
echo $$ >$LOAD_PID_FILE
-TESTDIR=$MOUNT/dbench-$(hostname)
+TESTDIR=$MOUNT/d0.dbench-$(hostname)
CONTINUE=true
# recovery-mds-scale uses this to signal the client loads to die
echo $$ >$LOAD_PID_FILE
-TESTDIR=$MOUNT/dd-$(hostname)
+TESTDIR=$MOUNT/d0.dd-$(hostname)
CONTINUE=true
while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
# recovery-mds-scale uses this to signal the client loads to die
echo $$ >$LOAD_PID_FILE
-TESTDIR=$MOUNT/iozone-$(hostname)
+TESTDIR=$MOUNT/d0.iozone-$(hostname)
# needed to debug oom problem
#echo 1 > /proc/sys/vm/vm_gfp_debug
# recovery-mds-scale uses this to signal the client loads to die
echo $$ >$LOAD_PID_FILE
-TESTDIR=$MOUNT/tar-$(hostname)
+TESTDIR=$MOUNT/d0.tar-$(hostname)
CONTINUE=true
while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
fi"
}
+shutdown_node_hard () {
+ local host=$1
+ local attempts=3
+
+ for i in $(seq $attempts) ; do
+ $POWER_DOWN $host
+ sleep 1
+ ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0
+ echo "waiting for $host to fail attempts=$attempts"
+ [ $i -lt $attempts ] || \
+ { echo "$host still pingable after power down! attempts=$attempts" && return 1; }
+ done
+}
+
+shutdown_client() {
+ local client=$1
+ local mnt=${2:-$MOUNT}
+ local attempts=3
+
+ if [ "$FAILURE_MODE" = HARD ]; then
+        shutdown_node_hard $client
+ else
+ zconf_umount_clients $client $mnt -f
+ fi
+}
+
shutdown_facet() {
local facet=$1
if [ "$FAILURE_MODE" = HARD ]; then
- $POWER_DOWN `facet_active_host $facet`
- sleep 2
+        shutdown_node_hard $(facet_active_host $facet)
elif [ "$FAILURE_MODE" = SOFT ]; then
stop $facet
fi
}
start_client_load() {
- local list=(${1//,/ })
- local nodenum=$2
-
- local numloads=${#CLIENT_LOADS[@]}
- local testnum=$((nodenum % numloads))
+ local client=$1
+ local var=${client}_load
- do_node ${list[nodenum]} "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
+ do_node $client "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
BREAK_ON_ERROR=$BREAK_ON_ERROR \
END_RUN_FILE=$END_RUN_FILE \
LOAD_PID_FILE=$LOAD_PID_FILE \
TESTSUITELOG=$TESTSUITELOG \
- run_${CLIENT_LOADS[testnum]}.sh" &
+ run_${!var}.sh" &
CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!"
- log "Started client load: ${CLIENT_LOADS[testnum]} on ${list[nodenum]}"
+ log "Started client load: ${!var} on $client"
- eval export ${list[nodenum]}_load=${CLIENT_LOADS[testnum]}
return 0
}
start_client_loads () {
local clients=(${1//,/ })
+ local numloads=${#CLIENT_LOADS[@]}
+ local testnum
- for ((num=0; num < ${#clients[@]}; num++ )); do
- start_client_load $1 $num
+ for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
+ testnum=$((nodenum % numloads))
+ eval export ${clients[nodenum]}_load=${CLIENT_LOADS[testnum]}
+ start_client_load ${clients[nodenum]}
done
}
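+
+# Illustrative round-robin assignment by start_client_loads (hypothetical
+# values): with CLIENT_LOADS=(dd tar dbench) and clients="c1,c2,c3,c4", it
+# exports c1_load=dd, c2_load=tar, c3_load=dbench, c4_load=dd, and each
+# client then runs the matching run_<load>.sh in the background.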
for client in $clients; do
check_client_load $client
- rc=$?
+ rc=${PIPESTATUS[0]}
if [ "$rc" != 0 ]; then
log "Client load failed on node $client, rc=$rc"
return $rc
fi
done
}
+
+restart_client_loads () {
+ local clients=${1//,/ }
+ local expectedfail=${2:-""}
+ local client=
+ local rc=0
+
+ for client in $clients; do
+ check_client_load $client
+ rc=${PIPESTATUS[0]}
+        if [ "$rc" != 0 -a "$expectedfail" ]; then
+ start_client_load $client
+ echo "Restarted client load: on $client. Checking ..."
+ check_client_load $client
+ rc=${PIPESTATUS[0]}
+ if [ "$rc" != 0 ]; then
+ log "Client load failed to restart on node $client, rc=$rc"
+                # the failure of one client load means the test fails;
+                # no need to check the others
+ return $rc
+ fi
+        elif [ "$rc" != 0 ]; then
+            # an unexpected load failure fails the test immediately
+            return $rc
+        fi
+ done
+}
# End recovery-scale functions
# verify that lustre actually cleaned up properly
}
wait_for_host() {
- local HOST=$1
- check_network "$HOST" 900
- while ! do_node $HOST "ls -d $LUSTRE " > /dev/null; do sleep 5; done
+ local host=$1
+ check_network "$host" 900
+ while ! do_node $host "ls -d $LUSTRE " > /dev/null; do sleep 5; done
}
wait_for() {
local facet=$1
- local HOST=`facet_active_host $facet`
- wait_for_host $HOST
+ local host=`facet_active_host $facet`
+ wait_for_host $host
}
-wait_mds_recovery_done () {
- local timeout=`do_facet mds lctl get_param -n timeout`
-#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
-# as we are in process of changing obd_timeout in different ways
-# let's set MAX longer than that
- local MAX=$(( timeout * 4 ))
+wait_recovery_complete () {
+ local facet=$1
+
+ # Use default policy if $2 is not passed by caller.
+ #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
+ # as we are in process of changing obd_timeout in different ways
+ # let's set MAX longer than that
+ local MAX=${2:-$(( TIMEOUT * 4 ))}
+
+ local var_svc=${facet}_svc
+ local procfile="*.${!var_svc}.recovery_status"
local WAIT=0
+ local STATUS=
+
while [ $WAIT -lt $MAX ]; do
- STATUS=`do_facet mds "lctl get_param -n mds.*-MDT*.recovery_status | grep status"`
- echo $STATUS | grep COMPLETE && return 0
+ STATUS=$(do_facet $facet lctl get_param -n $procfile | grep status)
+ [[ $STATUS = "status: COMPLETE" ]] && return 0
sleep 5
WAIT=$((WAIT + 5))
- echo "Waiting $(($MAX - $WAIT)) secs for MDS recovery done"
+ echo "Waiting $((MAX - WAIT)) secs for $facet recovery done. $STATUS"
done
- echo "MDS recovery not done in $MAX sec"
+ echo "$facet recovery not done in $MAX sec. $STATUS"
return 1
}
RECOVERY_START_TIME=`date +%s`
echo "df pid is $DFPID"
change_active $facet
- TO=`facet_active_host $facet`
+ local TO=`facet_active_host $facet`
echo "Failover $facet to $TO"
wait_for $facet
mount_facet $facet || error "Restart of $facet failed"
echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g'
}
-# list is comma separated list
-exclude_item_from_list () {
+# list and excluded are comma-separated lists of items
+exclude_items_from_list () {
local list=$1
local excluded=$2
+ local item
list=${list//,/ }
- list=$(echo " $list " | sed -re "s/\s+$excluded\s+/ /g")
+ for item in ${excluded//,/ }; do
+ list=$(echo " $list " | sed -re "s/\s+$item\s+/ /g")
+ done
echo $(comma_list $list)
}
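+# Illustrative results of exclude_items_from_list (names are hypothetical):
+#   exclude_items_from_list "c1,c2,c3" "c2"    -> c1,c3
+#   exclude_items_from_list "c1,c2,c3" "c1,c3" -> c2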
(cd `dirname $1`; echo $PWD/`basename $1`)
}
+get_facets () {
+ local name=$(echo $1 | tr "[:upper:]" "[:lower:]")
+ local type=$(echo $1 | tr "[:lower:]" "[:upper:]")
+
+ local list=""
+
+ case $type in
+ MDS ) list=mds;;
+ OST ) for ((i=1; i<=$OSTCOUNT; i++)) do
+ list="$list ${name}$i"
+ done;;
+ * ) error "Invalid facet type"
+ exit 1;;
+ esac
+ echo $(comma_list $list)
+}
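+
+# Illustrative results of get_facets (assuming OSTCOUNT=2):
+#   get_facets MDS -> mds
+#   get_facets OST -> ost1,ost2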
+
##################################
# Adaptive Timeouts funcs