From: Li Wei
Date: Tue, 27 Apr 2010 02:28:08 +0000 (+0800)
Subject: b=21251 Add lustre/tests/ha.sh
X-Git-Tag: 1.10.0.41~14
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=544282f9a72ff8b15e9d73ff90fc53f5c5970b2d

b=21251 Add lustre/tests/ha.sh

This is a simple failover test script that works with configurations
that are controlled by a CRM and have multiple targets per server.

i=robert.read
i=grev
---

diff --git a/lustre/tests/ha.sh b/lustre/tests/ha.sh
new file mode 100755
index 0000000..d978419
--- /dev/null
+++ b/lustre/tests/ha.sh
@@ -0,0 +1,529 @@
+#!/bin/bash
+# vim: expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80
+#
+# NAME
+#
+#   ha.sh - test Lustre HA (aka failover) configurations
+#
+# SYNOPSIS
+#
+#   ha.sh [OPTIONS]
+#
+# DESCRIPTION
+#
+#   ha.sh tests Lustre HA (aka failover) configurations with a CRM.
+#
+# OPTIONS
+#
+#   -h
+#       Help.
+#
+#   -c HOST[,...]
+#       Specify client nodes.
+#
+#   -s HOST[,...]
+#       Specify server nodes.
+#
+#   -v HOST[,...]
+#       Specify victim nodes to be rebooted.
+#
+#   -d DIRECTORY
+#       Choose a parent of the test directory.  "/mnt/lustre" if not
+#       specified.
+#
+#   -u SECONDS
+#       Define a duration for the test.  86400 seconds if not specified.
+#
+#   -w
+#       Only run the workloads; no failure will be introduced.
+#
+# ASSUMPTIONS
+#
+#   A Lustre file system is up and mounted on all client nodes.  This script
+#   does not mount or unmount any Lustre targets or clients, let alone format
+#   anything.
+#
+#   Each target has a failnode, so that workloads can continue after a power
+#   failure.
+#
+#   Targets are automatically failed back when their primary node comes back.
+#   This assumption avoids calling CRM-specific commands to trigger failbacks,
+#   making this script more CRM-neutral.
+#
+#   A crash dump mechanism is configured to catch LBUGs, panics, etc.
+#
+# WORKLOADS
+#
+#   Each client runs the same set of MPI and non-MPI workloads.  These
+#   applications are run in short loops so that their exit status can be
+#   waited for and checked within a reasonable time by ha_wait_loads.
+#
+# PROCESS STRUCTURE AND IPC
+#
+#   On the node where this script is run, the processes look like this:
+#
+#       ~ ha.sh (ha_killer)
+#
+#           ~ ha.sh (ha_repeat_mpi_load ior)
+#               ~ mpirun IOR
+#           ~ ha.sh (ha_repeat_mpi_load simul)
+#               ~ mpirun simul
+#           ~ ... (one for each MPI load)
+#
+#           ~ ha.sh (ha_repeat_nonmpi_load client2 dbench)
+#               ~ pdsh client2 dbench
+#           ~ ha.sh (ha_repeat_nonmpi_load client2 iozone)
+#               ~ pdsh client2 iozone
+#           ~ ha.sh (ha_repeat_nonmpi_load client5 iozone)
+#               ~ pdsh client5 iozone
+#           ~ ... (one for each non-MPI load on each client)
+#
+#   Each tilde represents a process.  Indentation implies a parent-child
+#   relation.
+#
+#   IPC is done by files in the temporary directory.
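+#
+# EXAMPLE
+#
+#   An illustrative invocation (the host names below are placeholders, not
+#   defaults; substitute the nodes of an actual failover configuration):
+#
+#       ha.sh -c client1,client2 -s server1,server2 \
+#           -v server1,server2 -d /mnt/lustre -u 86400
+#
+#   This runs the workloads from client1 and client2 and, for 86400 seconds,
+#   repeatedly power-cycles a victim chosen at random from server1 and
+#   server2, waiting after each failover and failback for every workload to
+#   report its status again.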
+#
+
+ha_info()
+{
+    echo "$0: $(date +%s):" "$@"
+}
+
+ha_error()
+{
+    ha_info "$@" >&2
+}
+
+ha_trap_err()
+{
+    local i
+
+    ha_error "Trap ERR triggered by:"
+    ha_error "    $BASH_COMMAND"
+    ha_error "Call trace:"
+    for ((i = 0; i < ${#FUNCNAME[@]}; i++)); do
+        ha_error "    ${FUNCNAME[$i]} [${BASH_SOURCE[$i]}:${BASH_LINENO[$i]}]"
+    done
+}
+
+trap ha_trap_err ERR
+set -eE
+
+declare ha_tmp_dir=/tmp/$(basename $0)-$$
+declare ha_stop_file=$ha_tmp_dir/stop
+declare ha_fail_file=$ha_tmp_dir/fail
+declare ha_status_file_prefix=$ha_tmp_dir/status
+declare -a ha_status_files
+declare ha_machine_file=$ha_tmp_dir/machine_file
+declare ha_power_down_cmd=${POWER_DOWN:-pm -0}
+declare ha_power_up_cmd=${POWER_UP:-pm -1}
+declare -a ha_clients
+declare -a ha_servers
+declare -a ha_victims
+declare ha_test_dir=/mnt/lustre/$(basename $0)-$$
+declare ha_start_time=$(date +%s)
+declare ha_expected_duration=$((60 * 60 * 24))
+declare ha_nr_loops=0
+declare ha_stop_signals="SIGINT SIGTERM SIGHUP"
+declare ha_load_timeout=$((60 * 10))
+declare ha_workloads_only=false
+declare -a ha_mpi_load_tags=(
+    ior
+    simul
+)
+declare -a ha_mpi_load_cmds=(
+    "/testsuite/tests/x86_64/rhel5/IOR/src/C/IOR -b 256m -o {}/f.ior -t 2m
+        -w -W -T 1"
+    "/testsuite/tests/x86_64/rhel5/simul/simul -d {}"
+)
+declare -a ha_nonmpi_load_tags=(
+    dd
+    tar
+)
+declare -a ha_nonmpi_load_cmds=(
+    "dd if=/dev/zero of={}/f.dd bs=1M count=256"
+    "tar cf - /etc/fonts | tar xf - -C {}"
+)
+
+ha_usage()
+{
+    ha_info "Usage: $0 -c HOST[,...] -s HOST[,...]" \
+        "-v HOST[,...] [-d DIRECTORY] [-u SECONDS] [-w]"
+}
+
+ha_process_arguments()
+{
+    local opt
+
+    while getopts hc:s:v:d:u:w opt; do
+        case $opt in
+        h)
+            ha_usage
+            exit 0
+            ;;
+        c)
+            ha_clients=(${OPTARG//,/ })
+            ;;
+        s)
+            ha_servers=(${OPTARG//,/ })
+            ;;
+        v)
+            ha_victims=(${OPTARG//,/ })
+            ;;
+        d)
+            ha_test_dir=$OPTARG/$(basename $0)-$$
+            ;;
+        u)
+            ha_expected_duration=$OPTARG
+            ;;
+        w)
+            ha_workloads_only=true
+            ;;
+        \?)
+            ha_usage
+            exit 1
+            ;;
+        esac
+    done
+
+    if [ -z "${ha_clients[*]}" ] || \
+       [ -z "${ha_servers[*]}" ] || \
+       [ -z "${ha_victims[*]}" ]; then
+        ha_error "-c, -s, and -v are all mandatory"
+        ha_usage
+        exit 1
+    fi
+}
+
+ha_on()
+{
+    local nodes=$1
+
+    shift
+    pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@"
+}
+
+ha_trap_exit()
+{
+    if [ -e "$ha_fail_file" ]; then
+        ha_info "Test directory $ha_test_dir not removed"
+        ha_info "Temporary directory $ha_tmp_dir not removed"
+    else
+        ha_on ${ha_clients[0]} rm -rf "$ha_test_dir"
+        rm -rf "$ha_tmp_dir"
+    fi
+}
+
+ha_trap_stop_signals()
+{
+    ha_info "${ha_stop_signals// /,} received"
+    touch "$ha_stop_file"
+}
+
+ha_sleep()
+{
+    local n=$1
+
+    ha_info "Sleeping for ${n}s"
+    #
+    # sleep(1) could be interrupted.
+    #
+    sleep $n || true
+}
+
+ha_lock()
+{
+    local lock=$1
+
+    until mkdir "$lock" >/dev/null 2>&1; do
+        ha_sleep 1 >/dev/null
+    done
+}
+
+ha_unlock()
+{
+    local lock=$1
+
+    rm -r "$lock"
+}
+
+ha_dump_logs()
+{
+    local nodes=${1// /,}
+    local file=/tmp/$(basename $0)-$$-$(date +%s).dk
+    local lock=$ha_tmp_dir/lock-dump-logs
+
+    ha_lock "$lock"
+    ha_info "Dumping lctl log to $file"
+    ha_on $nodes "lctl dk >$file"
+    ha_unlock "$lock"
+}
+
+ha_repeat_mpi_load()
+{
+    local load=$1
+    local status=$2
+    local tag=${ha_mpi_load_tags[$load]}
+    local cmd=${ha_mpi_load_cmds[$load]}
+    local dir=$ha_test_dir/$tag
+    local log=$ha_tmp_dir/$tag
+    local rc=0
+    local nr_loops=0
+    local start_time=$(date +%s)
+
+    cmd=${cmd//"{}"/$dir}
+
+    ha_info "Starting $tag"
+
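+    #
+    # Loop until the stop file appears or an iteration fails.  Each
+    # iteration creates the shared test directory from one client, runs
+    # the MPI command across all clients listed in the machine file, and
+    # removes the directory again; the iteration's exit status is written
+    # to the per-load status file that ha_wait_loads polls.  On failure,
+    # lctl logs are dumped and the fail and stop files are created so
+    # that all other loads wind down as well.
+    #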
+    while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
+        {
+            ha_on ${ha_clients[0]} mkdir -p "$dir" && \
+            mpirun -np ${#ha_clients[@]} -machinefile "$ha_machine_file" \
+                $cmd && \
+            ha_on ${ha_clients[0]} rm -rf "$dir"
+        } >>"$log" 2>&1 || rc=$?
+
+        if ((rc != 0)); then
+            ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
+            touch "$ha_fail_file"
+            touch "$ha_stop_file"
+        fi
+        echo $rc >"$status"
+
+        nr_loops=$((nr_loops + 1))
+    done
+
+    avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
+
+    ha_info "$tag stopped: rc $rc avg loop time ${avg_loop_time}s"
+}
+
+ha_start_mpi_loads()
+{
+    local client
+    local load
+    local tag
+    local status
+
+    for client in ${ha_clients[@]}; do
+        echo $client >>"$ha_machine_file"
+    done
+
+    for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
+        tag=${ha_mpi_load_tags[$load]}
+        status=$ha_status_file_prefix-$tag
+        ha_repeat_mpi_load $load $status &
+        ha_status_files+=("$status")
+    done
+}
+
+ha_repeat_nonmpi_load()
+{
+    local client=$1
+    local load=$2
+    local status=$3
+    local tag=${ha_nonmpi_load_tags[$load]}
+    local cmd=${ha_nonmpi_load_cmds[$load]}
+    local dir=$ha_test_dir/$client-$tag
+    local log=$ha_tmp_dir/$client-$tag
+    local rc=0
+    local nr_loops=0
+    local start_time=$(date +%s)
+
+    cmd=${cmd//"{}"/$dir}
+
+    ha_info "Starting $tag on $client"
+
+    while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
+        ha_on $client "mkdir -p $dir && \
+                       $cmd && \
+                       rm -rf $dir" >>"$log" 2>&1 || rc=$?
+
+        if ((rc != 0)); then
+            ha_dump_logs "$client ${ha_servers[*]}"
+            touch "$ha_fail_file"
+            touch "$ha_stop_file"
+        fi
+        echo $rc >"$status"
+
+        nr_loops=$((nr_loops + 1))
+    done
+
+    avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
+
+    ha_info "$tag on $client stopped: rc $rc avg loop time ${avg_loop_time}s"
+}
+
+ha_start_nonmpi_loads()
+{
+    local client
+    local load
+    local tag
+    local status
+
+    for client in ${ha_clients[@]}; do
+        for ((load = 0; load < ${#ha_nonmpi_load_tags[@]}; load++)); do
+            tag=${ha_nonmpi_load_tags[$load]}
+            status=$ha_status_file_prefix-$tag-$client
+            ha_repeat_nonmpi_load $client $load $status &
+            ha_status_files+=("$status")
+        done
+    done
+}
+
+ha_start_loads()
+{
+    trap ha_trap_stop_signals $ha_stop_signals
+    ha_start_nonmpi_loads
+    ha_start_mpi_loads
+}
+
+ha_stop_loads()
+{
+    touch $ha_stop_file
+    trap - $ha_stop_signals
+    ha_info "Waiting for workloads to stop"
+    wait
+}
+
+ha_wait_loads()
+{
+    local file
+    local end=$(($(date +%s) + ha_load_timeout))
+
+    ha_info "Waiting for workload status"
+    rm -f "${ha_status_files[@]}"
+    for file in "${ha_status_files[@]}"; do
+        until [ -e "$ha_stop_file" ] ||
+              [ -e "$file" ] ||
+              (($(date +%s) >= end)); do
+            ha_sleep 1 >/dev/null
+        done
+    done
+}
+
+ha_power_down()
+{
+    local node=$1
+
+    ha_info "Powering down $node"
+    $ha_power_down_cmd $node
+}
+
+ha_power_up()
+{
+    local node=$1
+
+    ha_info "Powering up $node"
+    $ha_power_up_cmd $node
+}
+
+#
+# ha_rand MAX
+#
+# Print a random integer within [0, MAX).
+#
+ha_rand()
+{
+    local max=$1
+
+    #
+    # See "5.2 Bash Variables" from "info bash".
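+    # RANDOM is an integer in [0, 32767], so with integer arithmetic the
+    # expression below always falls within [0, MAX); for example, MAX=10
+    # can only produce the values 0 through 9.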
+    #
+    echo -n $((RANDOM * max / 32768))
+}
+
+ha_aim()
+{
+    local i=$(ha_rand ${#ha_victims[@]})
+
+    echo -n ${ha_victims[$i]}
+}
+
+ha_wait_node()
+{
+    local node=$1
+    local end=$(($(date +%s) + 5 * 60))
+
+    ha_info "Waiting for $node to boot up"
+    until pdsh -w $node -S hostname >/dev/null 2>&1 ||
+          [ -e "$ha_stop_file" ] ||
+          (($(date +%s) >= end)); do
+        ha_sleep 1 >/dev/null
+    done
+}
+
+ha_summarize()
+{
+    ha_info "---------------8<---------------"
+    ha_info "Summary:"
+    ha_info "    Duration: $(($(date +%s) - $ha_start_time))s"
+    ha_info "    Loops: $ha_nr_loops"
+}
+
+ha_killer()
+{
+    local node
+
+    while (($(date +%s) < ha_start_time + ha_expected_duration)) &&
+          [ ! -e "$ha_stop_file" ]; do
+        ha_info "---------------8<---------------"
+
+        node=$(ha_aim)
+
+        ha_info "Failing $node"
+        ha_sleep $(ha_rand 10)
+        ha_power_down $node
+        ha_sleep 10
+        ha_wait_loads || return
+
+        if [ -e $ha_stop_file ]; then
+            ha_power_up $node
+            break
+        fi
+
+        ha_info "Bringing $node back"
+        ha_sleep $(ha_rand 10)
+        ha_power_up $node
+        ha_wait_node $node
+        #
+        # Wait for the failback to start.
+        #
+        ha_sleep 60
+        ha_wait_loads || return
+
+        ha_sleep $(ha_rand 20)
+
+        ha_nr_loops=$((ha_nr_loops + 1))
+        ha_info "Loop $ha_nr_loops done"
+    done
+    ha_summarize
+}
+
+ha_main()
+{
+    ha_process_arguments "$@"
+
+    trap ha_trap_exit EXIT
+    mkdir "$ha_tmp_dir"
+    ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
+
+    ha_start_loads
+    ha_wait_loads
+
+    if $ha_workloads_only; then
+        ha_sleep $((60 * 60))
+    else
+        ha_killer
+        ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
+    fi
+
+    ha_stop_loads
+
+    if [ -e "$ha_fail_file" ]; then
+        exit 1
+    else
+        exit 0
+    fi
+}
+
+ha_main "$@"