2 # vim: expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80
6 # ha.sh - test Lustre HA (aka failover) configurations
14 # ha.sh tests Lustre HA (aka failover) configurations with a CRM.
22 # Specify client nodes.
25 # Specify server nodes.
28 # Specify victim nodes to be rebooted.
31 # Choose a parent of the test directory. "/mnt/lustre" if not specified.
34 # Define a duration for the test. 86400 seconds if not specified.
37 # Only run the workloads; no failure will be introduced.
41 # A Lustre file system is up and mounted on all client nodes. This script
42 # does not mount or unmount any Lustre targets or clients, let alone format
45 # Each target has a failnode, so that workloads can continue after a power
48 # Targets are automatically failed back when their primary node is back. This
49 # assumption avoids calling CRM-specific commands to trigger failbacks, making
50 # this script more CRM-neutral.
52 # A crash dump mechanism is configured to catch LBUGs, panics, etc.
56 # Each client runs the same set of MPI and non-MPI workloads. These
57 # applications are run in short loops so that their exit status can be waited
58 # for and checked within reasonable time by ha_wait_loads.
60 # PROCESS STRUCTURE AND IPC
62 # On the node where this script is run, the processes look like this:
66 # ~ ha.sh (ha_repeat_mpi_load ior)
68 # ~ ha.sh (ha_repeat_mpi_load simul)
70 # ~ ... (one for each MPI load)
72 # ~ ha.sh (ha_repeat_nonmpi_load client2 dbench)
73 # ~ pdsh client2 dbench
74 # ~ ha.sh (ha_repeat_nonmpi_load client2 iozone)
75 # ~ pdsh client2 iozone
76 # ~ ha.sh (ha_repeat_nonmpi_load client5 iozone)
77 # ~ pdsh client5 iozone
78 # ~ ... (one for each non-MPI load on each client)
80 # Each tilde represents a process. Indentations imply parent-children
83 # IPC is done by files in the temporary directory.
# Print a log line made of the script name ($0), the current Unix time
# (date +%s), and the caller's arguments.
88 echo "$0: $(date +%s):" "$@"
# Report the command that was running ($BASH_COMMAND) and the shell call
# stack through ha_error.
# NOTE(review): presumably installed as the ERR trap handler, judging by the
# message text -- confirm.
100 ha_error "Trap ERR triggered by:"
101 ha_error " $BASH_COMMAND"
102 ha_error "Call trace:"
# One line per stack frame, built from FUNCNAME, BASH_SOURCE and BASH_LINENO.
103 for ((i = 0; i < ${#FUNCNAME[@]}; i++)); do
104 ha_error " ${FUNCNAME[$i]} [${BASH_SOURCE[$i]}:${BASH_LINENO[$i]}]"
# Global state and defaults.
#
# Per-run temporary directory, named after the script and its PID. The stop,
# fail, status, and machine files declared below all live inside it.
111 declare ha_tmp_dir=/tmp/$(basename $0)-$$
112 declare ha_stop_file=$ha_tmp_dir/stop
113 declare ha_fail_file=$ha_tmp_dir/fail
114 declare ha_status_file_prefix=$ha_tmp_dir/status
115 declare -a ha_status_files
116 declare ha_machine_file=$ha_tmp_dir/machine_file
# Power control commands; the POWER_DOWN and POWER_UP environment variables
# override the "pm -0" / "pm -1" defaults.
117 declare ha_power_down_cmd=${POWER_DOWN:-pm -0}
118 declare ha_power_up_cmd=${POWER_UP:-pm -1}
119 declare -a ha_clients
120 declare -a ha_servers
121 declare -a ha_victims
# Default test directory under /mnt/lustre, named after the script and its PID.
122 declare ha_test_dir=/mnt/lustre/$(basename $0)-$$
123 declare ha_start_time=$(date +%s)
# Default expected duration: 24 hours, in seconds.
124 declare ha_expected_duration=$((60 * 60 * 24))
125 declare ha_nr_loops=0
# Signals for which the stop-signal trap is installed and later reset.
126 declare ha_stop_signals="SIGINT SIGTERM SIGHUP"
# Timeout in seconds (10 minutes) applied when waiting for workload status.
127 declare ha_load_timeout=$((60 * 10))
128 declare ha_workloads_only=false
# Workload tables. For both the MPI and non-MPI loads, the *_tags and *_cmds
# arrays are read with the same load index; every "{}" in a command is
# replaced with the load's working directory before the command is run.
129 declare -a ha_mpi_load_tags=(
133 declare -a ha_mpi_load_cmds=(
134 "/testsuite/tests/x86_64/rhel5/IOR/src/C/IOR -b 256m -o {}/f.ior -t 2m
136 "/testsuite/tests/x86_64/rhel5/simul/simul -d {}"
138 declare -a ha_nonmpi_load_tags=(
142 declare -a ha_nonmpi_load_cmds=(
143 "dd if=/dev/zero of={}/f.dd bs=1M count=256"
144 "tar cf - /etc/fonts | tar xf - -C {}"
# Print the command-line synopsis through ha_info.
149 ha_info "Usage: $0 -c HOST[,...] -s HOST[,...]" \
150 "-v HOST[,...] [-d DIRECTORY] [-u SECONDS]"
# Parse the command line with getopts and fill in the global settings.
# Per the option string, -c, -s, -v, -d and -u take a value; -h and -w are
# plain flags.
153 ha_process_arguments()
157 while getopts hc:s:v:d:u:w opt; do
# Host lists arrive comma-separated; replacing the commas with spaces lets
# the unquoted expansion split them into array elements.
164 ha_clients=(${OPTARG//,/ })
167 ha_servers=(${OPTARG//,/ })
170 ha_victims=(${OPTARG//,/ })
# Test directory path: the given parent plus the script name and PID.
173 ha_test_dir=$OPTARG/$(basename $0)-$$
176 ha_expected_duration=$OPTARG
179 ha_workloads_only=true
# The client, server, and victim lists must all be non-empty.
188 if [ -z "${ha_clients[*]}" ] || \
189 [ -z "${ha_servers[*]}" ] || \
190 [ -z "${ha_victims[*]}" ]; then
191 ha_error "-c, -s, and -v are all mandatory"
# Run the given command ("$@") on $nodes through pdsh with a fixed PATH.
# A non-zero exit status is captured in rc instead of being left unhandled.
203 pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@" || rc=$?
# Exit-time clean-up. When the fail file exists, the test and temporary
# directories are kept and the user is told they were not removed.
209 if [ -e "$ha_fail_file" ]; then
210 ha_info "Test directory $ha_test_dir not removed"
211 ha_info "Temporary directory $ha_tmp_dir not removed"
# Remove the test directory via the first client.
# NOTE(review): presumably the no-failure branch of the "if" above -- confirm.
213 ha_on ${ha_clients[0]} rm -rf "$ha_test_dir"
# Signal handler: log the configured stop-signal list (spaces turned into
# commas) and create the stop file that the loops in this script poll for.
218 ha_trap_stop_signals()
220 ha_info "${ha_stop_signals// /,} received"
221 touch "$ha_stop_file"
# Log the requested delay ($n, reported in seconds) before sleeping.
228 ha_info "Sleeping for ${n}s"
230 # sleep(1) could be interrupted.
# Acquire a lock by creating the directory "$lock": mkdir fails while the
# directory already exists, so retry after "ha_sleep 1" until it succeeds.
# Output from both mkdir and ha_sleep is discarded.
239 until mkdir "$lock" >/dev/null 2>&1; do
240 ha_sleep 1 >/dev/null
# Dump the lctl log ("lctl dk") on the given nodes.
#   $1 - space-separated node list
# The list is converted to the comma-separated form passed to ha_on.
253 local nodes=${1// /,}
# Dump file name: script name, PID, and the current Unix time.
254 local file=/tmp/$(basename $0)-$$-$(date +%s).dk
255 local lock=$ha_tmp_dir/lock-dump-logs
258 ha_info "Dumping lctl log to $file"
# Best effort: a failing dump is ignored (|| true).
259 ha_on $nodes "lctl dk >$file" || true
# Run one MPI workload repeatedly until a stop is requested or it fails.
# $load indexes both ha_mpi_load_tags and ha_mpi_load_cmds.
267 local tag=${ha_mpi_load_tags[$load]}
268 local cmd=${ha_mpi_load_cmds[$load]}
# Working directory (under the test directory) and log file (under the
# temporary directory), both named after the tag.
269 local dir=$ha_test_dir/$tag
270 local log=$ha_tmp_dir/$tag
273 local start_time=$(date +%s)
# Replace every "{}" placeholder in the command with the working directory.
275 cmd=${cmd//"{}"/$dir}
277 ha_info "Starting $tag"
# Keep looping while no stop file exists and rc is still zero.
# Each pass creates the directory on the first client, launches mpirun with
# as many processes as there are clients, and removes the directory again.
# All output is appended to the log; a failure status is captured in rc.
279 while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
281 ha_on ${ha_clients[0]} mkdir -p "$dir" && \
282 mpirun -np ${#ha_clients[@]} -machinefile "$ha_machine_file" \
284 ha_on ${ha_clients[0]} rm -rf "$dir"
285 } >>"$log" 2>&1 || rc=$?
# Failure handling: dump logs from all clients and servers, then create the
# fail and stop files.
# NOTE(review): presumably guarded by a check on rc -- confirm.
288 ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
289 touch "$ha_fail_file"
290 touch "$ha_stop_file"
294 nr_loops=$((nr_loops + 1))
# Average seconds per loop (integer division).
297 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
299 ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
# Append each client to the MPI machine file, one per line, then start every
# MPI load as a background job and record its status file.
309 for client in ${ha_clients[@]}; do
310 echo $client >>"$ha_machine_file"
313 for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
314 tag=${ha_mpi_load_tags[$load]}
# Status file path: the common prefix plus the load tag.
315 status=$ha_status_file_prefix-$tag
316 ha_repeat_mpi_load $load $status &
317 ha_status_files+=("$status")
# Run one non-MPI workload repeatedly on a single client until a stop is
# requested or it fails. $load indexes both ha_nonmpi_load_tags and
# ha_nonmpi_load_cmds.
321 ha_repeat_nonmpi_load()
326 local tag=${ha_nonmpi_load_tags[$load]}
327 local cmd=${ha_nonmpi_load_cmds[$load]}
# Working directory and log file are both named "<client>-<tag>".
328 local dir=$ha_test_dir/$client-$tag
329 local log=$ha_tmp_dir/$client-$tag
332 local start_time=$(date +%s)
# Replace every "{}" placeholder in the command with the working directory.
334 cmd=${cmd//"{}"/$dir}
336 ha_info "Starting $tag on $client"
# Keep looping while no stop file exists and rc is still zero. The command
# string handed to ha_on creates the directory first and removes it at the
# end; output is appended to the log and a failure status is captured in rc.
338 while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
339 ha_on $client "mkdir -p $dir && \
341 rm -rf $dir" >>"$log" 2>&1 || rc=$?
# Failure handling: dump logs from this client and all servers, then create
# the fail and stop files.
# NOTE(review): presumably guarded by a check on rc -- confirm.
344 ha_dump_logs "$client ${ha_servers[*]}"
345 touch "$ha_fail_file"
346 touch "$ha_stop_file"
350 nr_loops=$((nr_loops + 1))
# Average seconds per loop (integer division).
353 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
355 ha_info "$tag on $client stopped: rc $rc avg loop time ${avg_loop_time}s"
# Start every non-MPI load on every client as a background job and record
# the status file of each (client, load) pair.
358 ha_start_nonmpi_loads()
365 for client in ${ha_clients[@]}; do
366 for ((load = 0; load < ${#ha_nonmpi_load_tags[@]}; load++)); do
367 tag=${ha_nonmpi_load_tags[$load]}
# Status file path: the common prefix plus the load tag and client name.
368 status=$ha_status_file_prefix-$tag-$client
369 ha_repeat_nonmpi_load $client $load $status &
370 ha_status_files+=("$status")
# Install ha_trap_stop_signals for each signal in $ha_stop_signals, then
# start the non-MPI loads.
377 trap ha_trap_stop_signals $ha_stop_signals
378 ha_start_nonmpi_loads
# Restore the default disposition of the stop signals ("trap -") and
# announce that the script is waiting for the workloads to stop.
385 trap - $ha_stop_signals
386 ha_info "Waiting for workloads to stop"
# Wait for the workloads to report status, with a deadline that is
# ha_load_timeout seconds from now. If the deadline passes, the timeout is
# logged and the fail file is created.
393 local end=$(($(date +%s) + ha_load_timeout))
395 ha_info "Waiting for workload status"
# Delete the current status files, then wait on each one in turn, polling
# once per "ha_sleep 1".
# NOTE(review): presumably the load loops re-create these files to signal
# progress -- confirm.
396 rm -f "${ha_status_files[@]}"
397 for file in "${ha_status_files[@]}"; do
398 until [ -e "$ha_stop_file" ] ||
400 if (($(date +%s) >= end)); then
401 ha_info "Timed out while waiting for load status file $file"
402 touch "$ha_fail_file"
405 ha_sleep 1 >/dev/null
# Power down $node using the configured command (ha_power_down_cmd).
# The command variable is expanded unquoted so that a value such as "pm -0"
# splits into a command and its option.
414 ha_info "Powering down $node"
415 $ha_power_down_cmd $node
# Power up $node using the configured command (ha_power_up_cmd).
# The command variable is expanded unquoted so that a value such as "pm -1"
# splits into a command and its option.
422 ha_info "Powering up $node"
423 $ha_power_up_cmd $node
429 # Print a random integer within [0, MAX).
436 # See "5.2 Bash Variables" from "info bash".
# RANDOM yields an integer in 0..32767, so multiplying by max and dividing
# by 32768 scales it into [0, max). No trailing newline is printed.
438 echo -n $((RANDOM * max / 32768))
# Pick a random index into ha_victims and print that victim (no newline).
443 local i=$(ha_rand ${#ha_victims[@]})
445 echo -n ${ha_victims[$i]}
# Wait for $node to answer a remote "hostname" through pdsh.
# Polling (with "ha_sleep 1" between attempts) ends when the command
# succeeds, the stop file appears, or the 5-minute deadline passes.
451 local end=$(($(date +%s) + 5 * 60))
453 ha_info "Waiting for $node to boot up"
454 until pdsh -w $node -S hostname >/dev/null 2>&1 ||
455 [ -e "$ha_stop_file" ] ||
456 (($(date +%s) >= end)); do
457 ha_sleep 1 >/dev/null
# Print a summary: elapsed seconds since ha_start_time and the number of
# completed loops (ha_nr_loops).
463 ha_info "---------------8<---------------"
465 ha_info " Duration: $(($(date +%s) - $ha_start_time))s"
466 ha_info " Loops: $ha_nr_loops"
# Failure-injection loop: repeat while the expected duration has not elapsed
# and no stop file exists.
473 while (($(date +%s) < ha_start_time + ha_expected_duration)) &&
474 [ ! -e "$ha_stop_file" ]; do
475 ha_info "---------------8<---------------"
479 ha_info "Failing $node"
# Random delay of 0-9 seconds (ha_rand 10 prints an integer in [0, 10)).
480 ha_sleep $(ha_rand 10)
# Leave the loop if waiting for workload status fails.
483 ha_wait_loads || break
485 if [ -e $ha_stop_file ]; then
490 ha_info "Bringing $node back"
491 ha_sleep $(ha_rand 10)
495 # Wait for the failback to start.
498 ha_wait_loads || break
# Random delay of 0-19 seconds before counting this loop as done.
500 ha_sleep $(ha_rand 20)
502 ha_nr_loops=$((ha_nr_loops + 1))
503 ha_info "Loop $ha_nr_loops done"
# Entry sequence: parse the arguments, install ha_trap_exit as the EXIT
# handler, and create the test directory via the first client.
510 ha_process_arguments "$@"
512 trap ha_trap_exit EXIT
514 ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
# Continue only once the workloads have reported status.
517 if ha_wait_loads; then
# Workloads-only mode: sleep for one hour (60 * 60 seconds).
518 if $ha_workloads_only; then
519 ha_sleep $((60 * 60))
# Dump logs from all clients and servers.
524 ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
527 if [ -e "$ha_fail_file" ]; then