2 # -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
3 # vim:shiftwidth=4:softtabstop=4:tabstop=4:
7 # ha.sh - test Lustre HA (aka failover) configurations
15 # ha.sh tests Lustre HA (aka failover) configurations with a CRM.
23 # Specify client nodes.
26 # Specify server nodes.
29 # Specify victim nodes to be rebooted.
32 # Choose a parent of the test directory. "/mnt/lustre" if not specified.
35 # Define a duration for the test. 86400 seconds if not specified.
38 # Only run the workloads; no failure will be introduced.
42 # A Lustre file system is up and mounted on all client nodes. This script
43 # does not mount or unmount any Lustre targets or clients, let alone format
46 # Each target has a failnode, so that workloads can continue after a power
49 # Targets are automatically failed back when their primary node is back. This
50 # assumption avoids calling CRM-specific commands to trigger failbacks, making
51 # this script more CRM-neutral.
53 # A crash dump mechanism is configured to catch LBUGs, panics, etc.
57 # Each client runs the same set of MPI and non-MPI workloads. These
58 # applications are run in short loops so that their exit status can be waited
59 # for and checked within reasonable time by ha_wait_loads.
61 # PROCESS STRUCTURE AND IPC
63 # On the node where this script is run, the processes look like this:
67 # ~ ha.sh (ha_repeat_mpi_load ior)
69 # ~ ha.sh (ha_repeat_mpi_load simul)
71 # ~ ... (one for each MPI load)
73 # ~ ha.sh (ha_repeat_nonmpi_load client2 dbench)
74 # ~ pdsh client2 dbench
75 # ~ ha.sh (ha_repeat_nonmpi_load client2 iozone)
76 # ~ pdsh client2 iozone
77 # ~ ha.sh (ha_repeat_nonmpi_load client5 iozone)
78 # ~ pdsh client5 iozone
79 # ~ ... (one for each non-MPI load on each client)
81 # Each tilde represents a process. Indentations imply parent-children
84 # IPC is done by files in the temporary directory.
89 echo "$0: $(date +%s):" "$@"
101 ha_error "Trap ERR triggered by:"
102 ha_error " $BASH_COMMAND"
103 ha_error "Call trace:"
104 for ((i = 0; i < ${#FUNCNAME[@]}; i++)); do
105 ha_error " ${FUNCNAME[$i]} [${BASH_SOURCE[$i]}:${BASH_LINENO[$i]}]"
112 declare ha_tmp_dir=/tmp/$(basename $0)-$$
113 declare ha_stop_file=$ha_tmp_dir/stop
114 declare ha_fail_file=$ha_tmp_dir/fail
115 declare ha_status_file_prefix=$ha_tmp_dir/status
116 declare -a ha_status_files
117 declare ha_machine_file=$ha_tmp_dir/machine_file
118 declare ha_power_down_cmd=${POWER_DOWN:-pm -0}
119 declare ha_power_up_cmd=${POWER_UP:-pm -1}
120 declare -a ha_clients
121 declare -a ha_servers
122 declare -a ha_victims
123 declare ha_test_dir=/mnt/lustre/$(basename $0)-$$
124 declare ha_start_time=$(date +%s)
125 declare ha_expected_duration=$((60 * 60 * 24))
126 declare ha_nr_loops=0
127 declare ha_stop_signals="SIGINT SIGTERM SIGHUP"
128 declare ha_load_timeout=$((60 * 10))
129 declare ha_workloads_only=false
130 declare -a ha_mpi_load_tags=(
134 declare -a ha_mpi_load_cmds=(
135 "/testsuite/tests/x86_64/rhel5/IOR/src/C/IOR -b 256m -o {}/f.ior -t 2m
137 "/testsuite/tests/x86_64/rhel5/simul/simul -d {}"
139 declare -a ha_nonmpi_load_tags=(
143 declare -a ha_nonmpi_load_cmds=(
144 "dd if=/dev/zero of={}/f.dd bs=1M count=256"
145 "tar cf - /etc/fonts | tar xf - -C {}"
150 ha_info "Usage: $0 -c HOST[,...] -s HOST[,...]" \
151 "-v HOST[,...] [-d DIRECTORY] [-u SECONDS]"
154 ha_process_arguments()
158 while getopts hc:s:v:d:u:w opt; do
165 ha_clients=(${OPTARG//,/ })
168 ha_servers=(${OPTARG//,/ })
171 ha_victims=(${OPTARG//,/ })
174 ha_test_dir=$OPTARG/$(basename $0)-$$
177 ha_expected_duration=$OPTARG
180 ha_workloads_only=true
189 if [ -z "${ha_clients[*]}" ] || \
190 [ -z "${ha_servers[*]}" ] || \
191 [ -z "${ha_victims[*]}" ]; then
192 ha_error "-c, -s, and -v are all mandatory"
204 pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@" || rc=$?
210 if [ -e "$ha_fail_file" ]; then
211 ha_info "Test directory $ha_test_dir not removed"
212 ha_info "Temporary directory $ha_tmp_dir not removed"
214 ha_on ${ha_clients[0]} rm -rf "$ha_test_dir"
219 ha_trap_stop_signals()
221 ha_info "${ha_stop_signals// /,} received"
222 touch "$ha_stop_file"
229 ha_info "Sleeping for ${n}s"
231 # sleep(1) could be interrupted.
240 until mkdir "$lock" >/dev/null 2>&1; do
241 ha_sleep 1 >/dev/null
254 local nodes=${1// /,}
255 local file=/tmp/$(basename $0)-$$-$(date +%s).dk
256 local lock=$ha_tmp_dir/lock-dump-logs
259 ha_info "Dumping lctl log to $file"
260 ha_on $nodes "lctl dk >$file" || true
268 local tag=${ha_mpi_load_tags[$load]}
269 local cmd=${ha_mpi_load_cmds[$load]}
270 local dir=$ha_test_dir/$tag
271 local log=$ha_tmp_dir/$tag
274 local start_time=$(date +%s)
276 cmd=${cmd//"{}"/$dir}
278 ha_info "Starting $tag"
280 while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
282 ha_on ${ha_clients[0]} mkdir -p "$dir" && \
283 mpirun ${MACHINEFILE_OPTION} "$ha_machine_file" \
284 -np ${#ha_clients[@]} $cmd && \
285 ha_on ${ha_clients[0]} rm -rf "$dir"
286 } >>"$log" 2>&1 || rc=$?
289 ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
290 touch "$ha_fail_file"
291 touch "$ha_stop_file"
295 nr_loops=$((nr_loops + 1))
298 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
300 ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
310 for client in ${ha_clients[@]}; do
311 echo $client >>"$ha_machine_file"
314 for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
315 tag=${ha_mpi_load_tags[$load]}
316 status=$ha_status_file_prefix-$tag
317 ha_repeat_mpi_load $load $status &
318 ha_status_files+=("$status")
322 ha_repeat_nonmpi_load()
327 local tag=${ha_nonmpi_load_tags[$load]}
328 local cmd=${ha_nonmpi_load_cmds[$load]}
329 local dir=$ha_test_dir/$client-$tag
330 local log=$ha_tmp_dir/$client-$tag
333 local start_time=$(date +%s)
335 cmd=${cmd//"{}"/$dir}
337 ha_info "Starting $tag on $client"
339 while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
340 ha_on $client "mkdir -p $dir && \
342 rm -rf $dir" >>"$log" 2>&1 || rc=$?
345 ha_dump_logs "$client ${ha_servers[*]}"
346 touch "$ha_fail_file"
347 touch "$ha_stop_file"
351 nr_loops=$((nr_loops + 1))
354 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
356 ha_info "$tag on $client stopped: rc $rc avg loop time ${avg_loop_time}s"
359 ha_start_nonmpi_loads()
366 for client in ${ha_clients[@]}; do
367 for ((load = 0; load < ${#ha_nonmpi_load_tags[@]}; load++)); do
368 tag=${ha_nonmpi_load_tags[$load]}
369 status=$ha_status_file_prefix-$tag-$client
370 ha_repeat_nonmpi_load $client $load $status &
371 ha_status_files+=("$status")
378 trap ha_trap_stop_signals $ha_stop_signals
379 ha_start_nonmpi_loads
386 trap - $ha_stop_signals
387 ha_info "Waiting for workloads to stop"
394 local end=$(($(date +%s) + ha_load_timeout))
396 ha_info "Waiting for workload status"
397 rm -f "${ha_status_files[@]}"
398 for file in "${ha_status_files[@]}"; do
399 until [ -e "$ha_stop_file" ] ||
401 if (($(date +%s) >= end)); then
402 ha_info "Timed out while waiting for load status file $file"
403 touch "$ha_fail_file"
406 ha_sleep 1 >/dev/null
415 ha_info "Powering down $node"
416 $ha_power_down_cmd $node
423 ha_info "Powering up $node"
424 $ha_power_up_cmd $node
430 # Print a random integer within [0, MAX).
437 # See "5.2 Bash Variables" from "info bash".
439 echo -n $((RANDOM * max / 32768))
444 local i=$(ha_rand ${#ha_victims[@]})
446 echo -n ${ha_victims[$i]}
452 local end=$(($(date +%s) + 5 * 60))
454 ha_info "Waiting for $node to boot up"
455 until pdsh -w $node -S hostname >/dev/null 2>&1 ||
456 [ -e "$ha_stop_file" ] ||
457 (($(date +%s) >= end)); do
458 ha_sleep 1 >/dev/null
464 ha_info "---------------8<---------------"
466 ha_info " Duration: $(($(date +%s) - $ha_start_time))s"
467 ha_info " Loops: $ha_nr_loops"
474 while (($(date +%s) < ha_start_time + ha_expected_duration)) &&
475 [ ! -e "$ha_stop_file" ]; do
476 ha_info "---------------8<---------------"
480 ha_info "Failing $node"
481 ha_sleep $(ha_rand 10)
484 ha_wait_loads || break
486 if [ -e $ha_stop_file ]; then
491 ha_info "Bringing $node back"
492 ha_sleep $(ha_rand 10)
496 # Wait for the failback to start.
499 ha_wait_loads || break
501 ha_sleep $(ha_rand 20)
503 ha_nr_loops=$((ha_nr_loops + 1))
504 ha_info "Loop $ha_nr_loops done"
511 ha_process_arguments "$@"
513 trap ha_trap_exit EXIT
515 ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
518 if ha_wait_loads; then
519 if $ha_workloads_only; then
520 ha_sleep $((60 * 60))
525 ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
528 if [ -e "$ha_fail_file" ]; then