2 # vim: expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80
6 # ha.sh - test Lustre HA (aka failover) configurations
14 # ha.sh tests Lustre HA (aka failover) configurations with a CRM.
22 # Specify client nodes.
25 # Specify server nodes.
28 # Specify victim nodes to be rebooted.
31 # Choose a parent of the test directory. "/mnt/lustre" if not specified.
34 # Define a duration for the test. 86400 seconds if not specified.
37 # Only run the workloads; no failure will be introduced.
41 # A Lustre file system is up and mounted on all client nodes. This script
42 # does not mount or unmount any Lustre targets or clients, let alone format
45 # Each target has a failnode, so that workloads can continue after a power
48 # Targets are automatically failed back when their primary node is back. This
49 # assumption avoids calling CRM-specific commands to trigger failbacks, making
50 # this script more CRM-neutral.
52 # A crash dump mechanism is configured to catch LBUGs, panics, etc.
56 # Each client runs the same set of MPI and non-MPI workloads. These
57 # applications are run in short loops so that their exit status can be waited
58 # for and checked within reasonable time by ha_wait_loads.
60 # PROCESS STRUCTURE AND IPC
62 # On the node where this script is run, the processes look like this:
66 # ~ ha.sh (ha_repeat_mpi_load ior)
68 # ~ ha.sh (ha_repeat_mpi_load simul)
70 # ~ ... (one for each MPI load)
72 # ~ ha.sh (ha_repeat_nonmpi_load client2 dbench)
73 # ~ pdsh client2 dbench
74 # ~ ha.sh (ha_repeat_nonmpi_load client2 iozone)
75 # ~ pdsh client2 iozone
76 # ~ ha.sh (ha_repeat_nonmpi_load client5 iozone)
77 # ~ pdsh client5 iozone
78 # ~ ... (one for each non-MPI load on each client)
80 # Each tilde represents a process. Indentations imply parent-children
83 # IPC is done by files in the temporary directory.
# Logging helper body: prints the script name ($0), the current epoch time
# from date +%s, and then every argument passed in, on a single line.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is ha_info - confirm.
88 echo "$0: $(date +%s):" "$@"
# ERR trap reporting: logs the command that triggered the trap
# ($BASH_COMMAND) and then prints one call-trace entry per stack frame, taken
# from the FUNCNAME, BASH_SOURCE and BASH_LINENO arrays. All output goes
# through ha_error.
# NOTE(review): the enclosing function header and the end of the loop are not
# visible in this excerpt.
100 ha_error "Trap ERR triggered by:"
101 ha_error " $BASH_COMMAND"
102 ha_error "Call trace:"
103 for ((i = 0; i < ${#FUNCNAME[@]}; i++)); do
104 ha_error " ${FUNCNAME[$i]} [${BASH_SOURCE[$i]}:${BASH_LINENO[$i]}]"
# Global state and tunables for the run. The array bodies near the end of this
# section are only partly visible in this excerpt.
#   ha_tmp_dir, ha_test_dir    - named from the script basename and the PID
#                                ($$), under /tmp and /mnt/lustre respectively
#   ha_stop_file, ha_fail_file - flag files inside ha_tmp_dir
#   ha_status_file_prefix      - path prefix inside ha_tmp_dir for per-load
#                                status files, collected in ha_status_files
#   ha_power_down_cmd/_up_cmd  - default to pm -0 and pm -1 unless POWER_DOWN
#                                or POWER_UP is set in the environment
#   ha_start_time              - epoch seconds when this section is evaluated
#   ha_expected_duration       - 60 * 60 * 24 seconds
#   ha_load_timeout            - 60 * 10 seconds
#   ha_*_load_tags / _cmds     - a tag and a command are looked up with the
#                                same index; {} in a command is later replaced
#                                with a working directory
# NOTE(review): ha_tmp_dir is a predictable name under /tmp; mktemp -d would be
# safer - confirm whether that matters for this test harness.
111 declare ha_tmp_dir=/tmp/$(basename $0)-$$
112 declare ha_stop_file=$ha_tmp_dir/stop
113 declare ha_fail_file=$ha_tmp_dir/fail
114 declare ha_status_file_prefix=$ha_tmp_dir/status
115 declare -a ha_status_files
116 declare ha_machine_file=$ha_tmp_dir/machine_file
117 declare ha_power_down_cmd=${POWER_DOWN:-pm -0}
118 declare ha_power_up_cmd=${POWER_UP:-pm -1}
119 declare -a ha_clients
120 declare -a ha_servers
121 declare -a ha_victims
122 declare ha_test_dir=/mnt/lustre/$(basename $0)-$$
123 declare ha_start_time=$(date +%s)
124 declare ha_expected_duration=$((60 * 60 * 24))
125 declare ha_nr_loops=0
126 declare ha_stop_signals="SIGINT SIGTERM SIGHUP"
127 declare ha_load_timeout=$((60 * 10))
128 declare ha_workloads_only=false
129 declare -a ha_mpi_load_tags=(
133 declare -a ha_mpi_load_cmds=(
134 "/testsuite/tests/x86_64/rhel5/IOR/src/C/IOR -b 256m -o {}/f.ior -t 2m
136 "/testsuite/tests/x86_64/rhel5/simul/simul -d {}"
138 declare -a ha_nonmpi_load_tags=(
142 declare -a ha_nonmpi_load_cmds=(
143 "dd if=/dev/zero of={}/f.dd bs=1M count=256"
144 "tar cf - /etc/fonts | tar xf - -C {}"
# Usage message, emitted through ha_info. Per the text itself, -c, -s and -v
# take comma-separated host lists, while -d DIRECTORY and -u SECONDS are shown
# in brackets as optional.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is ha_usage - confirm.
149 ha_info "Usage: $0 -c HOST[,...] -s HOST[,...]" \
150 "-v HOST[,...] [-d DIRECTORY] [-u SECONDS]"
# ha_process_arguments: parse the command line with getopts (option string
# hc:s:v:d:u:w) and fill in the global settings.
# The visible assignments:
#   - split a comma-separated OPTARG into ha_clients, ha_servers or ha_victims
#   - set ha_test_dir to OPTARG/<script basename>-<PID>
#   - set ha_expected_duration to OPTARG
#   - set ha_workloads_only to true
# NOTE(review): the case labels are not visible in this excerpt; which option
# letter drives which assignment is presumed from their order - confirm.
153 ha_process_arguments()
157 while getopts hc:s:v:d:u:w opt; do
164 ha_clients=(${OPTARG//,/ })
167 ha_servers=(${OPTARG//,/ })
170 ha_victims=(${OPTARG//,/ })
173 ha_test_dir=$OPTARG/$(basename $0)-$$
176 ha_expected_duration=$OPTARG
179 ha_workloads_only=true
# The client, server and victim lists must all be non-empty.
188 if [ -z "${ha_clients[*]}" ] || \
189 [ -z "${ha_servers[*]}" ] || \
190 [ -z "${ha_victims[*]}" ]; then
191 ha_error "-c, -s, and -v are all mandatory"
# Run the given command ("$@") on the node(s) in $nodes through pdsh, with an
# explicit PATH assignment placed in front of the remote command.
# NOTE(review): $nodes is set outside the visible lines and is expanded
# unquoted here; presumably this is the body of ha_on - confirm.
202 pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@"
# Exit-time cleanup. When the fail flag file exists, report that the test
# directory and the temporary directory are being kept. The last visible line
# removes the test directory through the first client.
# NOTE(review): the else/fi lines are not visible in this excerpt; presumably
# the removal only happens when no failure was recorded, and this is the body
# of ha_trap_exit - confirm.
207 if [ -e "$ha_fail_file" ]; then
208 ha_info "Test directory $ha_test_dir not removed"
209 ha_info "Temporary directory $ha_tmp_dir not removed"
211 ha_on ${ha_clients[0]} rm -rf "$ha_test_dir"
# ha_trap_stop_signals: handler for the signals listed in ha_stop_signals.
# Logs that list (spaces replaced by commas) and creates the stop flag file,
# which the workload and wait loops test for with [ -e ... ].
216 ha_trap_stop_signals()
218 ha_info "${ha_stop_signals// /,} received"
219 touch "$ha_stop_file"
# Announce the sleep length in seconds before sleeping.
# NOTE(review): n is set outside the visible lines, and the enclosing function
# header is not visible; presumably this is ha_sleep - confirm.
226 ha_info "Sleeping for ${n}s"
228 # sleep(1) could be interrupted.
# Acquire a lock by creating the directory named in $lock. mkdir fails while
# the directory already exists, so the loop retries after a one-second
# ha_sleep until the creation succeeds. Output from mkdir and from ha_sleep is
# discarded.
# NOTE(review): $lock is set outside the visible lines; presumably this is the
# body of ha_lock - confirm.
237 until mkdir "$lock" >/dev/null 2>&1; do
238 ha_sleep 1 >/dev/null
# Dump the lctl debug log on a set of nodes.
#   $1 - node list separated by spaces; converted to commas for ha_on
# The remote command redirects the output of lctl dk to
# /tmp/<script basename>-<PID>-<epoch seconds>.dk; the redirection is inside
# the quoted string, so it is part of what ha_on is asked to run.
# lock is a path under ha_tmp_dir; how it is used is not visible here.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is ha_dump_logs - confirm.
251 local nodes=${1// /,}
252 local file=/tmp/$(basename $0)-$$-$(date +%s).dk
253 local lock=$ha_tmp_dir/lock-dump-logs
256 ha_info "Dumping lctl log to $file"
257 ha_on $nodes "lctl dk >$file"
# Run one MPI workload in a loop until the stop flag file appears or an
# iteration returns non-zero.
#   load - index into ha_mpi_load_tags and ha_mpi_load_cmds (set outside the
#          visible lines)
# Each iteration creates the working directory on the first client, runs
# mpirun across all clients using ha_machine_file, and removes the directory;
# output is appended to a log file in ha_tmp_dir named after the tag.
# NOTE(review): the function header, the condition guarding the failure
# handling, and the initial values of rc and nr_loops are not visible in this
# excerpt; presumably this is ha_repeat_mpi_load - confirm.
265 local tag=${ha_mpi_load_tags[$load]}
266 local cmd=${ha_mpi_load_cmds[$load]}
267 local dir=$ha_test_dir/$tag
268 local log=$ha_tmp_dir/$tag
271 local start_time=$(date +%s)
# Replace every {} placeholder in the command with the working directory.
273 cmd=${cmd//"{}"/$dir}
275 ha_info "Starting $tag"
277 while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
279 ha_on ${ha_clients[0]} mkdir -p "$dir" && \
280 mpirun -np ${#ha_clients[@]} -machinefile "$ha_machine_file" \
282 ha_on ${ha_clients[0]} rm -rf "$dir"
283 } >>"$log" 2>&1 || rc=$?
# Failure handling: dump logs on all clients and servers, then raise both the
# fail and the stop flags.
286 ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
287 touch "$ha_fail_file"
288 touch "$ha_stop_file"
292 nr_loops=$((nr_loops + 1))
# NOTE(review): integer division by nr_loops; whether a zero count is guarded
# against is not visible here - confirm.
295 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
297 ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
# Start every MPI workload in the background.
# First appends each client name to ha_machine_file, one per line. Then, for
# each index in ha_mpi_load_tags, launches ha_repeat_mpi_load with the load
# index and a status file path (ha_status_file_prefix-<tag>) and records that
# path in ha_status_files.
# NOTE(review): the enclosing function header and the loop terminators are
# not visible in this excerpt; presumably this is ha_start_mpi_loads - confirm.
307 for client in ${ha_clients[@]}; do
308 echo $client >>"$ha_machine_file"
311 for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
312 tag=${ha_mpi_load_tags[$load]}
313 status=$ha_status_file_prefix-$tag
314 ha_repeat_mpi_load $load $status &
315 ha_status_files+=("$status")
# ha_repeat_nonmpi_load: run one non-MPI workload on one client in a loop
# until the stop flag file appears or an iteration returns non-zero.
#   client, load - set outside the visible lines; load indexes
#                  ha_nonmpi_load_tags and ha_nonmpi_load_cmds
# Each iteration asks the client, through ha_on, to create the working
# directory and later remove it; output is appended to a log file in
# ha_tmp_dir named <client>-<tag>.
# NOTE(review): the middle of the remote command, the condition guarding the
# failure handling, and the initial values of rc and nr_loops are not visible
# in this excerpt.
319 ha_repeat_nonmpi_load()
324 local tag=${ha_nonmpi_load_tags[$load]}
325 local cmd=${ha_nonmpi_load_cmds[$load]}
326 local dir=$ha_test_dir/$client-$tag
327 local log=$ha_tmp_dir/$client-$tag
330 local start_time=$(date +%s)
# Replace every {} placeholder in the command with the working directory.
332 cmd=${cmd//"{}"/$dir}
334 ha_info "Starting $tag on $client"
336 while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
337 ha_on $client "mkdir -p $dir && \
339 rm -rf $dir" >>"$log" 2>&1 || rc=$?
# Failure handling: dump logs on this client and all servers, then raise both
# the fail and the stop flags.
342 ha_dump_logs "$client ${ha_servers[*]}"
343 touch "$ha_fail_file"
344 touch "$ha_stop_file"
348 nr_loops=$((nr_loops + 1))
# NOTE(review): integer division by nr_loops; whether a zero count is guarded
# against is not visible here - confirm.
351 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
353 ha_info "$tag on $client stopped: rc $rc avg loop time ${avg_loop_time}s"
# ha_start_nonmpi_loads: for every client and every index in
# ha_nonmpi_load_tags, launch ha_repeat_nonmpi_load in the background with the
# client, the load index and a status file path
# (ha_status_file_prefix-<tag>-<client>), and record that path in
# ha_status_files.
356 ha_start_nonmpi_loads()
363 for client in ${ha_clients[@]}; do
364 for ((load = 0; load < ${#ha_nonmpi_load_tags[@]}; load++)); do
365 tag=${ha_nonmpi_load_tags[$load]}
366 status=$ha_status_file_prefix-$tag-$client
367 ha_repeat_nonmpi_load $client $load $status &
368 ha_status_files+=("$status")
# Install ha_trap_stop_signals as the handler for the signals listed in
# ha_stop_signals, then start the non-MPI workloads.
# NOTE(review): the enclosing function header and any other start calls are
# not visible in this excerpt; presumably this is ha_start_loads - confirm.
375 trap ha_trap_stop_signals $ha_stop_signals
376 ha_start_nonmpi_loads
# Reset the handlers for the signals in ha_stop_signals to their defaults and
# log that the script is waiting for the workloads to stop.
# NOTE(review): the lines that raise the stop flag and wait are not visible in
# this excerpt; presumably this is ha_stop_loads - confirm.
383 trap - $ha_stop_signals
384 ha_info "Waiting for workloads to stop"
# Wait for workload status with a deadline of now + ha_load_timeout seconds.
# Removes all existing status files first, then, for each status file, polls
# once per second until the stop flag file exists or the deadline passes.
# NOTE(review): one condition of the until loop and the handling after it are
# not visible in this excerpt; presumably the missing condition tests for the
# status file, and this is ha_wait_loads - confirm.
391 local end=$(($(date +%s) + ha_load_timeout))
393 ha_info "Waiting for workload status"
394 rm -f "${ha_status_files[@]}"
395 for file in "${ha_status_files[@]}"; do
396 until [ -e "$ha_stop_file" ] ||
398 (($(date +%s) >= end)); do
399 ha_sleep 1 >/dev/null
# Log and run the power-down command (ha_power_down_cmd) with the node name
# appended. The command variable is expanded unquoted so that a value made of
# several words is split into command and options.
# NOTE(review): $node is set outside the visible lines; presumably this is the
# body of ha_power_down - confirm.
408 ha_info "Powering down $node"
409 $ha_power_down_cmd $node
# Log and run the power-up command (ha_power_up_cmd) with the node name
# appended. The command variable is expanded unquoted so that a value made of
# several words is split into command and options.
# NOTE(review): $node is set outside the visible lines; presumably this is the
# body of ha_power_up - confirm.
416 ha_info "Powering up $node"
417 $ha_power_up_cmd $node
423 # Print a random integer within [0, MAX).
# The result is printed without a trailing newline so callers can capture it
# with command substitution.
430 # See "5.2 Bash Variables" from "info bash".
# RANDOM is scaled by max and divided by 32768 using integer arithmetic.
# NOTE(review): max is set outside the visible lines (presumably from the
# first argument), and the function header is not visible; presumably this is
# ha_rand - confirm.
432 echo -n $((RANDOM * max / 32768))
# Pick a victim: draw an index from ha_rand bounded by the number of entries
# in ha_victims and print that entry without a trailing newline.
# NOTE(review): the enclosing function header is not visible in this excerpt;
# presumably this is ha_aim - confirm.
437 local i=$(ha_rand ${#ha_victims[@]})
439 echo -n ${ha_victims[$i]}
# Wait for a node to respond, with a deadline of now + 5 * 60 seconds.
# Polls once per second until running hostname on the node through pdsh
# succeeds, the stop flag file exists, or the deadline passes.
# NOTE(review): $node is set outside the visible lines and the loop terminator
# is not visible; presumably this is the body of ha_wait_node - confirm.
445 local end=$(($(date +%s) + 5 * 60))
447 ha_info "Waiting for $node to boot up"
448 until pdsh -w $node -S hostname >/dev/null 2>&1 ||
449 [ -e "$ha_stop_file" ] ||
450 (($(date +%s) >= end)); do
451 ha_sleep 1 >/dev/null
# Print a summary: a separator line, the elapsed time in seconds since
# ha_start_time, and the number of completed loops (ha_nr_loops).
# NOTE(review): other summary lines and the enclosing function header are not
# visible in this excerpt; presumably this is ha_summarize - confirm.
457 ha_info "---------------8<---------------"
459 ha_info " Duration: $(($(date +%s) - $ha_start_time))s"
460 ha_info " Loops: $ha_nr_loops"
# Failure-injection loop. Runs while less than ha_expected_duration seconds
# have elapsed since ha_start_time and the stop flag file does not exist.
# Visible steps of one iteration: log the node being failed, sleep a random
# 0-9 s, wait for workload status (returning if ha_wait_loads fails), check
# the stop flag, log that the node is being brought back, sleep a random
# 0-9 s, wait for workload status again, sleep a random 0-19 s, and count the
# loop in ha_nr_loops.
# NOTE(review): the lines that choose $node and power it down and up are not
# visible in this excerpt; presumably this is the body of ha_killer - confirm.
# NOTE(review): $ha_stop_file is unquoted in the if test below, unlike the
# other tests on that file.
467 while (($(date +%s) < ha_start_time + ha_expected_duration)) &&
468 [ ! -e "$ha_stop_file" ]; do
469 ha_info "---------------8<---------------"
473 ha_info "Failing $node"
474 ha_sleep $(ha_rand 10)
477 ha_wait_loads || return
479 if [ -e $ha_stop_file ]; then
484 ha_info "Bringing $node back"
485 ha_sleep $(ha_rand 10)
489 # Wait for the failback to start.
492 ha_wait_loads || return
494 ha_sleep $(ha_rand 20)
496 ha_nr_loops=$((ha_nr_loops + 1))
497 ha_info "Loop $ha_nr_loops done"
# Top-level driver. Parses the arguments, installs ha_trap_exit as the EXIT
# handler, and creates the test directory on the first client. In
# workloads-only mode it sleeps for 60 * 60 seconds; at the end it checks for
# the fail flag file.
# NOTE(review): the workload start/stop calls, the else branch and the action
# taken on failure are not visible in this excerpt; presumably this is the
# body of ha_main - confirm.
504 ha_process_arguments "$@"
506 trap ha_trap_exit EXIT
508 ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
513 if $ha_workloads_only; then
514 ha_sleep $((60 * 60))
522 if [ -e "$ha_fail_file" ]; then