#!/bin/bash
-# vim: expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80
+# -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
+# vim:shiftwidth=4:softtabstop=4:tabstop=4:
#
# NAME
#
# ha_on NODES COMMAND...
#
# Run COMMAND on NODES via "pdsh -w".  The first argument is taken as the
# node specification and shifted away; a fixed PATH=... assignment is placed
# in front of the remaining arguments, which are forwarded as "$@".
#
# This span is in patch form: the "-" line is the previous pdsh invocation,
# and the "+" lines replace it with a variant that stores a non-zero pdsh
# exit status in the local variable rc and returns it explicitly.  Either
# way the function's status is the status of the pdsh command.
#
# NOTE(review): $nodes is expanded unquoted, so it presumably has to be a
# single word (e.g. a comma-separated host list) -- confirm against callers.
# NOTE(review): the "|| rc=\$?" form presumably keeps a failing pdsh from
# triggering errexit or an ERR trap inside this function -- confirm whether
# the script enables either.
ha_on()
{
local nodes=$1
+ local rc=0
shift
- pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@"
+ pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@" || rc=$?
+ return $rc
}
ha_trap_exit()
ha_lock "$lock"
ha_info "Dumping lctl log to $file"
- ha_on $nodes "lctl dk >$file"
+ ha_on $nodes "lctl dk >$file" || true
ha_unlock "$lock"
}
ha_info "Starting $tag"
- while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
- {
- ha_on ${ha_clients[0]} mkdir -p "$dir" && \
- mpirun -np ${#ha_clients[@]} -machinefile "$ha_machine_file" \
- $cmd && \
- ha_on ${ha_clients[0]} rm -rf "$dir"
- } >>"$log" 2>&1 || rc=$?
+ while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
+ {
+ ha_on ${ha_clients[0]} mkdir -p "$dir" && \
+ mpirun ${MACHINEFILE_OPTION} "$ha_machine_file" \
+ -np ${#ha_clients[@]} $cmd && \
+ ha_on ${ha_clients[0]} rm -rf "$dir"
+ } >>"$log" 2>&1 || rc=$?
- if ((rc != 0)); then
- ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
- touch "$ha_fail_file"
- touch "$ha_stop_file"
- fi
- echo $rc >"$status"
+ if ((rc != 0)); then
+ ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
+ touch "$ha_fail_file"
+ touch "$ha_stop_file"
+ fi
+ echo $rc >"$status"
- nr_loops=$((nr_loops + 1))
- done
+ nr_loops=$((nr_loops + 1))
+ done
- avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
+ avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
- ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
+ ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
}
ha_start_mpi_loads()
rm -f "${ha_status_files[@]}"
for file in "${ha_status_files[@]}"; do
until [ -e "$ha_stop_file" ] ||
- [ -e "$file" ] ||
- (($(date +%s) >= end)); do
+ [ -e "$file" ]; do
+ if (($(date +%s) >= end)); then
+ ha_info "Timed out while waiting for load status file $file"
+ touch "$ha_fail_file"
+ return 1
+ fi
ha_sleep 1 >/dev/null
done
done
ha_sleep $(ha_rand 10)
ha_power_down $node
ha_sleep 10
- ha_wait_loads || return
+ ha_wait_loads || break
if [ -e $ha_stop_file ]; then
ha_power_up $node
- break;
+ break
fi
ha_info "Bringing $node back"
# Wait for the failback to start.
#
ha_sleep 60
- ha_wait_loads || return
+ ha_wait_loads || break
ha_sleep $(ha_rand 20)
ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
ha_start_loads
- ha_wait_loads
-
- if $ha_workloads_only; then
- ha_sleep $((60 * 60))
- else
- ha_killer
- ha_dump_logs
+ if ha_wait_loads; then
+ if $ha_workloads_only; then
+ ha_sleep $((60 * 60))
+ else
+ ha_killer
+ fi
fi
-
+ ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
ha_stop_loads
if [ -e "$ha_fail_file" ]; then