From: Elena Gryaznova Date: Wed, 22 Sep 2021 17:47:36 +0000 (+0300) Subject: LU-15028 tests: improve ha.sh to be more verbose X-Git-Tag: 2.14.56~172 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;ds=inline;h=a4a68cc6265d047b30f11085b9b2dfef4ddc785a;p=fs%2Flustre-release.git LU-15028 tests: improve ha.sh to be more verbose Patch adds some informing messages to make the failure reason detection simpler. Test-Parameters: trivial Signed-off-by: Elena Gryaznova HPE-bug-id: LUS-10286 Reviewed-by: Vladimir Saveliev Reviewed-by: Andriy Skulysh Change-Id: I3bef165f497d745c3e8ee3c8a91532096100bb99 Reviewed-on: https://review.whamcloud.com/45025 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/tests/ha.sh b/lustre/tests/ha.sh index 9fc76a4..a13cbce 100755 --- a/lustre/tests/ha.sh +++ b/lustre/tests/ha.sh @@ -144,6 +144,21 @@ ha_info() echo "$0: $(date +%H:%M:%S' '%s):" "$@" } +ha_touch() +{ + local date=$(date +%H:%M:%S' '%s) + + [[ $1 =~ stop ]] && + echo $date ${FUNCNAME[1]} $2 >> $ha_stop_file || + true + [[ $1 =~ fail ]] && + echo $date ${FUNCNAME[1]} $2 >> $ha_fail_file || + true + [[ $1 =~ lfsck ]] && + echo $date ${FUNCNAME[1]} $2 >> $ha_lfsck_stop || + true +} + ha_log() { local nodes=${1// /,} @@ -171,6 +186,7 @@ ha_trap_err() trap ha_trap_err ERR set -eE +declare ha_power_down_pids declare ha_tmp_dir=/tmp/$(basename $0)-$$ declare ha_stop_file=$ha_tmp_dir/stop declare ha_fail_file=$ha_tmp_dir/fail @@ -193,6 +209,7 @@ declare ha_power_down_cmd=${POWER_DOWN:-"pm -0"} declare ha_power_up_cmd=${POWER_UP:-"pm -1"} declare ha_power_delay=${POWER_DELAY:-60} declare ha_node_up_delay=${NODE_UP_DELAY:-10} +declare ha_wait_nodes_up=${WAIT_NODES_UP:-600} declare ha_pm_host=${PM_HOST:-$(hostname)} declare ha_failback_delay=${DELAY:-5} declare ha_failback_cmd=${FAILBACK:-""} @@ -369,7 +386,7 @@ ha_on() ha_trap_exit() { - touch "$ha_stop_file" + ha_touch stop trap 0 if [ -e "$ha_fail_file" ]; then ha_info "Test directories ${ha_testdirs[@]} not removed" @@ -384,8 +401,8 @@ ha_trap_exit() ha_trap_stop_signals() { - ha_info "${ha_stop_signals// /,} received" - touch "$ha_stop_file" + ha_info "${ha_stop_signals// /,} received" + ha_touch stop "${ha_stop_signals// /,} received" } ha_sleep() @@ -505,16 +522,14 @@ ha_repeat_mpi_load() # mustpass=0 means that failure is expected if (( rc !=0 )); then if (( mustpass != 0 )); then - touch "$ha_fail_file" - touch "$ha_stop_file" + ha_touch stop,fail $client,$tag ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}" else # Ok to fail rc=0 fi elif (( mustpass == 0 )); then - touch "$ha_fail_file" - touch "$ha_stop_file" + ha_touch stop,fail $client,$tag ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}" fi echo rc=$rc mustpass=$mustpass >"$status" @@ -632,8 +647,7 @@ ha_repeat_nonmpi_load() if ((rc != 0)); then ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}" - touch "$ha_fail_file" - touch "$ha_stop_file" + ha_touch stop,fail $client,$tag fi echo $rc >"$status" @@ -780,12 +794,10 @@ ha_start_lfsck() fi [ $rc -eq 0 ] || - { touch "$ha_fail_file"; touch "$ha_stop_file"; - touch $ha_lfsck_stop; return 1; } + { ha_touch stop,fail,lfsck; return 1; } ha_wait_lfsck_completed || - { touch "$ha_fail_file"; touch "$ha_stop_file"; - touch $ha_lfsck_stop; return 1; } + { ha_touch stop,fail,lfsck; return 1; } return 0 } @@ -798,7 +810,7 @@ ha_lfsck_repaired() awk '{sum += $1} END { print sum }') [ $n -eq 0] || { ha_info "Total repaired: $n"; - touch "$ha_fail_file"; return 1; } + ha_touch fail; return 1; } return 0 } @@ -813,7 +825,7 @@ ha_start_loads() ha_stop_loads() { - touch $ha_stop_file + ha_touch stop [[ -n $CMD_BG_PID ]] && wait $CMD_BG_PID || true # true because of lfsck_bg could be stopped already $ha_lfsck_bg && wait $LFSCK_BG_PID || true @@ -896,14 +908,29 @@ ha_power_down_cmd_fn() { local nodes=$1 local cmd + local pid + local rc=0 case $ha_power_down_cmd in # format is: POWER_DOWN=sysrqcrash - sysrqcrash) cmd="pdsh -S -w $nodes 'echo c > /proc/sysrq-trigger' &" ;; - *) cmd="$ha_power_down_cmd $nodes" ;; + sysrqcrash) + cmd="pdsh -S -w $nodes -u 120 \"echo c > /proc/sysrq-trigger\" &" + eval $cmd + pid=$! + ha_power_down_pids=$(echo $ha_power_down_pids $pid) + ha_info "ha_power_down_pids: $ha_power_down_pids" + [[ -z "$ha_power_down_pids" ]] || + ps aux | grep " ${ha_power_down_pids// / \| } " || + true + ;; + *) + cmd="$ha_power_down_cmd $nodes" + eval $cmd + rc=$? + ;; esac - eval $cmd + return $rc } ha_power_down() @@ -925,12 +952,17 @@ ha_power_down() fi ha_info "Powering down $nodes : cmd: $ha_power_down_cmd" + ha_power_down_pids="" for (( i=0; i<10; i++ )) { ha_info "attempt: $i" - ha_power_down_cmd_fn $nodes && - ha_powermanage $nodes $state && rc=0 && break - sleep $ha_power_delay + ha_power_down_cmd_fn $nodes || rc=1 + ha_sleep $ha_power_delay + ha_powermanage $nodes $state && rc=0 && break } + if [[ -n "$ha_power_down_pids" ]]; then + kill -9 $ha_power_down_pids || true + wait $ha_power_down_pids || true + fi [ $rc -eq 0 ] || { ha_info "Failed Powering down in $i attempts:" \ @@ -1052,14 +1084,29 @@ ha_aim() ha_wait_nodes() { local nodes=$1 - local end=$(($(date +%s) + 10 * 60)) + local end=$(($(date +%s) + $ha_wait_nodes_up)) - ha_info "Waiting for $nodes to boot up" + ha_info "Waiting for $nodes to boot up in $ha_wait_nodes_up" until ha_on $nodes hostname >/dev/null 2>&1 || [ -e "$ha_stop_file" ] || (($(date +%s) >= end)); do ha_sleep 1 >/dev/null done + + ha_info "Check where we are ..." + [ -e "$ha_stop_file" ] && + ha_info "$ha_stop_file found!" + + local -a nodes_up + nodes_up=($(ha_on $nodes hostname | awk '{ print $2 }')) + ha_info "Nodes $nodes are up: ${nodes_up[@]}" + local -a n=(${nodes//,/ }) + if [[ ${#nodes_up[@]} -ne ${#n[@]} ]]; then + ha_info "Failed boot up $nodes in $ha_wait_nodes_up sec!" + ha_touch fail,stop + return 1 + fi + return 0 } ha_failback()