Whamcloud - gitweb
LU-18274 tests: reboot the not up nodes only 15/56515/2
author Elena Gryaznova <elena.gryazanova@hpe.com>
Fri, 27 Sep 2024 10:49:12 +0000 (13:49 +0300)
committer Oleg Drokin <green@whamcloud.com>
Mon, 16 Dec 2024 08:14:09 +0000 (08:14 +0000)
It is reasonable to reboot only those nodes that are
still not up after the previous reboot attempt.

Test-Parameters: trivial
Signed-off-by: Elena Gryaznova <elena.gryaznova@hpe.com>
HPE-bug-id: LUS-12442
Reviewed-by: Vladimir Saveliev <vladimir.saveliev@hpe.com>
Reviewed-by: Alexander Boyko <alexander.boyko@hpe.com>
Change-Id: I3e1181f10bdf8b7e74ab532bd2e77d092dcf1c9b
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/56515
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/ha.sh

index 445ca14..1471a0c 100755 (executable)
@@ -1135,19 +1135,24 @@ ha_wait_nodes()
                        ha_info "$ha_stop_file found!"
 
                local -a nodes_up
+               local -a nodes_down
                nodes_up=($(ha_on $nodes hostname | awk '{ print $2 }'))
                ha_info "Nodes $nodes are up: ${nodes_up[@]}"
                local -a n=(${nodes//,/ })
                if [[ ${#nodes_up[@]} -ne ${#n[@]} ]]; then
-                       ha_info "Failed boot up $nodes in \
+                       nodes_down=($(echo ${n[@]} ${nodes_up[@]} |\
+                               tr ' ' '\n' | sort | uniq -u))
+                       ha_info "Failed boot up ${nodes_down[@]} in \
                                $ha_wait_nodes_up sec! attempt: $i"
                        if (( i == attempts )); then
                                ha_touch fail,stop
                                return 1
                        else
-                               ha_info "REBOOTING $ha_reboot $nodes, \
+                               local down=${nodes_down[@]}
+                               down=${down// /,/}
+                               ha_info "REBOOTING $ha_reboot $down \
                                        attempt: $i"
-                               local cmd="$ha_reboot $nodes"
+                               local cmd="$ha_reboot $down"
 
                                end=$(($(date +%s) + $ha_wait_nodes_up))
                                eval $cmd