Whamcloud - gitweb
LU-14773 tests: skip check_network() on working node 33/44033/4
author Andreas Dilger <adilger@whamcloud.com>
Fri, 18 Jun 2021 20:55:51 +0000 (14:55 -0600)
committer Oleg Drokin <green@whamcloud.com>
Wed, 18 Aug 2021 01:59:17 +0000 (01:59 +0000)
Don't call check_network() (which can take several seconds per node)
if the get_param command ran successfully on all of the nodes.  A
successful get_param implies that the connection to the remote nodes
works properly, and skipping check_network() completes more quickly.

For consistency with previous behavior, still call check_network() if
get_param didn't return output from every node, since the modules may
be unloaded.

Remove some extra visual clutter from every subtest.

Test-Parameters: trivial
Signed-off-by: Andreas Dilger <adilger@whamcloud.com>
Change-Id: I6a11cf8a1a6b43bebc3ff8f5506e1faac13ebbe5
Reviewed-on: https://review.whamcloud.com/44033
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: James Nunez <jnunez@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Elena Gryaznova <elena.gryaznova@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/tests/test-framework.sh

index 1dea3f5..1cbb4b5 100755 (executable)
@@ -6470,10 +6470,10 @@ check_mds() {
 }
 
 reset_fail_loc () {
-       echo -n "Resetting fail_loc on all nodes..."
-       do_nodes $(comma_list $(nodes_list)) "lctl set_param -n fail_loc=0 \
-           fail_val=0 2>/dev/null" || true
-       echo done.
+       #echo -n "Resetting fail_loc on all nodes..."
+       do_nodes $(comma_list $(nodes_list)) \
+               "lctl set_param -n fail_loc=0 fail_val=0 2>/dev/null" || true
+       #echo done.
 }
 
 
@@ -6482,7 +6482,8 @@ reset_fail_loc () {
 # Also appends a timestamp and prepends the testsuite name.
 #
 
-EQUALS="===================================================================================================="
+# ======================================================== 15:06:12 (1624050372)
+EQUALS="========================================================"
 banner() {
     msg="== ${TESTSUITE} $*"
     last=${msg: -1:1}
@@ -7264,15 +7265,18 @@ restore_lustre_params() {
 
 check_node_health() {
        local nodes=${1:-$(comma_list $(nodes_list))}
-
-       for node in ${nodes//,/ }; do
-               check_network "$node" 5
-               if [ $? -eq 0 ]; then
-                       do_node $node "$LCTL get_param catastrophe 2>&1" |
-                               grep -q "catastrophe=1" &&
-                               error "$node:LBUG/LASSERT detected" || true
-               fi
-       done
+       local health=$TMP/node_health.$$
+
+       do_nodes $nodes "$LCTL get_param catastrophe 2>&1" | tee $health |
+               grep "catastrophe=1" && error "LBUG/LASSERT detected"
+       # Only check/report network health if get_param isn't reported, since
+       # *clearly* the network is working if get_param returned something.
+       if (( $(grep -c catastro $health) != $(wc -w <<< ${nodes//,/ }) )); then
+               for node in ${nodes//,/}; do
+                       check_network $node 5
+               done
+       fi
+       rm -f $health
 }
 
 mdsrate_cleanup () {