Whamcloud - gitweb
LU-8226 tests: Change check_catastrophe() to check_node_health() 39/20539/5
authorParinay Kondekar <parinay.kondekar@seagate.com>
Thu, 29 Sep 2016 07:20:28 +0000 (12:50 +0530)
committerOleg Drokin <oleg.drokin@intel.com>
Sat, 8 Oct 2016 16:39:13 +0000 (16:39 +0000)
Change check_catastrophe() to check_node_health() and take care of
following,
    - whether the remote node was running,
    - whether there was an LBUG/LASSERT

and print a proper error message in both case for particular
node.

Seagate-bug-id: MRP-3380
Change-Id: Ia42600ab6e6ee3925d3bda9ac9561149d60d8d72
Signed-off-by: Parinay Kondekar <parinay.kondekar@seagate.com>
Reviewed-on: http://review.whamcloud.com/20539
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Jian Yu <jian.yu@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/tests/test-framework.sh

index 53d1072..541f90f 100755 (executable)
@@ -1931,55 +1931,59 @@ start_client_loads () {
 
 # only for remote client
 check_client_load () {
-    local client=$1
-    local var=$(node_var_name $client)_load
-    local TESTLOAD=run_${!var}.sh
-
-    ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-
-    # bug 18914: try to connect several times not only when
-    # check ps, but  while check_catastrophe also
-    local tries=3
-    local RC=254
-    while [ $RC = 254 -a $tries -gt 0 ]; do
-        let tries=$tries-1
-        # assume success
-        RC=0
-        if ! check_catastrophe $client; then
-            RC=${PIPESTATUS[0]}
-            if [ $RC -eq 254 ]; then
-                # FIXME: not sure how long we shuold sleep here
-                sleep 10
-                continue
-            fi
-            echo "check catastrophe failed: RC=$RC "
-            return $RC
-        fi
-    done
-    # We can continue try to connect if RC=254
-    # Just print the warning about this
-    if [ $RC = 254 ]; then
-        echo "got a return status of $RC from do_node while checking catastrophe on $client"
-    fi
-
-    # see if the load is still on the client
-    tries=3
-    RC=254
-    while [ $RC = 254 -a $tries -gt 0 ]; do
-        let tries=$tries-1
-        # assume success
-        RC=0
-        if ! do_node $client "ps auxwww | grep -v grep | grep -q $TESTLOAD"; then
-            RC=${PIPESTATUS[0]}
-            sleep 30
-        fi
-    done
-    if [ $RC = 254 ]; then
-        echo "got a return status of $RC from do_node while checking (catastrophe and 'ps') the client load on $client"
-        # see if we can diagnose a bit why this is
-    fi
+       local client=$1
+       local var=$(node_var_name $client)_load
+       local testload=run_${!var}.sh
+
+       ps -C $testload | grep $client || return 1
+
+       # bug 18914: try to connect several times not only when
+       # check ps, but  while check_node_health also
+
+       local tries=3
+       local RC=254
+       while [ $RC = 254 -a $tries -gt 0 ]; do
+               let tries=$tries-1
+               # assume success
+               RC=0
+               if ! check_node_health $client; then
+                       RC=${PIPESTATUS[0]}
+                       if [ $RC -eq 254 ]; then
+                               # FIXME: not sure how long we shuold sleep here
+                               sleep 10
+                               continue
+                       fi
+                       echo "check node health failed: RC=$RC "
+                       return $RC
+               fi
+       done
+       # We can continue try to connect if RC=254
+       # Just print the warning about this
+       if [ $RC = 254 ]; then
+               echo "got a return status of $RC from do_node while checking " \
+               "node health on $client"
+       fi
+
+       # see if the load is still on the client
+       tries=3
+       RC=254
+       while [ $RC = 254 -a $tries -gt 0 ]; do
+               let tries=$tries-1
+               # assume success
+               RC=0
+               if ! do_node $client \
+                       "ps auxwww | grep -v grep | grep -q $testload"; then
+                       RC=${PIPESTATUS[0]}
+                       sleep 30
+               fi
+       done
+       if [ $RC = 254 ]; then
+               echo "got a return status of $RC from do_node while checking " \
+               "(node health and 'ps') the client load on $client"
+               # see if we can diagnose a bit why this is
+       fi
 
-    return $RC
+       return $RC
 }
 check_client_loads () {
    local clients=${1//,/ }
@@ -5112,7 +5116,7 @@ run_one() {
        cd $SAVE_PWD
        reset_fail_loc
        check_grant ${testnum} || error "check_grant $testnum failed with $?"
-       check_catastrophe || error "LBUG/LASSERT detected"
+       check_node_health
        check_dmesg_for_errors || error "Error in dmesg detected"
        if [ "$PARALLEL" != "yes" ]; then
                ps auxww | grep -v grep | grep -q multiop &&
@@ -5808,16 +5812,21 @@ restore_lustre_params() {
        done
 }
 
-check_catastrophe() {
+check_node_health() {
        local nodes=${1:-$(comma_list $(nodes_list))}
 
-       do_nodes $nodes "rc=0;
-val=\\\$($LCTL get_param -n catastrophe 2>&1);
-if [[ \\\$? -eq 0 && \\\$val -ne 0 ]]; then
-       echo \\\$(hostname -s): \\\$val;
-       rc=\\\$val;
-fi;
-exit \\\$rc"
+       for node in ${nodes//,/ }; do
+               check_network "$node" 5
+               if [ $? -eq 0 ]; then
+                       do_node $node "rc=0;
+                       val=\\\$($LCTL get_param -n catastrophe 2>&1);
+                       if [[ \\\$? -eq 0 && \\\$val -ne 0 ]]; then
+                               echo \\\$(hostname -s): \\\$val;
+                               rc=\\\$val;
+                       fi;
+                       exit \\\$rc" || error "$node:LBUG/LASSERT detected"
+               fi
+       done
 }
 
 mdsrate_cleanup () {