#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
# All pairwise combinations of node failures.
# Was cmd3-17
# Script fails pair of nodes:
# -- in parallel by default
# -- in series if SERIAL is set
+set -e
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-SETUP=${SETUP:-""}
-CLEANUP=${CLEANUP:-""}
-. $LUSTRE/tests/test-framework.sh
+ONLY=${ONLY:-"$*"}
-init_test_env $@
+# bug number for skipped test:
+ALWAYS_EXCEPT="$RECOVERY_DOUBLE_SCALE_EXCEPT"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
-DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log
-
-exec 2>$DEBUGLOG
-echo "--- env ---" >&2
-env >&2
-echo "--- env ---" >&2
-set -x
+remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0
-[ "$SHARED_DIRECTORY" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; }
+[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] &&
+ skip_env "need three or more clients" && exit 0
-check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
-[ -n "$CLIENTS" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; }
-
-[ $CLIENTCOUNT -ge 3 ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; }
-
-END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
-LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
-remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
-
-check_timeout || exit 1
+if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then
+ skip_env "SHARED_DIRECTORY should be specified with a shared directory \
+which is accessible on all of the nodes"
+ exit 0
+fi
[[ $FAILURE_MODE = SOFT ]] && \
log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797"
-build_test_filter
-
-check_and_setup_lustre
-rm -rf $DIR/[df][0-9]*
-
-# the test node needs to be insulated from a lustre failure as much as possible,
-# so not even loading the lustre modules is ideal.
-# -- umount lustre
-# -- remove hostname from clients list
-zconf_umount $(hostname) $MOUNT
-NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
+# Set SERIAL to serialize the failure through a recovery of the first failure.
+SERIAL=${SERIAL:-""}
+ERRORS_OK="yes"
-check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+[ "$SERIAL" ] && ERRORS_OK=""
-MDTS=$(get_facets MDS)
-OSTS=$(get_facets OST)
+FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60 * 5))} # 5 minutes
-rm -f $END_RUN_FILE
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
reboot_recover_node () {
# item var contains a pair of clients if nodetype=clients
# I would prefer to have a list here
local item=$1
- local nodetype=$2
- local timeout=$($LCTL get_param -n timeout)
+ local nodetype=$2
+ local c
# MDS, OST item contains the facet
case $nodetype in
- MDS|OST ) facet_failover $item
- [ "$SERIAL" ] && wait_recovery_complete $item || true
- ;;
- clients) for c in ${item//,/ }; do
- # make sure the client loads die
- do_nodes $c "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
- shutdown_client $c
- boot_node $c
- echo "Reintegrating $c"
- # one client fails; need dk logs from this client only
- zconf_mount $c $MOUNT || NODES="$c $(mdts_nodes) $(osts_nodes)" error_exit "zconf_mount failed"
- done
- start_client_loads $item
- ;;
- # script failure:
- # don't use error (), the logs from all nodes not needed
- * ) echo "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'."
- exit 1;;
+ MDS|OST ) facet_failover $item
+ [ "$SERIAL" ] && wait_recovery_complete $item || true
+ ;;
+ clients) for c in ${item//,/ }; do
+ # make sure the client loads die
+ stop_process $c $LOAD_PID_FILE
+ shutdown_client $c
+ boot_node $c
+ echo "Reintegrating $c"
+ zconf_mount $c $MOUNT ||
+ error "mount $MOUNT on $c failed"
+ client_up $c || error "start client on $c failed"
+ done
+ start_client_loads $item
+ ;;
+ * ) echo "ERROR: invalid nodetype=$nodetype." \
+ "Must be one of 'MDS', 'OST', or 'clients'."
+ exit 1;;
esac
}
case $type in
MDS ) list=$MDTS;;
OST ) list=$OSTS;;
- clients) list=$NODES_TO_USE
- ;;
- # script failure:
- # don't use error (), the logs from all nodes not needed
- * ) echo "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'."
+ clients) list=$NODES_TO_USE;;
+ * ) echo "ERROR: invalid type=$type." \
+ "Must be one of 'MDS', 'OST', or 'clients'."
exit 1;;
esac
return
fi
- item=$(get_random_entry $list)
- if [ "$type" = clients ] ; then
+ local item=$(get_random_entry $list)
+ if [ "$type" = "clients" ]; then
item="$item $(get_random_entry $(exclude_items_from_list $list $item))"
item=$(comma_list $item)
fi
local client2=
log "
-==== START === $title "
+==== START === $title"
item1=$(get_item_type $type1)
[ "$item1" ] || \
{ echo "type1=$type1 item1 is empty" && return 0; }
item2=$(get_item_type $type2 $item1)
[ "$item2" ] || \
- { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" && return 0; }
+ { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" \
+ && return 0; }
# Check that our client loads are still running. If any have died,
# that means they have died outside of recovery, which is unacceptable.
log "==== Checking the clients loads BEFORE failover -- failure NOT OK"
-
# FIXME. need print summary on exit
- if ! check_client_loads $NODES_TO_USE; then
- exit 4
- fi
+ check_client_loads $NODES_TO_USE || exit $?
log "Done checking client loads. Failing type1=$type1 item1=$item1 ... "
+ reboot_recover_node $item1 $type1 || exit $?
- reboot_recover_node $item1 $type1
-
- # Hendrix test17 description:
+ # Hendrix test17 description:
# Introduce a failure, wait at
# least 5 minutes (for recovery),
# introduce a 2nd
# We have a "double failures" if SERIAL is not set,
# do not need a sleep between failures for "double failures"
- log " Failing type2=$type2 item2=$item2 ... "
- reboot_recover_node $item2 $type2
+ log " Failing type2=$type2 item2=$item2 ... "
+ reboot_recover_node $item2 $type2 || exit $?
# Client loads are allowed to die while in recovery, so we just
# restart them.
- log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK"
- restart_client_loads $NODES_TO_USE $ERRORS_OK || return $?
- log "Done checking / re-Starting client loads. PASS"
+ log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK"
+ restart_client_loads $NODES_TO_USE $ERRORS_OK || exit $?
+ log "Done checking / re-starting client loads. PASS"
return 0
}
summary_and_cleanup () {
local rc=$?
- local var
trap 0
+ CURRENT_TS=$(date +%s)
+ ELAPSED=$((CURRENT_TS - START_TS))
+
# Having not empty END_RUN_FILE means the failed loads only
if [ -s $END_RUN_FILE ]; then
- echo "Found the END_RUN_FILE file: $END_RUN_FILE"
- cat $END_RUN_FILE
- local END_RUN_NODE=
- read END_RUN_NODE < $END_RUN_FILE
-
- # A client load will stop if it found the END_RUN_FILE file.
- # That does not mean the client load actually failed though.
- # The first node in END_RUN_FILE is the one we are interested in.
- if [ -n "$END_RUN_NODE" ]; then
- var=$(node_var_name $END_RUN_NODE)_load
- echo "Client load failed on node $END_RUN_NODE"
- echo
- echo "Client $END_RUN_NODE load stdout and debug files:
- $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log
- $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log"
- fi
+ print_end_run_file $END_RUN_FILE
rc=1
fi
Exited after: $ELAPSED seconds
Status: $result: rc=$rc"
- # make sure the client loads die
- do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
-
- # and free up the pdshes that started them, if any are still around
- if [ -n "$CLIENT_LOAD_PIDS" ]; then
- kill $CLIENT_LOAD_PIDS || true
- sleep 5
- kill -9 $CLIENT_LOAD_PIDS || true
- fi
+ # stop the client loads
+ stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
if [ $rc -ne 0 ]; then
# we are interested in only on failed clients and servers
local failedclients=$(cat $END_RUN_FILE | grep -v $0)
# FIXME: need ostfailover-s nodes also for FLAVOR=OST
- local product=$(gather_logs $(comma_list $(osts_nodes) \
- $(mdts_nodes) $mdsfailover_HOST $failedclients) 1)
- echo $product
+ gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \
+ $mdsfailover_HOST $failedclients)
fi
- [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
exit $rc
}
-trap summary_and_cleanup EXIT TERM INT
+################################## Main Flow ###################################
+build_test_filter
-#
-# MAIN
-#
-log "-----============= $0 starting =============-----"
+check_and_setup_lustre
+rm -rf $DIR/[Rdfs][0-9]*
-START_TS=$(date +%s)
-CURRENT_TS=$START_TS
-ELAPSED=0
+check_timeout || exit 1
-# Set SERIAL to serialize the failure through a recovery of the first failure.
-SERIAL=${SERIAL:-""}
-ERRORS_OK="yes"
+# The test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $HOSTNAME $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
-[ "$SERIAL" ] && ERRORS_OK=""
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
-# Start client loads.
-start_client_loads $NODES_TO_USE
+ELAPSED=0
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
-echo clients load pids:
-if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then
- exit 3
-fi
+# Every pairwise combination of client failures (2 clients),
+# MDS failure, and OST failure will be tested.
+test_pairwise_fail() {
+ trap summary_and_cleanup EXIT TERM INT
-# FIXME: Do we want to have an initial sleep period where the clients
-# just run before introducing a failure?
-sleep $FAILOVER_PERIOD
+ # Start client loads.
+ rm -f $END_RUN_FILE
+ start_client_loads $NODES_TO_USE
-#CMD_TEST_NUM=17.1
-failover_pair MDS OST "test 1: failover MDS, then OST =========="
-sleep $FAILOVER_PERIOD
+ echo clients load pids:
+ do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
-#CMD_TEST_NUM=17.2
-failover_pair MDS clients "test 2: failover MDS, then 2 clients ===="
-sleep $FAILOVER_PERIOD
+ # FIXME: Do we want to have an initial sleep period where the clients
+ # just run before introducing a failure?
+ sleep $FAILOVER_PERIOD
-#CMD_TEST_NUM=17.3
-if [ $MDSCOUNT -gt 1 ]; then
- failover_pair MDS MDS "test 3: failover MDS, then another MDS =="
+ # CMD_TEST_NUM=17.1
+ failover_pair MDS OST "test 1: failover MDS, then OST =========="
sleep $FAILOVER_PERIOD
-else
- skip "$0 : $MDSCOUNT < 2 MDTs, test 3 skipped"
-fi
-#CMD_TEST_NUM=17.4
-if [ $OSTCOUNT -gt 1 ]; then
- failover_pair OST OST "test 4: failover OST, then another OST =="
+ # CMD_TEST_NUM=17.2
+ failover_pair MDS clients "test 2: failover MDS, then 2 clients ===="
sleep $FAILOVER_PERIOD
-else
- skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped"
-fi
-#CMD_TEST_NUM=17.5
-failover_pair OST clients "test 5: failover OST, then 2 clients ===="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.3
+ if [ $MDSCOUNT -gt 1 ]; then
+ failover_pair MDS MDS "test 3: failover MDS, then another MDS =="
+ sleep $FAILOVER_PERIOD
+ else
+ skip_env "has less than 2 MDTs, test 3 skipped"
+ fi
-#CMD_TEST_NUM=17.6
-failover_pair OST MDS "test 6: failover OST, then MDS =========="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.4
+ if [ $OSTCOUNT -gt 1 ]; then
+ failover_pair OST OST "test 4: failover OST, then another OST =="
+ sleep $FAILOVER_PERIOD
+ else
+ skip_env "has less than 2 OSTs, test 4 skipped"
+ fi
-#CMD_TEST_NUM=17.7
-failover_pair clients MDS "test 7: failover 2 clients, then MDS ===="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.5
+ failover_pair OST clients "test 5: failover OST, then 2 clients ===="
+ sleep $FAILOVER_PERIOD
-#CMD_TEST_NUM=17.8
-#failover_pair clients OST "test 8: failover 2 clients, then OST ===="
-sleep $FAILOVER_PERIOD
+ # CMD_TEST_NUM=17.6
+ failover_pair OST MDS "test 6: failover OST, then MDS =========="
+ sleep $FAILOVER_PERIOD
-#CMD_TEST_NUM=17.9
-if [ $CLIENTCOUNT -ge 5 ]; then
- failover_pair clients clients "test 9: failover 2 clients, then 2 different clients =="
+ # CMD_TEST_NUM=17.7
+ failover_pair clients MDS "test 7: failover 2 clients, then MDS ===="
sleep $FAILOVER_PERIOD
-fi
-log "==== Checking the clients loads AFTER all failovers -- failure NOT OK"
-if ! check_client_loads $NODES_TO_USE; then
- log "Client load failed after failover. Exiting"
- exit 5
-fi
-CURRENT_TS=$(date +%s)
-ELAPSED=$((CURRENT_TS - START_TS))
+ # CMD_TEST_NUM=17.8
+ failover_pair clients OST "test 8: failover 2 clients, then OST ===="
+ sleep $FAILOVER_PERIOD
+
+ # CMD_TEST_NUM=17.9
+ if [ $CLIENTCOUNT -gt 4 ]; then
+ failover_pair clients clients \
+ "test 9: failover 2 clients, then 2 different clients =="
+ sleep $FAILOVER_PERIOD
+ else
+ skip_env "has less than 5 Clients, test 9 skipped"
+ fi
+
+ log "==== Checking the clients loads AFTER all failovers -- failure NOT OK"
+ if ! check_client_loads $NODES_TO_USE; then
+ log "Client load failed after failover. Exiting..."
+ exit 5
+ fi
+
+ exit 0
+}
+run_test pairwise_fail "pairwise combination of clients, MDS, and OST failures"
-log "Completed successfully in $ELAPSED seconds"
+zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"
-exit 0
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
# Was Test 11 in cmd3.
# For duration of 24 hours repeatedly failover a random MDS at
# 10 minute intervals and verify that no application errors occur.
# Test runs one of CLIENT_LOAD progs on remote clients.
+set -e
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-SETUP=${SETUP:-""}
-CLEANUP=${CLEANUP:-""}
-. $LUSTRE/tests/test-framework.sh
+ONLY=${ONLY:-"$*"}
-init_test_env $@
+# bug number for skipped test:
+ALWAYS_EXCEPT="$RECOVERY_MDS_SCALE_EXCEPT"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
-DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log
-
-exec 2>$DEBUGLOG
-echo "--- env ---" >&2
-env >&2
-echo "--- env ---" >&2
-set -x
+remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0
-[ "$SHARED_DIRECTORY" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; }
+[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] &&
+ skip_env "need three or more clients" && exit 0
-check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
-[ -n "$CLIENTS" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; }
-
-[ $CLIENTCOUNT -ge 3 ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; }
-
-END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
-LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
-VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
-remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
-
-build_test_filter
-
-check_and_setup_lustre
-rm -rf $DIR/[df][0-9]*
-
-max_recov_time=$(max_recovery_time)
-
-# the test node needs to be insulated from a lustre failure as much as possible,
-# so not even loading the lustre modules is ideal.
-# -- umount lustre
-# -- remove hostname from clients list
-zconf_umount $(hostname) $MOUNT
-NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
-
-check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-
-MDTS=$(get_facets MDS)
-OSTS=$(get_facets OST)
+if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then
+ skip_env "SHARED_DIRECTORY should be specified with a shared directory \
+which is accessible on all of the nodes"
+ exit 0
+fi
ERRORS_OK="" # No application failures should occur during this test.
-FLAVOR=${FLAVOR:-"MDS"}
-if [ "$FLAVOR" == "MDS" ]; then
- SERVERS=$MDTS
-else
- SERVERS=$OSTS
-fi
-
if [ "$SLOW" = "no" ]; then
DURATION=${DURATION:-$((60 * 30))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))}
else
DURATION=${DURATION:-$((60 * 60 * 24))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
fi
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
-rm -f $END_RUN_FILE
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
+REQFAIL=${REQFAIL:-$((DURATION / SERVER_FAILOVER_PERIOD *
+ REQFAIL_PERCENT / 100))}
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
server_numfailovers () {
local facet=$1
summary_and_cleanup () {
local rc=$?
- local var
trap 0
# Having not empty END_RUN_FILE means the failed loads only
if [ -s $END_RUN_FILE ]; then
- echo "Found the END_RUN_FILE file: $END_RUN_FILE"
- cat $END_RUN_FILE
- local END_RUN_NODE=
- read END_RUN_NODE < $END_RUN_FILE
-
- # A client load will stop if it found the END_RUN_FILE file.
- # That does not mean the client load actually failed though.
- # The first node in END_RUN_FILE is the one we are interested in.
- if [ -n "$END_RUN_NODE" ]; then
- var=$(node_var_name $END_RUN_NODE)_load
- echo "Client load failed on node $END_RUN_NODE"
- echo
- echo "Client $END_RUN_NODE load stdout and debug files:
- $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log
- $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log"
- fi
+ print_end_run_file $END_RUN_FILE
rc=1
fi
local result=PASS
[ $rc -eq 0 ] || result=FAIL
- log "Duration: $DURATION
+ log "Duration: $DURATION
Server failover period: $SERVER_FAILOVER_PERIOD seconds
Exited after: $ELAPSED seconds
Number of failovers before exit:
$(servers_numfailovers)
Status: $result: rc=$rc"
- # stop the vmstats on the OSTs
- if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) "test -f $VMSTAT_PID_FILE &&
- { kill -s TERM \\\$(cat $VMSTAT_PID_FILE);
- rm -f $VMSTAT_PID_FILE || true; }"
- fi
-
- # make sure the client loads die
- do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
+ # stop vmstat on OSS nodes
+ [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
- # and free up the pdshes that started them, if any are still around
- if [ -n "$CLIENT_LOAD_PIDS" ]; then
- kill $CLIENT_LOAD_PIDS || true
- sleep 5
- kill -9 $CLIENT_LOAD_PIDS || true
- fi
+ # stop the client loads
+ stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
if [ $rc -ne 0 ]; then
# we are interested in only on failed clients and servers
local failedclients=$(cat $END_RUN_FILE | grep -v $0)
# FIXME: need ostfailover-s nodes also for FLAVOR=OST
- local product=$(gather_logs $(comma_list $(osts_nodes) \
- $(mdts_nodes) $mdsfailover_HOST $failedclients) 1)
- echo $product
+ gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \
+ $mdsfailover_HOST $failedclients)
fi
- [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
-
exit $rc
}
-#
-# MAIN
-#
-log "-----============= $0 starting =============-----"
+failover_target() {
+ local flavor=${1:-"MDS"}
+ local servers
+ local serverfacet
+ local var
-trap summary_and_cleanup EXIT INT
+ [ "$flavor" = "MDS" ] && servers=$MDTS || servers=$OSTS
-ELAPSED=0
+ trap summary_and_cleanup EXIT INT
-# vmstat the osts
-if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) \
- "vmstat 1 > $TESTLOG_PREFIX.vmstat.\\\$(hostname -s).log \
- 2>/dev/null </dev/null & echo \\\$! > $VMSTAT_PID_FILE"
-fi
+ # start vmstat on OSS nodes
+ [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
-# Start client loads.
-start_client_loads $NODES_TO_USE
+ # start client loads
+ rm -f $END_RUN_FILE
+ start_client_loads $NODES_TO_USE
-echo clients load pids:
-if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then
- exit 3
-fi
+ echo client loads pids:
+ do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
-MINSLEEP=${MINSLEEP:-120}
-REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
-REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
-reqfail=0
-sleep=0
+ ELAPSED=0
+ local sleep=0
+ local reqfail=0
+ local it_time_start
+ local start_ts=$(date +%s)
+ local current_ts=$start_ts
+
+ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # In order to perform the
+ # expected number of failovers, we need to account for the following:
+ # 1) the time that has elapsed during the client load checking
+ # 2) the time it takes for failover
+ it_time_start=$(date +%s)
-START_TS=$(date +%s)
-CURRENT_TS=$START_TS
+ serverfacet=$(get_random_entry $servers)
+ var=${serverfacet}_numfailovers
-while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # Check that our client loads are still running. If any have died,
+ # that means they have died outside of recovery, which is unacceptable.
+ log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
+ ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ check_client_loads $NODES_TO_USE || exit 4
- # In order to perform the
- # expected number of failovers, we need to account the following :
- # 1) the time that has elapsed during the client load checking
- # 2) time takes for failover
+ log "Wait $serverfacet recovery complete before doing next failover..."
+ if ! wait_recovery_complete $serverfacet; then
+ echo "$serverfacet recovery is not completed!"
+ exit 7
+ fi
- it_time_start=$(date +%s)
+ log "Checking clients are in FULL state before doing next failover..."
+ if ! wait_clients_import_state $NODES_TO_USE $serverfacet FULL; then
+ echo "Clients import not FULL, please consider increasing \
+SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD!"
+ fi
- SERVERFACET=$(get_random_entry $SERVERS)
- var=${SERVERFACET}_numfailovers
+ log "Starting failover on $serverfacet"
+ facet_failover "$serverfacet" || exit 1
- # Check that our client loads are still running. If any have died,
- # that means they have died outside of recovery, which is unacceptable.
+ # Check that our client loads are still running during failover.
+ # No application failures should occur.
+ log "==== Checking the clients loads AFTER failover -- failure NOT OK"
+ if ! check_client_loads $NODES_TO_USE; then
+ log "Client load failed during failover. Exiting..."
+ exit 5
+ fi
- log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
- ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ # Increment the number of failovers.
+ val=$((${!var} + 1))
+ eval $var=$val
- if ! check_client_loads $NODES_TO_USE; then
- exit 4
- fi
+ current_ts=$(date +%s)
+ ELAPSED=$((current_ts - start_ts))
- log "Wait $SERVERFACET recovery complete before doing next failover ...."
+ sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
- if ! wait_recovery_complete $SERVERFACET ; then
- echo "$SERVERFACET recovery is not completed!"
- exit 7
- fi
+ # Keep counting the number of iterations in which the time
+ # spent on failover and the two client load checks exceeded
+ # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
+ if [ $sleep -lt $MINSLEEP ]; then
+ reqfail=$((reqfail + 1))
+ log "WARNING: failover and two check_client_loads time exceeded \
+SERVER_FAILOVER_PERIOD - MINSLEEP!
+Failed to load the filesystem with I/O for a minimum period of \
+$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
+This iteration, the load was only applied for sleep=$sleep seconds.
+Estimated max recovery time: $MAX_RECOV_TIME
+Probably the hardware is taking an excessively long time to boot.
+Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \
+bug 20918"
+ [ $reqfail -gt $REQFAIL ] && exit 6
+ fi
- log "Checking clients are in FULL state before doing next failover"
- if ! wait_clients_import_state $NODES_TO_USE $SERVERFACET FULL; then
- echo "Clients import not FULL, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD !"
+ log "$serverfacet has failed over ${!var} times, and counting..."
- fi
- log "Starting failover on $SERVERFACET"
+ [ $((ELAPSED + sleep)) -ge $DURATION ] && break
- facet_failover "$SERVERFACET" || exit 1
+ if [ $sleep -gt 0 ]; then
+ echo "sleeping $sleep seconds... "
+ sleep $sleep
+ fi
+ done
+ exit 0
+}
- # Check that our client loads are still running during failover.
- # No application failures should occur.
+################################## Main Flow ###################################
+build_test_filter
- log "==== Checking the clients loads AFTER failover -- failure NOT OK"
- if ! check_client_loads $NODES_TO_USE; then
- log "Client load failed during failover. Exiting"
- exit 5
- fi
+check_and_setup_lustre
+rm -rf $DIR/[Rdfs][0-9]*
- # Increment the number of failovers
- val=$((${!var} + 1))
- eval $var=$val
+MAX_RECOV_TIME=$(max_recovery_time)
- CURRENT_TS=$(date +%s)
- ELAPSED=$((CURRENT_TS - START_TS))
+# The test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $HOSTNAME $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
- sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
- # keep count the number of itterations when
- # time spend to failover and two client loads check exceeded
- # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP )
- if [ $sleep -lt $MINSLEEP ]; then
- reqfail=$((reqfail +1))
- log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
-Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
-This iteration, the load was only applied for sleep=$sleep seconds.
-Estimated max recovery time : $max_recov_time
-Probably the hardware is taking excessively long to boot.
-Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918"
- [ $reqfail -gt $REQFAIL ] && exit 6
- fi
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
- log "$SERVERFACET has failed over ${!var} times, and counting..."
+test_failover_mds() {
+ # failover a random MDS
+ failover_target MDS
+}
+run_test failover_mds "failover MDS"
- if [ $((ELAPSED + sleep)) -ge $DURATION ]; then
- break
- fi
+test_failover_ost() {
+ # failover a random OST
+ failover_target OST
+}
+run_test failover_ost "failover OST"
- if [ $sleep -gt 0 ]; then
- echo "sleeping $sleep seconds ... "
- sleep $sleep
- fi
-done
+zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"
-exit 0
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
#!/bin/bash
+# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4:
# client failure does not affect other clients
# 10 minute intervals and verify that no application errors occur.
# Test runs one of CLIENT_LOAD progs on remote clients.
+set -e
-LUSTRE=${LUSTRE:-`dirname $0`/..}
-SETUP=${SETUP:-""}
-CLEANUP=${CLEANUP:-""}
-. $LUSTRE/tests/test-framework.sh
+ONLY=${ONLY:-"$*"}
-init_test_env $@
+# bug number for skipped test:
+ALWAYS_EXCEPT="$RECOVERY_RANDOM_SCALE_EXCEPT"
+# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
init_logging
-DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log
-
-exec 2>$DEBUGLOG
-echo "--- env ---" >&2
-env >&2
-echo "--- env ---" >&2
-set -x
+remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0
-[ "$SHARED_DIRECTORY" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; }
+[ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] &&
+ skip_env "need three or more clients" && exit 0
-check_shared_dir $SHARED_DIRECTORY ||
- error "$SHARED_DIRECTORY isn't a shared directory"
-
-[ -n "$CLIENTS" ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; }
-
-[ $CLIENTCOUNT -ge 3 ] || \
- { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; }
-
-END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
-LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
-VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
-
-remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then
+ skip_env "SHARED_DIRECTORY should be specified with a shared directory \
+which is accessible on all of the nodes"
+ exit 0
+fi
[[ $FAILURE_MODE = SOFT ]] && \
log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797"
-build_test_filter
-
-check_and_setup_lustre
-rm -rf $DIR/[df][0-9]*
-
-max_recov_time=$(max_recovery_time)
-
-# the test node needs to be insulated from a lustre failure as much as possible,
-# so not even loading the lustre modules is ideal.
-# -- umount lustre
-# -- remove hostname from clients list
-zconf_umount $(hostname) $MOUNT
-NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
-
-check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-
-MDTS=$(get_facets MDS)
+# Application failures are allowed for the failed client
+# but not for other clients.
+ERRORS_OK="yes"
if [ "$SLOW" = "no" ]; then
DURATION=${DURATION:-$((60 * 30))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))}
else
DURATION=${DURATION:-$((60 * 60 * 24))}
- SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
fi
+SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
+
+MINSLEEP=${MINSLEEP:-120}
+REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
+REQFAIL=${REQFAIL:-$((DURATION / SERVER_FAILOVER_PERIOD *
+ REQFAIL_PERCENT / 100))}
-rm -f $END_RUN_FILE
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
numfailovers () {
local facet
local var
- for facet in $MDTS ${failed_clients//,/ }; do
+ for facet in $MDTS ${FAILED_CLIENTS//,/ }; do
var=${facet}_nums
val=${!var}
if [ "$val" ] ; then
- echo "$facet failed over $val times"
+ echo "$facet failed over $val times"
fi
done
}
summary_and_cleanup () {
local rc=$?
- local var
trap 0
# Having not empty END_RUN_FILE means the failed loads only
if [ -s $END_RUN_FILE ]; then
- echo "Found the END_RUN_FILE file: $END_RUN_FILE"
- cat $END_RUN_FILE
- local END_RUN_NODE=
- read END_RUN_NODE < $END_RUN_FILE
-
- # A client load will stop if it found the END_RUN_FILE file.
- # That does not mean the client load actually failed though.
- # The first node in END_RUN_FILE is the one we are interested in.
- if [ -n "$END_RUN_NODE" ]; then
- var=$(node_var_name $END_RUN_NODE)_load
- echo "Client load failed on node $END_RUN_NODE"
- echo
- echo "Client $END_RUN_NODE load stdout and debug files:
- $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log
- $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log"
- fi
+ print_end_run_file $END_RUN_FILE
rc=1
fi
local result=PASS
[ $rc -eq 0 ] || result=FAIL
- log "Duration: $DURATION
+ log "Duration: $DURATION
Server failover period: $SERVER_FAILOVER_PERIOD seconds
Exited after: $ELAPSED seconds
Number of failovers before exit:
$(numfailovers)
Status: $result: rc=$rc"
- # stop the vmstats on the OSTs
- if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) "test -f $VMSTAT_PID_FILE &&
- { kill -s TERM \\\$(cat $VMSTAT_PID_FILE);
- rm -f $VMSTAT_PID_FILE || true; }"
- fi
+ # stop vmstat on OSS nodes
+ [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
- # make sure the client loads die
- do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE &&
- { kill -s TERM \\\$(cat $LOAD_PID_FILE);
- rm -f $LOAD_PID_FILE || true; }"
-
- # and free up the pdshes that started them, if any are still around
- if [ -n "$CLIENT_LOAD_PIDS" ]; then
- kill $CLIENT_LOAD_PIDS || true
- sleep 5
- kill -9 $CLIENT_LOAD_PIDS || true
- fi
+ # stop the client loads
+ stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
if [ $rc -ne 0 ]; then
# we are interested in only on failed clients and servers
local failedclients=$(cat $END_RUN_FILE | grep -v $0)
# FIXME: need ostfailover-s nodes also for FLAVOR=OST
- local product=$(gather_logs $(comma_list $(osts_nodes) \
- $(mdts_nodes) $mdsfailover_HOST $failedclients) 1)
- echo $product
+ gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \
+ $mdsfailover_HOST $failedclients)
fi
- [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
-
exit $rc
}
-#
-# MAIN
-#
-log "-----============= $0 starting =============-----"
+################################## Main Flow ###################################
+build_test_filter
-trap summary_and_cleanup EXIT # INT
+check_and_setup_lustre
+rm -rf $DIR/[Rdfs][0-9]*
-ELAPSED=0
+MAX_RECOV_TIME=$(max_recovery_time)
-# vmstat the osts
-if [ "$VMSTAT" ]; then
- do_nodes $(comma_list $(osts_nodes)) \
- "vmstat 1 > $TESTLOG_PREFIX.vmstat.\\\$(hostname -s).log \
- 2>/dev/null </dev/null & echo \\\$! > $VMSTAT_PID_FILE"
-fi
+# The test node needs to be insulated from a lustre failure as much as possible,
+# so not even loading the lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $HOSTNAME $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
-# Start client loads.
-start_client_loads $NODES_TO_USE
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
-echo clients load pids:
-if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then
- exit 3
-fi
+MDTS=$(get_facets MDS)
-START_TS=$(date +%s)
-CURRENT_TS=$START_TS
+# Fail a random client and then failover a random MDS.
+test_fail_client_mds() {
+ local fail_client
+ local serverfacet
+ local client_var
+ local var
-MINSLEEP=${MINSLEEP:-120}
-REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
-REQFAIL=${REQFAIL:-$(( DURATION / SERVER_FAILOVER_PERIOD * REQFAIL_PERCENT / 100))}
-reqfail=0
-sleep=0
+ trap summary_and_cleanup EXIT INT
-# This is used for FAIL_CLIENT only
-ERRORS_OK="yes"
-while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # start vmstat on OSS nodes
+ [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE
- # In order to perform the
- # expected number of failovers, we need to account the following :
- # 1) the time that has elapsed during the client load checking
- # 2) time takes for failover
+ # start client loads
+ rm -f $END_RUN_FILE
+ start_client_loads $NODES_TO_USE
- it_time_start=$(date +%s)
-
- FAIL_CLIENT=$(get_random_entry $NODES_TO_USE)
- client_var=$(node_var_name $FAIL_CLIENT)_nums
+ echo client loads pids:
+ do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
- # store the list of failed clients
- # lists are comma separated
- failed_clients=$(expand_list $failed_clients $FAIL_CLIENT)
+ ELAPSED=0
+ local sleep=0
+ local reqfail=0
+ local it_time_start
+ local start_ts=$(date +%s)
+ local current_ts=$start_ts
- SERVERFACET=$(get_random_entry $MDTS)
- var=${SERVERFACET}_nums
+ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
+ # In order to perform the
+        # expected number of failovers, we need to account for the following:
+ # 1) the time that has elapsed during the client load checking
+ # 2) time takes for failover
+ it_time_start=$(date +%s)
- # Check that our client loads are still running. If any have died,
- # that means they have died outside of recovery, which is unacceptable.
+ fail_client=$(get_random_entry $NODES_TO_USE)
+ client_var=$(node_var_name $fail_client)_nums
- log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
- ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ # store the list of failed clients
+ # lists are comma separated
+ FAILED_CLIENTS=$(expand_list $FAILED_CLIENTS $fail_client)
- if ! check_client_loads $NODES_TO_USE; then
- exit 4
- fi
+ serverfacet=$(get_random_entry $MDTS)
+ var=${serverfacet}_nums
- log "FAIL CLIENT $FAIL_CLIENT ... "
- shutdown_client $FAIL_CLIENT
+ # Check that our client loads are still running. If any have died,
+ # that means they have died outside of recovery, which is unacceptable.
+ log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
+ ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
+ check_client_loads $NODES_TO_USE || exit 4
- log "Starting failover on $SERVERFACET"
+ log "FAIL CLIENT $fail_client..."
+ shutdown_client $fail_client
- facet_failover "$SERVERFACET" || exit 1
- if ! wait_recovery_complete $SERVERFACET ; then
- echo "$SERVERFACET recovery is not completed!"
- exit 7
- fi
+ log "Starting failover on $serverfacet"
+ facet_failover "$serverfacet" || exit 1
- boot_node $FAIL_CLIENT
- echo "Reintegrating $FAIL_CLIENT"
- zconf_mount $FAIL_CLIENT $MOUNT || exit $?
-
- # Increment the number of failovers
- val=$((${!var} + 1))
- eval $var=$val
- val=$((${!client_var} + 1))
- eval $client_var=$val
-
- # load script on failed clients could create END_RUN_FILE
- # We shuold remove it and ignore the failure if this
- # file contains the failed client only.
- # We can not use ERRORS_OK when start all loads at the start of this script
- # because the application errors allowed for random failed client only, but
- # not for all clients.
- if [ -e $END_RUN_FILE ]; then
- read END_RUN_NODE < $END_RUN_FILE
- [[ $END_RUN_NODE = $FAIL_CLIENT ]] &&
- rm -f $END_RUN_FILE || exit 13
- fi
+ if ! wait_recovery_complete $serverfacet; then
+ echo "$serverfacet recovery is not completed!"
+ exit 7
+ fi
- restart_client_loads $FAIL_CLIENT $ERRORS_OK || exit $?
+ boot_node $fail_client
+ echo "Reintegrating $fail_client"
+ zconf_mount $fail_client $MOUNT || exit $?
+ client_up $fail_client || exit $?
+
+ # Increment the number of failovers
+ val=$((${!var} + 1))
+ eval $var=$val
+ val=$((${!client_var} + 1))
+ eval $client_var=$val
+
+ # load script on failed clients could create END_RUN_FILE
+        # We should remove it and ignore the failure if this
+ # file contains the failed client only.
+        # We cannot use ERRORS_OK when starting all of the loads at the
+        # beginning of this script because application errors are allowed
+        # only for the randomly failed client, not for all clients.
+ if [ -e $END_RUN_FILE ]; then
+ local end_run_node
+ read end_run_node < $END_RUN_FILE
+ [[ $end_run_node = $fail_client ]] &&
+ rm -f $END_RUN_FILE || exit 13
+ fi
- # Check that not failed clients loads are still running.
- # No application failures should occur on clients that was not failed.
+ restart_client_loads $fail_client $ERRORS_OK || exit $?
- log "==== Checking the clients loads AFTER failed client reintegrated -- failure NOT OK"
- if ! ERRORS_OK= check_client_loads $(exclude_items_from_list $NODES_TO_USE $FAIL_CLIENT); then
- log "Client load failed. Exiting"
- exit 5
- fi
+ # Check that not failed clients loads are still running.
+ # No application failures should occur on clients that were not failed.
+ log "==== Checking the clients loads AFTER failed client reintegrated \
+-- failure NOT OK"
+ if ! ERRORS_OK= check_client_loads \
+ $(exclude_items_from_list $NODES_TO_USE $fail_client); then
+ log "Client load failed. Exiting..."
+ exit 5
+ fi
- CURRENT_TS=$(date +%s)
- ELAPSED=$((CURRENT_TS - START_TS))
- sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
-
- # keep count the number of itterations when
- # time spend to failover and two client loads check exceeded
- # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP )
- if [ $sleep -lt $MINSLEEP ]; then
- reqfail=$((reqfail +1))
- log "WARNING: failover, client reintegration and check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
-Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
+ current_ts=$(date +%s)
+ ELAPSED=$((current_ts - start_ts))
+ sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
+
+ # Keep counting the number of iterations when
+ # time spent to failover and two client loads check exceeded
+ # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
+ if [ $sleep -lt $MINSLEEP ]; then
+ reqfail=$((reqfail + 1))
+ log "WARNING: failover, client reintegration and \
+check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP!
+Failed to load the filesystem with I/O for a minimum period of \
+$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
This iteration, the load was only applied for sleep=$sleep seconds.
-Estimated max recovery time : $max_recov_time
-Probably the hardware is taking excessively long to boot.
-Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918"
- [ $reqfail -gt $REQFAIL ] && exit 6
- fi
+Estimated max recovery time : $MAX_RECOV_TIME
+Probably the hardware is taking excessively long time to boot.
+Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \
+bug 20918"
+ [ $reqfail -gt $REQFAIL ] && exit 6
+ fi
- log " Number of failovers:
+ log "Number of failovers:
$(numfailovers) and counting..."
- if [ $((ELAPSED + sleep)) -ge $DURATION ]; then
- break
- fi
+ [ $((ELAPSED + sleep)) -ge $DURATION ] && break
- if [ $sleep -gt 0 ]; then
- echo "sleeping $sleep seconds ... "
- sleep $sleep
- fi
-done
+ if [ $sleep -gt 0 ]; then
+ echo "sleeping $sleep seconds... "
+ sleep $sleep
+ fi
+ done
+ exit 0
+}
+run_test fail_client_mds "fail client, then failover MDS"
+
+zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"
-exit 0
+complete $(basename $0) $SECONDS
+check_and_cleanup_lustre
+exit_status
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
load_pid=$!
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): IOR succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): IOR run finished"
+ echoerr "$(date +'%F %H:%M:%S'): IOR succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): IOR run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): IOR failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): IOR failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
+ echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): dbench failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): dbench failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
wait $load_pid
if [ $? -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): dd run finished"
+ echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): dd run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): dd failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): dd failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
load_pid=$!
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
- cd $TMP
- rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
if [ -d $TESTDIR ]; then
- echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR"
- echo $(hostname) >> $END_RUN_FILE
+ echoerr "$(date +'%F %H:%M:%S'): failed to remove $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
CONTINUE=false
fi
- echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
+ echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): iozone failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): iozone failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
#!/bin/bash
-set -x
TMP=${TMP:-/tmp}
TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale}
+TESTNAME=${TESTNAME:-""}
+[ -n "$TESTNAME" ] && TESTLOG_PREFIX=$TESTLOG_PREFIX.$TESTNAME
+
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
rm -f $LOG $DEBUGLOG
exec 2>$DEBUGLOG
+set -x
. $(dirname $0)/functions.sh
RC=0
fi
if [ $RC -eq 0 ]; then
- echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
- cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): tar run finished"
+ echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
+ cd $TMP
+ rm -rf $TESTDIR
+ echoerr "$(date +'%F %H:%M:%S'): tar run finished"
else
- echoerr "$(date +'%F %H:%M:%S'): tar failed"
- if [ -z "$ERRORS_OK" ]; then
- echo $(hostname) >> $END_RUN_FILE
- fi
- if [ $BREAK_ON_ERROR ]; then
- # break
+ echoerr "$(date +'%F %H:%M:%S'): tar failed"
+ if [ -z "$ERRORS_OK" ]; then
+ echo $(hostname) >> $END_RUN_FILE
+ fi
+ if [ $BREAK_ON_ERROR ]; then
+ # break
CONTINUE=false
- fi
+ fi
fi
done
END_RUN_FILE=$END_RUN_FILE \
LOAD_PID_FILE=$LOAD_PID_FILE \
TESTLOG_PREFIX=$TESTLOG_PREFIX \
+ TESTNAME=$TESTNAME \
run_${load}.sh" &
local ppid=$!
log "Started client load: ${load} on $client"
sleep 2
}
-# only for remote client
+# only for remote client
check_client_load () {
local client=$1
local var=$(node_var_name $client)_load
local TESTLOAD=run_${!var}.sh
ps auxww | grep -v grep | grep $client | grep -q "$TESTLOAD" || return 1
-
+
# bug 18914: try to connect several times not only when
# check ps, but while check_catastrophe also
local tries=3
if [ "$rc" != 0 ]; then
log "Client load failed to restart on node $client, rc=$rc"
# failure one client load means test fail
- # we do not need to check other
+ # we do not need to check other
return $rc
fi
else
fi
done
}
+
+# Start vmstat and save its process ID in a file.
+start_vmstat() {
+ local nodes=$1
+ local pid_file=$2
+
+ [ -z "$nodes" -o -z "$pid_file" ] && return 0
+
+ do_nodes $nodes \
+ "vmstat 1 > $TESTLOG_PREFIX.$TESTNAME.vmstat.\\\$(hostname -s).log \
+ 2>/dev/null </dev/null & echo \\\$! > $pid_file"
+}
+
+# Display the nodes on which client loads failed.
+print_end_run_file() {
+ local file=$1
+ local node
+
+ [ -s $file ] || return 0
+
+ echo "Found the END_RUN_FILE file: $file"
+ cat $file
+
+ # A client load will stop if it finds the END_RUN_FILE file.
+ # That does not mean the client load actually failed though.
+ # The first node in END_RUN_FILE is the one we are interested in.
+ read node < $file
+
+ if [ -n "$node" ]; then
+ local var=$(node_var_name $node)_load
+
+ local prefix=$TESTLOG_PREFIX
+ [ -n "$TESTNAME" ] && prefix=$prefix.$TESTNAME
+ local stdout_log=$prefix.run_${!var}_stdout.$node.log
+ local debug_log=$(echo $stdout_log | sed 's/\(.*\)stdout/\1debug/')
+
+ echo "Client load ${!var} failed on node $node:"
+ echo "$stdout_log"
+ echo "$debug_log"
+ fi
+}
+
+# Stop the process which had its PID saved in a file.
+stop_process() {
+ local nodes=$1
+ local pid_file=$2
+
+ [ -z "$nodes" -o -z "$pid_file" ] && return 0
+
+ do_nodes $nodes "test -f $pid_file &&
+ { kill -s TERM \\\$(cat $pid_file); rm -f $pid_file; }" || true
+}
+
+# Stop all client loads.
+stop_client_loads() {
+ local nodes=${1:-$CLIENTS}
+ local pid_file=$2
+
+ # stop the client loads
+ stop_process $nodes $pid_file
+
+ # clean up the processes that started them
+ [ -n "$CLIENT_LOAD_PIDS" ] && kill -9 $CLIENT_LOAD_PIDS 2>/dev/null || true
+}
# End recovery-scale functions
# verify that lustre actually cleaned up properly
[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
mount_client $MOUNT
[ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT
+ clients_up
if [ "$MOUNT_2" ]; then
mount_client $MOUNT2
fi
if is_mounted $MOUNT; then
- [ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]*
+ [ -n "$DIR" ] && rm -rf $DIR/[Rdfs][0-9]* ||
+ error "remove sub-test dirs failed"
[ "$ENABLE_QUOTA" ] && restore_quota_type || true
fi
log " ${TESTSUITE} ${TESTNAME}: @@@@@@ ${TYPE}: $@ "
+ mkdir -p $LOGDIR
# We need to dump the logs on all nodes
if $dump; then
- gather_logs $(comma_list $(nodes_list)) 0
+ gather_logs $(comma_list $(nodes_list))
fi
debugrestore
gather_logs () {
local list=$1
- local tar_logs=$2
local ts=$(date +%s)
local docp=true
[ -f $LOGDIR/shared ] && docp=false
-
+
# dump lustre logs, dmesg
prefix="$TESTLOG_PREFIX.$TESTNAME"
dmesg > ${prefix}.dmesg.\\\$(hostname -s).${suffix}"
if [ ! -f $LOGDIR/shared ]; then
do_nodes $list rsync -az "${prefix}.*.${suffix}" $HOSTNAME:$LOGDIR
- fi
-
- if [ $tar_logs == 1 ]; then
- local archive=$LOGDIR/${TESTSUITE}-$ts.tar.bz2
- tar -jcf $archive $LOGDIR/*$ts* $LOGDIR/*${TESTSUITE}*
-
- echo $archive
fi
}