From 30d9df1a69d325c416ed7027ddd34464f097396f Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 21 Dec 2010 14:00:06 +0000
Subject: [PATCH] LU-123 Port yaml and auster to b1_8

Add the YAML data logging from the 2.0 branch to the 1.8 branch. This
patch was created by applying the 2.0 YAML patch to 1.8 and then
resolving the resulting issues; the larger changes are taken to be
correct because they have already been accepted on the 2.0 master.

This patch also adds the auster.sh script, which allows test results to
be logged to the Maloo database; the actual upload is carried out by the
maloo_upload.sh script. For instructions on how to use auster, run
auster -?; for instructions on Maloo, refer to the Whamcloud wiki, where
a resource of information will be built up.

Change-Id: I602a3534f17544d857aa0a9f9f82d2873fb73a39
Signed-off-by: Chris Gearing
Signed-off-by: Bobi Jam
Reviewed-on: http://review.whamcloud.com/421
Tested-by: Hudson
Reviewed-by: Yu Jian
Reviewed-by: Johann Lombardi
---
 lustre/tests/Makefile.am | 7 +-
 lustre/tests/acceptance-small.sh | 154 ++++++++-------
 lustre/tests/auster.sh | 320 ++++++++++++++++++++++++++++
 lustre/tests/conf-sanity.sh | 22 +--
 lustre/tests/insanity.sh | 12 +-
 lustre/tests/large-scale.sh | 13 +-
 lustre/tests/lfsck.sh | 3 +-
 lustre/tests/liblustre.sh | 1 +
 lustre/tests/lnet-selftest.sh | 9 +-
 lustre/tests/maloo_upload.sh | 31 ++++
 lustre/tests/metadata-updates.sh | 17 +-
 lustre/tests/mmp.sh | 15 +-
 lustre/tests/obdfilter-survey.sh | 7 +-
 lustre/tests/ost-pools.sh | 15 +-
 lustre/tests/parallel-scale.sh | 27 +--
 lustre/tests/performance-sanity.sh | 7 +-
 lustre/tests/racer.sh | 137 ++++++++++++--
 lustre/tests/recovery-double-scale.sh | 19 +-
 lustre/tests/recovery-mds-scale.sh | 33 ++--
 lustre/tests/recovery-random-scale.sh | 33 ++--
 lustre/tests/recovery-small.sh | 29 +--
 lustre/tests/replay-dual.sh | 4 +-
 lustre/tests/replay-ost-single.sh | 11 +-
 lustre/tests/replay-single.sh | 23 +--
 lustre/tests/replay-vbr.sh | 5 +-
 lustre/tests/rpc.sh | 15 +-
 lustre/tests/runtests | 1 +
 lustre/tests/sanity-benchmark.sh | 17 +-
 lustre/tests/sanity-quota.sh | 8 +-
 lustre/tests/sanity.sh | 6 +-
 lustre/tests/sanityn.sh | 4 +
 lustre/tests/sgpdd-survey.sh | 3 +-
 lustre/tests/test-framework.sh | 304 ++++++++++++++++++++----------
 lustre/tests/test-groups/regression | 20 ++
 lustre/tests/test-groups/regression-mpi | 3 +
 lustre/tests/yaml.sh | 191 +++++++++++++++++++
 36 files changed, 1165 insertions(+), 361 deletions(-)
 create mode 100755 lustre/tests/auster.sh
 create mode 100755 lustre/tests/maloo_upload.sh
 create mode 100644 lustre/tests/test-groups/regression
 create mode 100644 lustre/tests/test-groups/regression-mpi
 create mode 100644 lustre/tests/yaml.sh

diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am
index 2262fa4..98d7398 100644
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -23,8 +23,9 @@ noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
 noinst_SCRIPTS += run_dbench.sh run_IOR.sh recovery-double-scale.sh
 noinst_SCRIPTS += recovery-random-scale.sh parallel-scale.sh metadata-updates.sh
 noinst_SCRIPTS += ost-pools.sh rpc.sh lnet-selftest.sh obdfilter-survey.sh mmp.sh
-noinst_SCRIPTS += sgpdd-survey.sh
+noinst_SCRIPTS += sgpdd-survey.sh auster.sh yaml.sh maloo_upload.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
+nobase_noinst_SCRIPTS += test-groups/regression test-groups/regression-mpi
 nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
 nobase_noinst_SCRIPTS +=
racer/dir_create.sh racer/file_create.sh racer/file_list.sh nobase_noinst_SCRIPTS += racer/file_rm.sh racer/racer.sh racer/file_concat.sh @@ -40,7 +41,7 @@ if MPITESTS SUBDIRS = mpi endif noinst_PROGRAMS = openunlink truncate directio writeme mlink utime it_test -noinst_PROGRAMS += tchmod fsx test_brw +noinst_PROGRAMS += tchmod fsx test_brw noinst_PROGRAMS += createmany chownmany statmany multifstat createtest noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany checkstat noinst_PROGRAMS += statone runas openfile rmdirmany @@ -48,7 +49,7 @@ noinst_PROGRAMS += small_write multiop ll_sparseness_verify noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2 noinst_PROGRAMS += mmap_sanity flock_test writemany reads flocks_test -# noinst_PROGRAMS += copy_attr mkdirdeep +# noinst_PROGRAMS += copy_attr mkdirdeep bin_PROGRAMS = mcreate munlink testdir = $(libdir)/lustre/tests test_SCRIPTS = $(noinst_SCRIPTS) $(noinst_PROGRAMS) diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index 524cb9b..83a9532 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -4,26 +4,37 @@ #set -vx set -e -export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE RECOVERY_DOUBLE_SCALE RECOVERY_RANDOM_SCALE PARALLEL_SCALE METADATA_UPDATES OST_POOLS SANITY_BENCHMARK LNET_SELFTEST MMP OBDFILTER_SURVEY SGPDD_SURVEY" +export MSKIPPED=0 +export OSKIPPED=0 + +# This is the default set of tests to run. +DEFAULT_SUITES="runtests sanity sanity-benchmark sanityn lfsck liblustre + racer replay-single conf-sanity recovery-small + replay-ost-single replay-dual replay-vbr insanity sanity-quota + performance-sanity large-scale recovery-mds-scale + recovery-double-scale recovery-random-scale parallel-scale + lustre_rsync-test metadata-updates ost-pools lnet-selftest + mmp obdfilter-survey sgpdd-survey" + +if [[ -n $@ ]]; then + ACC_SM_ONLY="${ACC_SM_ONLY} $@" +fi if [ "$ACC_SM_ONLY" ]; then - for O in $TESTSUITE_LIST; do - export ${O}="no" + for O in $DEFAULT_SUITES; do + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") + export ${O}="no" done for O in $ACC_SM_ONLY; do - O=`echo ${O%.sh} | tr "-" "_"` - O=`echo $O | tr "[:lower:]" "[:upper:]"` - export ${O}="yes" + O=`echo ${O%.sh} | tr "-" "_"` + O=`echo $O | tr "[:lower:]" "[:upper:]"` + export ${O}="yes" done fi -LIBLUSTRETESTS=${LIBLUSTRETESTS:-../liblustre/tests} - -RANTEST="" - LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . 
$LUSTRE/tests/test-framework.sh -init_test_env $@ +init_test_env SETUP=${SETUP:-setupall} FORMAT=${FORMAT:-formatall} @@ -65,57 +76,21 @@ find_in_path() { title() { # update titlebar if stdin is attached to an xterm if ${UPDATE_TITLEBAR:-false}; then - if tty -s; then - case $TERM in - xterm*) - echo -ne "\033]2; acceptance-small: $* \007" >&0 - ;; - esac - fi - fi + if tty -s; then + case $TERM in + xterm*) + echo -ne "\033]2; acceptance-small: $* \007" >&0 + ;; + esac + fi + fi log "-----============= acceptance-small: "$*" ============----- `date`" - RANTEST=${RANTEST}$*", " -} - -skip_remost() { - remote_ost_nodsh && log "SKIP: $1: remote OST with nodsh" && return 0 - return 1 -} - -skip_remmds() { - remote_mds_nodsh && log "SKIP: $1: remote MDS with nodsh" && return 0 - return 1 -} - -# cleanup the logs of all suites -cleanup_log () { - local suite - local o=$(echo $O | tr "[:upper:]" "[:lower:]") - o=${o//_/-} - - rm -f ${TMP}/${o}.log } -cleanup_logs () { - local suite - for suite in ${ACC_SM_ONLY:-$TESTSUITE_LIST}; do - cleanup_log $suite - done -} - -export NAME MOUNT START CLEAN -. $LUSTRE/tests/cfg/$NAME.sh - -assert_env mds_HOST MDS_MKFS_OPTS MDSDEV -assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT -assert_env FSNAME MOUNT MOUNT2 - -setup_if_needed - -for s in ${ACC_SM_ONLY:-$TESTSUITE_LIST}; do - suite_name=$(echo ${s%.sh} | tr "[:upper:]_" "[:lower:]-" ) - suite=$(echo ${suite_name} | tr "[:lower:]-" "[:upper:]_") - suite_only=ONLY # Change to ${suite}_ONLY after fixing YALA +run_suite() { + local suite_name=$(echo ${1%.sh} | tr "[:upper:]_" "[:lower:]-" ) + local suite=$(echo ${suite_name} | tr "[:lower:]-" "[:upper:]_") + local suite_only=ONLY # Change to ${suite}_ONLY after fixing YALA if is_sanity_benchmark ${suite_name}; then suite_only=suite_name @@ -130,34 +105,55 @@ for s in ${ACC_SM_ONLY:-$TESTSUITE_LIST}; do suite_script=${suite_name}.sh else echo "Can't find test script for $suite_name" - exit 1 + return 1 fi echo "$suite_script located." - - if [[ ${!suite} = no ]]; then + if [[ ${!suite} != no ]]; then + local rc + local status + local duration + local start_ts=$(date +%s) + rm -rf $TF_FAIL + title $suite_name + log_test $suite_name + bash $suite_script ${!suite_only} + rc=$? + duration=$(($(date +%s) - $start_ts)) + if [ -f $TF_FAIL -o $rc -ne 0 ]; then + status="FAIL" + else + status="PASS" + fi + echo "Script: $status" + log_test_status $duration $status + + $CLEANUP + [ x$suite = xSGPDD_SURVEY ] || $SETUP + + eval ${suite}="done" + else echo "Skipping $suite_name" - continue fi +} - start_ts=$(date +%s) - title $suite_name - bash $suite_script ${!suite_only} - rc=$? - duration=$(($(date +%s) - $start_ts)) - if [ $rc -ne 0 ]; then - RC=$rc - status="FAIL" - else - status="PASS" - fi - echo "Script: $status" +run_suites() { + for suite in $*; do + run_suite $suite + done +} + +export NAME MOUNT START CLEAN +. $LUSTRE/tests/cfg/$NAME.sh +assert_env mds_HOST MDS_MKFS_OPTS +assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT +assert_env FSNAME MOUNT MOUNT2 + +setup_if_needed +init_logging - $CLEANUP - [ x$suite = xSGPDD_SURVEY ] || $SETUP - eval ${suite}="done" -done +run_suites ${ACC_SM_ONLY:-$DEFAULT_SUITES} RC=$? title FINISHED diff --git a/lustre/tests/auster.sh b/lustre/tests/auster.sh new file mode 100755 index 0000000..17c60e1 --- /dev/null +++ b/lustre/tests/auster.sh @@ -0,0 +1,320 @@ +#!/bin/bash +# +# +# auster - drive lustre tests +# TODO +# 1. --time-limt add per test time limit, kill test if it runs to long +# 2. Read list of tests to run from a file. 
same syntax as cli, but one test per line +# 3. Run test on remote node +# 4. Use long opts for auster options + +set -e + +export TF_FAIL=/tmp/tf.fail + +usage() { + less -F <.sh) + -g GROUP Test group file (Overrides tests listed on command line) + -i N Repeat tests N times (default 1). A new directory + will be created under LOGDIR for each iteration. + -k Don't stop when subtests fail + -R Remount lustre between tests + -r Reformat (during initial configuration if needed) + -s SLOW=yes + -v Verbose mode + -l Send logs to the Maloo database after run + (can be done later by running maloo_upload.sh) + -h This help. + +Suite options +These are suite specific options that can be specified after each suite on +the command line. + suite-name [options] + --only LIST Run only specific list of subtests + --except LIST Skip list of subtests + --start-at SUBTEST Start testing from subtest + --stop-at SUBTEST Stop testing at subtest + --time-limit LIMIT Don't allow this suite to run longer + than LIMT seconds. [UNIMPLEMENTED] + +Example usage: +Run all of sanity and all of replay-single except for 70b with SLOW=y using +the default "local" configuration. + + auster -s sanity replay-single --except 70b + +Run all tests in the regression group 5 times using large config. + + auster -f large -g test-groups/regression -r 5 + +EOF + exit +} + +dry_run=false +do_reset=false +verbose=false +repeat_count=1 +upload_logs=false +reformat=false +test_logs_dir=/tmp/test_logs/$(date +%Y-%m-%d)/$(date +%H%M%S) +export SLOW=no +export ${NAME:=local} +while getopts "c:d:D:nkf:g:i:rRslhv" opt +do + case "$opt" in + c) CONFIG=$OPTARG;; + d) test_logs_dir=$OPTARG/$(date +%Y-%m-%d)/$(date +%H%M%S);; + D) test_logs_dir=$OPTARG;; + g) test_group_file=$OPTARG;; + k) export FAIL_ON_ERROR=false;; + n) dry_run=:;; + v) verbose=:;; + i) repeat_count=$OPTARG;; + f) NAME=$OPTARG;; + R) do_reset=:;; + r) reformat=:;; + s) SLOW=yes;; + l) upload_logs=true;; + h|\?) usage;; + esac +done + +# If a test_group_file is specified, then ignore rest of command line +if [[ $test_group_file ]]; then + export TEST_GROUP=$(basename $test_group_file) + set $(sed 's/#.*$//' $test_group_file) +else + shift $((OPTIND -1)) +fi + +reset_lustre() { + if $do_reset; then + stopall + setupall + fi +} + +STARTTIME=`date +%s` + +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} +. $LUSTRE/tests/test-framework.sh +init_test_env + +print_summary () { + trap 0 + local form="%-13s %-17s %s\n" + printf "$form" "status" "script" "skipped tests E(xcluded) S(low)" + echo "------------------------------------------------------------------------------------" + echo "Done!" +} + + +setup_if_needed() { + nfs_client_mode && return + auster_cleanup=false + + local MOUNTED=$(mounted_lustre_filesystems) + if $(echo $MOUNTED | grep -w -q $MOUNT); then + check_config_clients $MOUNT + # init_facets_vars + # init_param_vars + return + fi + + echo "Lustre is not mounted, trying to do setup ... " + $reformat && formatall + setupall + + MOUNTED=$(mounted_lustre_filesystems) + if ! $(echo $MOUNTED | grep -w -q $MOUNT); then + echo "Lustre is not mounted after setup! 
" + exit 1 + fi + auster_cleanup=true +} + +cleanup_if_needed() { + if $auster_cleanup; then + cleanupall + fi +} + +find_script_in_path() { + target=$1 + path=$2 + for dir in $(tr : " " <<< $path); do + if [ -e $dir/$target ]; then + echo $dir/$target + return 0 + fi + if [ -e $dir/$target.sh ]; then + echo $dir/$target.sh + return 0 + fi + done + return 1 +} + +title() { + log "-----============= acceptance-small: "$*" ============----- `date`" +} + +doit() { + if $dry_run; then + printf "Would have run: %s\n" "$*" + return 0 + fi + if $verbose; then + printf "Running: %s\n" "$*" + fi + "$@" +} + + +run_suite() { + suite_name=$1 + suite_script=$2 + title $suite_name + log_test $suite_name + + rm -f $TF_FAIL + local start_ts=$(date +%s) + doit bash $suite_script + rc=$? + duration=$(($(date +%s) - $start_ts)) + if [ -f $TF_FAIL -o $rc -ne 0 ]; then + status="FAIL" + else + status="PASS" + fi + log_test_status $duration $status + + reset_lustre +} + +run_suite_logged() { + local suite_name=${1%.sh} + local suite=$(echo ${suite_name} | tr "[:lower:]-" "[:upper:]_") + + suite_script=$(find_script_in_path $suite_name $PATH:$LUSTRE/tests) + + if [[ -z $suite_script ]]; then + echo "Can't find test script for $suite_name" + return 1 + fi + + echo "run_suite $suite_name $suite_script" + local log_name=${suite_name}.suite_log.$(hostname).log + if $verbose; then + run_suite $suite_name $suite_script 2>&1 |tee $LOGDIR/$log_name + else + run_suite $suite_name $suite_script > $LOGDIR/$log_name 2>&1 + fi + +} + +# +# Add this to test-framework somewhere. +reset_logging() { + export LOGDIR=$1 + unset YAML_LOG + init_logging +} + +split_commas() { + echo "${*//,/ }" +} + +run_suites() { + local n=0 + local argv=("$@") + while ((n < repeat_count)); do + local RC=0 + local logdir=${test_logs_dir} + ((repeat_count > 1)) && logdir="$logdir/$n" + reset_logging $logdir + set -- "${argv[@]}" + while [[ -n $1 ]]; do + unset ONLY EXCEPT START_AT STOP_AT + local opts="" + local time_limit="" +# echo "argv: $*" + suite=$1 + shift; + while [[ -n $1 ]]; do + case "$1" in + --only) + shift; + export ONLY=$(split_commas $1) + opts+="ONLY=$ONLY ";; + --except) + shift; + export EXCEPT=$(split_commas $1) + opts+="EXCEPT=$EXCEPT ";; + --start-at) + shift; + export START_AT=$1 + opts+="START_AT=$START_AT ";; + --stop-at) + shift; + export STOP_AT=$1 + opts+="STOP_AT=$STOP_AT ";; + --time-limit) + shift; + time_limit=$1;; + *) + break;; + esac + shift + done + echo "running: $suite $opts" + run_suite_logged $suite || RC=$? + echo $suite returned $RC + done + if $upload_logs; then + $upload_script $LOGDIR + fi + n=$((n + 1)) + done +} + +if [ $upload_logs = true ] ; then + upload_script=$(find_script_in_path maloo_upload.sh $PATH:$LUSTRE/tests) + if [[ -z $upload_script ]]; then + echo "Can't find maloo_upload.sh script" + exit 1 + fi + + if [ ! -r ~/.maloorc ] ; then + echo "A ~/.maloorc file is required in order to upload results." + echo "Visit your maloo web interface to download your .maloorc file" + exit 1 + fi +fi + +export NAME MOUNT START CLEAN +. ${CONFIG:-$LUSTRE/tests/cfg/$NAME.sh} + +assert_env mds_HOST MDS_MKFS_OPTS +assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT +assert_env FSNAME MOUNT MOUNT2 + +echo "Started at `date`" +setup_if_needed + +run_suites "$@" +RC=$? 
+ +if [[ $RC -eq 0 ]]; then + cleanup_if_needed +fi + +echo "Finished at `date` in $((`date +%s` - $STARTTIME))s" +echo "$0: completed with rc $RC" && exit $RC diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index d2c8f148..f2e34b4 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -31,6 +31,7 @@ HOSTNAME=`hostname` . $LUSTRE/tests/test-framework.sh init_test_env $@ +init_logging # STORED_MDSSIZE is used in test_18 if [ -n "$MDSSIZE" ]; then STORED_MDSSIZE=$MDSSIZE @@ -40,15 +41,14 @@ MDSSIZE=40000 OSTSIZE=40000 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 + if ! combined_mgs_mds; then # bug number for skipped test: 23954 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 24b" fi -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 - -# [ "$SLOW" = "no" ] && EXCEPT_SLOW="30 31 45" assert_DIR @@ -456,7 +456,7 @@ test_5f() { sleep 5 - if ! ps -f -p $pid >/dev/null; then + if ! ps -f -p $pid >/dev/null; then wait $pid rc=$? grep " $MOUNT " /etc/mtab && echo "test 5f: mtab after mount" @@ -469,7 +469,7 @@ test_5f() { # start mds start_mds - # mount should succeed after start mds + # mount should succeed after start mds wait $pid rc=$? [ $rc -eq 0 ] || error "mount returned $rc" @@ -649,7 +649,7 @@ test_18() { echo "mount mds with large journal..." local OLD_MDS_MKFS_OPTS=$MDS_MKFS_OPTS - local opts="--mdt --fsname=$FSNAME --device-size=$myMDSSIZE --param sys.timeout=$TIMEOUT $MDSOPT" + local opts="--mdt --fsname=$FSNAME --device-size=$myMDSSIZE --param sys.timeout=$TIMEOUT $MDSOPT" if combined_mgs_mds ; then MDS_MKFS_OPTS="--mgs $opts" @@ -983,7 +983,7 @@ test_27b() { setup # interop 1.8 <-> 2.0: - # 1.8: group_acquire_expire, 2.0: identity_acquire_expire + # 1.8: group_acquire_expire, 2.0: identity_acquire_expire local acquire_expire=$(do_facet mds lctl get_param md*.$FSNAME-MDT0000.*acquire_expire | \ cut -d= -f1 | cut -d. -f3) facet_failover mds @@ -1511,7 +1511,7 @@ test_35b() { # bug 18674 return 1 local at_max_saved=0 - # adaptive timeouts may prevent seeing the issue + # adaptive timeouts may prevent seeing the issue if at_is_enabled; then at_max_saved=$(at_max_get mds) at_max_set 0 mds client @@ -1869,7 +1869,7 @@ cleanup_46a() { stop ost${count} -f || rc=$? let count=count-1 done - stop_mds || rc=$? + stop_mds || rc=$? cleanup_nocli || rc=$? #writeconf to remove all ost2 traces for subsequent tests writeconf @@ -1887,7 +1887,7 @@ test_46a() { mount_client $MOUNT || return 3 trap "cleanup_46a $OSTCOUNT" EXIT ERR - local i + local i for (( i=2; i<=$OSTCOUNT; i++ )); do start ost$i `ostdevname $i` $OST_MOUNT_OPTS || return $((i+2)) done diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 8f40d52..8206a85 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -9,7 +9,7 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} - +init_logging ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" if [ "$FAILURE_MODE" = "HARD" ]; then @@ -33,8 +33,8 @@ assert_env mds_HOST MDS_MKFS_OPTS MDSDEV assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT assert_env LIVE_CLIENT FSNAME -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 # FAIL_CLIENTS list should not contain the LIVE_CLIENT FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g") @@ -64,9 +64,9 @@ fail_clients() { log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM" if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then - num=$((FAIL_NUM - DOWN_NUM)) + num=$((FAIL_NUM - DOWN_NUM)) fi - + if [ -z "$num" ] || [ "$num" -le 0 ]; then log "No clients failed!" return @@ -505,7 +505,7 @@ run_test 8 "Eighth Failure Mode: CLIENT/OST `date`" ############### Ninth Failure Mode ############### test_9() { - echo + echo #Create files echo "Verify Lustre filesystem is up and running" diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh index 51b8777..d7b6ce2 100644 --- a/lustre/tests/large-scale.sh +++ b/lustre/tests/large-scale.sh @@ -15,8 +15,9 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 [ -n "$CLIENTS" ] || { skip_env "$0: Need two or more clients" && exit 0; } [ $CLIENTCOUNT -ge 2 ] || \ @@ -35,7 +36,7 @@ rm -rf $DIR/[df][0-9]* # VBR scale tests check_vbr () { - do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery" + do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery" } check_vbr || \ @@ -119,7 +120,7 @@ test_1c() { replay_barrier mds do_nodes $CLIENTS "createmany -o $DIR/$tfile-\\\$(hostname)" 25 # XXX For FAILURE_MODE=HARD it is better to exclude - # shutdown_facet and reboot_facet time + # shutdown_facet and reboot_facet time fail_mds local current_ts=`date +%s` @@ -178,7 +179,7 @@ test_3a() { local -a nodes=(${CLIENTS//,/ }) - # INCREMENT is a number of clients + # INCREMENT is a number of clients # a half of clients by default increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))} @@ -205,7 +206,7 @@ test_3a() { local num=$increment while [ $num -le $CLIENTCOUNT ]; do - list=$(comma_list ${nodes[@]:0:$num}) + list=$(comma_list ${nodes[@]:0:$num}) generate_machine_file $list $machinefile || { error "can not generate machinefile"; exit 1; } @@ -231,7 +232,7 @@ test_3a() { fi duration=$(do_facet mds lctl get_param -n $procfile | grep recovery_duration) - + res=( "${res[@]}" "$num" ) res=( "${res[@]}" "$duration" ) echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration" diff --git a/lustre/tests/lfsck.sh b/lustre/tests/lfsck.sh index b23559d..926949e 100644 --- a/lustre/tests/lfsck.sh +++ b/lustre/tests/lfsck.sh @@ -9,6 +9,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging NUMFILES=${NUMFILES:-10} NUMDIRS=${NUMDIRS:-4} @@ -156,7 +157,7 @@ get_files() { esac local files="" - local f + local f for f in $(seq -f testfile.%g $first $last); do test_file=$test_dir/$f files="$files $test_file" diff --git a/lustre/tests/liblustre.sh b/lustre/tests/liblustre.sh index 12af4d7..0ad8c35 100644 --- a/lustre/tests/liblustre.sh +++ b/lustre/tests/liblustre.sh @@ -8,6 +8,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging LIBLUSTRETESTS=${LIBLUSTRETESTS:-$LUSTRE/liblustre/tests} diff --git a/lustre/tests/lnet-selftest.sh b/lustre/tests/lnet-selftest.sh index f4dd5b3..be4b2e8 100755 --- a/lustre/tests/lnet-selftest.sh +++ b/lustre/tests/lnet-selftest.sh @@ -4,6 +4,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # ALWAYS_EXCEPT="$ALWAYS_EXCEPT $LNET_SELFTEST_EXCEPT" @@ -104,7 +105,6 @@ test_smoke_sub () { echo 'trap "cleanup $pid" INT TERM' echo sleep $smoke_DURATION echo 'cleanup $pid' - } run_lst () { @@ -137,24 +137,23 @@ test_smoke () { local log=$TMP/$tfile.log local rc=0 - test_smoke_sub $servers $clients 2>&1 > $runlst + test_smoke_sub $servers $clients 2>&1 > $runlst cat $runlst run_lst $runlst | tee $log rc=${PIPESTATUS[0]} [ $rc = 0 ] || error "$runlst failed: $rc" - + lst_end_session --verbose | tee -a $log # error counters in "lst show_error" should be checked check_lst_err $log - } run_test smoke "lst regression test" complete $(basename $0) $SECONDS if [ "$RESTORE_MOUNT" = yes ]; then setupall -fi +fi exit_status diff --git a/lustre/tests/maloo_upload.sh b/lustre/tests/maloo_upload.sh new file mode 100755 index 0000000..dc81ed0 --- /dev/null +++ b/lustre/tests/maloo_upload.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +FILENAME=$1 + +if [ -r ~/.maloorc ] ; then + source ~/.maloorc +else + echo "Error: ~/.maloorc not found. Please obtain this file from the maloo web interface, under 'Upload results'" + exit 1 +fi + +if [ -z $FILENAME ] ; then + echo "Usage: ${0} " + exit 2 +fi + + +if [ ! -r $FILENAME ] ; then + echo "Input file '$FILENAME' not found" + exit 3 +fi + +echo Uploading $FILENAME to $MALOO_URL +if [ -d $FILENAME ] ; then + pushd $FILENAME + tar czf - * | curl -F "user_id=${MALOO_USER_ID}" -F "upload=@-" -F "user_upload_token=${MALOO_UPLOAD_TOKEN}" ${MALOO_URL} > /dev/null + popd +else + curl -F "user_id=${MALOO_USER_ID}" -F "upload=@${FILENAME}" -F "user_upload_token=${MALOO_UPLOAD_TOKEN}" ${MALOO_URL} > /dev/null +fi +echo Complete. diff --git a/lustre/tests/metadata-updates.sh b/lustre/tests/metadata-updates.sh index 9ef46ee..a698981 100755 --- a/lustre/tests/metadata-updates.sh +++ b/lustre/tests/metadata-updates.sh @@ -10,6 +10,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TRACE=${TRACE:-"+x"} @@ -71,7 +72,7 @@ do_write () { do_nodes $NODES_TO_USE "set $TRACE; TESTFILE=$TESTDIR/\\\$(hostname)/$FILE; dd if=/dev/zero of=\\\$TESTFILE bs=$FILE_SIZE count=1 2>/dev/null || exit 54; -echo \\\$(hostname) | dd of=\\\$TESTFILE conv=notrunc 2>/dev/null || exit 55; +echo \\\$(hostname) | dd of=\\\$TESTFILE conv=notrunc 2>/dev/null || exit 55; md5sum \\\$TESTFILE >> $SUMFILE; " || return ${PIPESTATUS[0]} return 0 } @@ -90,7 +91,7 @@ do_truncate () { do_nodes $NODES_TO_USE "set $TRACE; TESTFILE=$TESTDIR/\\\$(hostname)/$FILE; -$TRUNCATE \\\$TESTFILE 0" || return ${PIPESTATUS[0]} +$TRUNCATE \\\$TESTFILE 0" || return ${PIPESTATUS[0]} FILE_SIZE=0 return 0 @@ -103,7 +104,7 @@ get_stat () { echo "Checking file(s) attributes ... " do_nodesv $NODES_TO_USE "set $TRACE; -for HOST in ${HOSTS//,/ } ; do +for HOST in ${HOSTS//,/ } ; do TESTFILE=$TESTDIR/\\\$HOST/$FILE; tmp=\\\$(stat -c \\\"%u %g %s 0%a\\\" \\\$TESTFILE); echo \\\"\\\$TESTFILE [ uid gid size mode ] expected : $attr ; got : \\\$tmp \\\"; @@ -112,7 +113,7 @@ for HOST in ${HOSTS//,/ } ; do exit 56; fi; done " || return ${PIPESTATUS[0]} - return 0 + return 0 } do_chmod () { @@ -121,7 +122,7 @@ do_chmod () { do_nodes $NODES_TO_USE "set $TRACE; TESTFILE=$TESTDIR/\\\$(hostname)/$FILE; chmod $NEW_MODE \\\$TESTFILE" || return ${PIPESTATUS[0]} - + CURRENT_MODE=$NEW_MODE return 0 } @@ -146,7 +147,7 @@ do_check_timestamps () { echo "Checking atime, mtime ... " do_nodesv $NODES_TO_USE "set $TRACE; -for HOST in ${HOSTS//,/ } ; do +for HOST in ${HOSTS//,/ } ; do TESTFILE=$TESTDIR/\\\$HOST/$FILE; tmp=\\\$(stat -c \\\"%X %Y\\\" \\\$TESTFILE); if [ x\\\"\\\$tmp\\\" != x\\\"$times\\\" ] ; then @@ -155,7 +156,7 @@ for HOST in ${HOSTS//,/ } ; do fi; done; exit \\\$RC" || return ${PIPESTATUS[0]} - return 0 + return 0 } do_fill_dir () { @@ -176,7 +177,7 @@ check_dir_contents () { echo "Checking dir contents ... (should exist files : f$num_files ... f$NUM_FILES) ... " do_nodes $NODES_TO_USE "set $TRACE; -for HOST in ${HOSTS//,/ } ; do +for HOST in ${HOSTS//,/ } ; do DIR=$TESTDIR/\\\$HOST; for i in \\\$(seq $NUM_FILES -1 $num_files) ; do if ! [ -f \\\$DIR/f\\\$i ] ; then diff --git a/lustre/tests/mmp.sh b/lustre/tests/mmp.sh index 6b7c256..4eca25c 100755 --- a/lustre/tests/mmp.sh +++ b/lustre/tests/mmp.sh @@ -22,9 +22,10 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 # unmount and cleanup the Lustre filesystem MMP_RESTORE_MOUNT=false @@ -164,7 +165,7 @@ mmp_init() { fi local var=${MMP_OSS}failover_HOST - + if [ -z "${!var}" ]; then log "Failover is not used on OSS, enabling MMP manually..." enable_mmp $MMP_OSS $MMP_OSTDEV || \ @@ -204,7 +205,7 @@ mmp_fini() { return 0 } -# Mount the shared target on the failover server after some interval it's +# Mount the shared target on the failover server after some interval it's # mounted on the primary server. mount_after_interval_sub() { local interval=$1 @@ -269,7 +270,7 @@ mount_after_interval() { return 0 } -# Mount the shared target on the failover server +# Mount the shared target on the failover server # during unmounting it on the primary server. 
mount_during_unmount() { local device=$1 @@ -309,7 +310,7 @@ mount_during_unmount() { return 0 } -# Mount the shared target on the failover server +# Mount the shared target on the failover server # after clean unmounting it on the primary server. mount_after_unmount() { local device=$1 @@ -323,7 +324,7 @@ mount_after_unmount() { start $facet $device $mnt_opts || return ${PIPESTATUS[0]} log "Unmounting $device on $facet..." - stop $facet || return ${PIPESTATUS[0]} + stop $facet || return ${PIPESTATUS[0]} log "Mounting $device on $failover_facet..." start $failover_facet $device $mnt_opts || return ${PIPESTATUS[0]} diff --git a/lustre/tests/obdfilter-survey.sh b/lustre/tests/obdfilter-survey.sh index 043883f..cc84b4d 100644 --- a/lustre/tests/obdfilter-survey.sh +++ b/lustre/tests/obdfilter-survey.sh @@ -5,12 +5,13 @@ set -e LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ +init_logging nobjhi=${nobjhi:-1} -thrhi=${thrhi:-16} +thrhi=${thrhi:-16} size=${size:-1024} -# the summary file a bit smaller than OSTSIZE +# the summary file a bit smaller than OSTSIZE . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} [ "$SLOW" = no ] && { nobjhi=1; thrhi=4; } @@ -85,7 +86,7 @@ print_jbd () { local varsvc=${facet}_svc local dev=$(ldiskfs_canon "*.${!varsvc}.mntdev" $facet) - # ext4: /proc/fs/jbd2/sda1:8/history + # ext4: /proc/fs/jbd2/sda1:8/history # ext3: /proc/fs/jbd/sdb1/history do_facet $facet cat /proc/fs/jbd*/${dev}*/$file diff --git a/lustre/tests/ost-pools.sh b/lustre/tests/ost-pools.sh index c47dd3e..79e9d9c 100644 --- a/lustre/tests/ost-pools.sh +++ b/lustre/tests/ost-pools.sh @@ -25,6 +25,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging check_and_setup_lustre @@ -142,7 +143,7 @@ check_file_in_osts() { local ost_count=$($GETSTRIPE $file | grep 0x | wc -l) [[ -n "$count" ]] && [[ $ost_count -ne $count ]] && \ { error "Stripe count $count expected; got $ost_count" && return 1; } - + return 0 } @@ -681,10 +682,10 @@ test_12() { add_pool $POOL2 $FSNAME-OST[$TGT_FIRST] "$FIRST_UUID " do_facet $SINGLEMDS lctl pool_list $FSNAME.$POOL2 - echo Checking the files again + echo Checking the files again check_dir_in_pool $POOL_ROOT/dir1 $POOL check_dir_in_pool $POOL_ROOT/dir2 $POOL2 - check_file_in_osts $POOL_ROOT/file1 "$TGT_LIST2" + check_file_in_osts $POOL_ROOT/file1 "$TGT_LIST2" check_file_in_osts $POOL_ROOT/file2 "$(seq $start 2 $TGT_MAX)" echo Creating some more files @@ -693,14 +694,14 @@ test_12() { create_file $POOL_ROOT/file3 $POOL create_file $POOL_ROOT/file4 $POOL2 - echo Checking the new files + echo Checking the new files check_file_in_pool $POOL_ROOT/file3 $POOL check_file_in_pool $POOL_ROOT/file4 $POOL2 destroy_pool $POOL destroy_pool $POOL2 - return 0 + return 0 } run_test 12 "OST Pool Membership" @@ -786,7 +787,7 @@ test_14() { create_dir $POOL_ROOT/dir1 $POOL 1 create_file $POOL_ROOT/dir1/file $POOL 1 - local OST=$($GETSTRIPE $POOL_ROOT/dir1/file | grep 0x | cut -f2) + local OST=$($GETSTRIPE $POOL_ROOT/dir1/file | grep 0x | cut -f2) i=0 while [[ $i -lt $numfiles ]]; do @@ -1297,7 +1298,7 @@ test_24() { error "Stripe count ($count) not inherited in $file ($count1)" [[ "$size" != "$size1" ]] && [[ "$size" != "0" ]] && \ error "Stripe size ($size) not inherited in $file ($size1)" - done + done done rm -rf $POOL_ROOT diff --git a/lustre/tests/parallel-scale.sh b/lustre/tests/parallel-scale.sh index 061db6d..73e0040 100644 --- 
a/lustre/tests/parallel-scale.sh +++ b/lustre/tests/parallel-scale.sh @@ -6,6 +6,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # bug number: ALWAYS_EXCEPT="$PARALLEL_SCALE_EXCEPT" @@ -139,7 +140,7 @@ test_compilebench() { mkdir -p $testdir local savePWD=$PWD - cd $cbench_DIR + cd $cbench_DIR local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej" log "$cmd" @@ -147,7 +148,7 @@ test_compilebench() { local rc=0 eval $cmd rc=$? - + cd $savePWD [ $rc = 0 ] || error "compilebench failed: $rc" rm -rf $testdir @@ -260,9 +261,9 @@ test_connectathon() { # -s special # -l lock # -a all of the above - # + # # -f a quick functionality test - # + # tests="-b -g -s" # Include lock tests unless we're running on nfsv4 @@ -306,7 +307,7 @@ test_ior() { echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)" fi - + generate_machine_file $clients $MACHINEFILE || return $? print_opts IOR ior_THREADS ior_DURATION MACHINEFILE @@ -316,13 +317,13 @@ test_ior() { # mpi_run uses mpiuser chmod 0777 $testdir if [ "$NFSCLIENT" ]; then - setstripe_nfsserver $testdir -c -1 || - { error "setstripe on nfsserver failed" && return 1; } + setstripe_nfsserver $testdir -c -1 || + { error "setstripe on nfsserver failed" && return 1; } else $LFS setstripe $testdir -c -1 || { error "setstripe failed" && return 2; } fi - # + # # -b N blockSize -- contiguous bytes to write per task (e.g.: 8, 4k, 2m, 1g)" # -o S testFileName # -t N transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)" @@ -342,7 +343,7 @@ test_ior() { rm -rf $testdir } run_test ior "ior" - + test_cascading_rw() { if [ "$NFSCLIENT" ]; then skip "skipped for NFSCLIENT mode" @@ -369,7 +370,7 @@ test_cascading_rw() { # mpi_run uses mpiuser chmod 0777 $testdir - # -g: debug mode + # -g: debug mode # -n: repeat test # times local cmd="$CASC_RW -g -d $testdir -n $casc_REP" @@ -391,7 +392,7 @@ test_write_append_truncate() { return fi - # location is lustre/tests dir + # location is lustre/tests dir if ! which write_append_truncate > /dev/null 2>&1 ; then skip_env "write_append_truncate not found" return @@ -578,9 +579,9 @@ test_statahead () { cancel_lru_locks mdc - local cmd="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir $testdir --nfiles $num_files --filefmt 'f%%d'" + local cmd="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir $testdir --nfiles $num_files --filefmt 'f%%d'" echo "+ $cmd" - + mpi_run -np $((num_clients * 32)) -machinefile ${MACHINEFILE} $cmd local rc=$? diff --git a/lustre/tests/performance-sanity.sh b/lustre/tests/performance-sanity.sh index 918b891..b217d0d 100644 --- a/lustre/tests/performance-sanity.sh +++ b/lustre/tests/performance-sanity.sh @@ -11,13 +11,14 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging [ -x "$MDSRATE" ] || FAIL_ON_ERROR=true error "No mdsrate program. Aborting." which mpirun > /dev/null 2>&1 || \ - FAIL_ON_ERROR=true error "No mpirun program. Aborting." + FAIL_ON_ERROR=true error "No mpirun program. Aborting." 
# Skip these tests -# bug number: 15266 15266 +# bug number: 15266 15266 ALWAYS_EXCEPT="1 2 $PERFORMANCE_SANITY_EXCEPT" build_test_filter @@ -28,7 +29,7 @@ test_1() { } run_test 1 "single-client IO perf =====" -# parallel-IOR-rates +# parallel-IOR-rates test_2() { echo "MPI coordinated test of parallel filesystem system calls and library functions" } diff --git a/lustre/tests/racer.sh b/lustre/tests/racer.sh index 3567ebd..327f051 100644 --- a/lustre/tests/racer.sh +++ b/lustre/tests/racer.sh @@ -1,4 +1,7 @@ #!/bin/bash +# -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*- +# vim:autoindent:shiftwidth=4:tabstop=4: + #set -vx set -e @@ -7,9 +10,11 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging racer=$LUSTRE/tests/racer/racer.sh echo racer: $racer +[ -z "$racer" ] && echo racer is not installed && exit 1 CLIENTS=${CLIENTS:-$HOSTNAME} RACERDIRS=${RACERDIRS:-$DIR} @@ -23,38 +28,140 @@ done DURATION=${DURATION:-900} [ "$SLOW" = "no" ] && DURATION=300 +PIDFILE=$TMP/racer.$$ + +assert_env CLIENTS + +timer_on () { + sleep $1 && kill -s ALRM $$ & + TIMERPID=$! + echo TIMERPID=$TIMERPID +} + +do_racer_cleanup () { + trap 0 + + local WAIT=0 + local INTERVAL=5 + local pids + local rc=0 + local TMAX + + local RDIR=$1 + + echo "DOING RACER CLEANUP ... " + + # Check if all processes are killed + + local clients=$CLIENTS + local num_clients=$(get_node_count ${clients//,/ }) + + if at_is_enabled; then + TMAX=$(at_max_get mds) + else + TMAX=$(lctl get_param -n timeout) + fi + + [ $TMAX -gt $((num_clients * 60)) ] || TMAX=$((num_clients * 60)) + # 1.Let chance to racer to kill all it's processes + # FIXME: not sure how long does it take for racer to kill all processes + # 80 is sometimes are enough for 2 clients; sometimes it takes more than 150 sec + while [ $WAIT -lt $TMAX ]; do + running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true) + [ -z "$running" ] && rc=0 && break + echo "clients $clients are still running the racer processes. Waited $WAIT secs" + echo $running + rc=1 + [ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL)) + sleep $INTERVAL + WAIT=$((WAIT + INTERVAL)) + done + + # 2. Kill the remaining processes + if [ $rc -ne 0 ]; then + for C in ${clients//,/ } ; do + pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true) + if [ ! -z "$pids" ]; then + echo "client $C still running racer processes after $WAIT seconds. Killing $pids" + do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" + do_node $C kill -TERM $pids || true + # let processes to be killed, there maybe many threads to be killed, so give 20 sec gap + sleep 20 + # 3. Check if the processes were killed + # exit error if the processes still exist + for pid in $pids; do + do_node $C "ps -P $pid" && RC=1 || true + done + else + echo "All processes on client $C exited after $WAIT seconds. OK." + fi + done + else + echo "No racer processes running after $WAIT seconds. OK." + wait_remote_prog $racer 10 + fi +} + +racer_cleanup () { + if [ "$timeout" == "timeout" ]; then + echo $timeout killing RACERPID=$RACERPID + kill $RACERPID || true + sleep 2 # give chance racer to kill it's processes + local dir + for dir in $RDIRS; do + do_racer_cleanup $dir + done + else + echo "Racer completed before DURATION=$DURATION expired. Cleaning up..." 
+ kill $TIMERPID || true + for dir in $RDIRS; do + do_racer_cleanup $dir + done + fi +} + +racer_timeout () { + timeout="timeout" + RACERPID=$(cat $PIDFILE) + rm -f $PIDFILE + racer_cleanup + echo "$0: completed $RC" + return $RC +} + build_test_filter check_and_setup_lustre +trap racer_timeout ALRM # run racer test_1() { - local rrc=0 - local rc=0 - local clients=${CLIENTS:-$(hostname)} + RC=0 - check_progs_installed $clients $racer || \ - { skip_env "$racer not found" && return 0; } + timer_on $((DURATION + 5)) - local rpids="" + RACERPID="" for rdir in $RDIRS; do - do_nodes $clients "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" & + do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" & pid=$! - rpids="$rpids $pid" + RACERPID="$RACERPID $pid" done - echo racers pids: $rpids - for pid in $rpids; do - wait $pid + echo RACERPID=$RACERPID + echo $RACERPID > $PIDFILE + for rpid in $RACERPID; do + wait $rpid rc=$? - echo "pid=$pid rc=$rc" + echo "rpid=$rpid rc=$rc" if [ $rc != 0 ]; then - rrc=$((rrc + 1)) + RC=$((RC + 1)) fi done - return $rrc + racer_cleanup + + return $RC } -run_test 1 "racer on clients: ${CLIENTS:-$(hostname)} DURATION=$DURATION" +run_test 1 "racer on clients: $CLIENTS DURATION=$DURATION" complete $(basename $0) $SECONDS check_and_cleanup_lustre diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index 3f83867..4dff18f 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -17,6 +17,7 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug @@ -84,7 +85,7 @@ reboot_recover_node () { shutdown_client $c boot_node $c echo "Reintegrating $c" - # one client fails; need dk logs from this client only + # one client fails; need dk logs from this client only zconf_mount $c $MOUNT || NODES="$c $(facet_host mds) $(osts_nodes)" error_exit "zconf_mount failed" done start_client_loads $item @@ -166,7 +167,7 @@ failover_pair() { reboot_recover_node $item1 $type1 - # Hendrix test17 description: + # Hendrix test17 description: # Introduce a failure, wait at # least 5 minutes (for recovery), # introduce a 2nd @@ -178,13 +179,13 @@ failover_pair() { # We have a "double failures" if SERIAL is not set, # do not need a sleep between failures for "double failures" - log " Failing type2=$type2 item2=$item2 ... " + log " Failing type2=$type2 item2=$item2 ... " reboot_recover_node $item2 $type2 # Client loads are allowed to die while in recovery, so we just # restart them. log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK" - restart_client_loads $NODES_TO_USE $ERRORS_OK || return $? + restart_client_loads $NODES_TO_USE $ERRORS_OK || return $? log "Done checking / re-Starting client loads. PASS" return 0 } @@ -209,7 +210,7 @@ summary_and_cleanup () { echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load debug output :" - local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug + local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug do_node ${END_RUN_NODE} "set -x; [ -e $logfile ] && cat $logfile " || true fi rc=1 @@ -260,11 +261,11 @@ START_TS=$(date +%s) CURRENT_TS=$START_TS ELAPSED=0 -# Set SERIAL to serialize the failure through a recovery of the first failure. +# Set SERIAL to serialize the failure through a recovery of the first failure. 
SERIAL=${SERIAL:-""} ERRORS_OK="yes" -[ "$SERIAL" ] && ERRORS_OK="" +[ "$SERIAL" ] && ERRORS_OK="" FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes @@ -275,7 +276,7 @@ if ! do_nodesv $NODES_TO_USE "cat $TMP/client-load.pid"; then exit 3 fi -# FIXME: Do we want to have an initial sleep period where the clients +# FIXME: Do we want to have an initial sleep period where the clients # just run before introducing a failure? sleep $FAILOVER_PERIOD @@ -296,7 +297,7 @@ if [ $OSTCOUNT -gt 1 ]; then sleep $FAILOVER_PERIOD else skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped" -fi +fi #CMD_TEST_NUM=17.5 failover_pair OST clients "test 5: failover OST, then 2 clients ====" diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh index 82a5507..496c71c 100644 --- a/lustre/tests/recovery-mds-scale.sh +++ b/lustre/tests/recovery-mds-scale.sh @@ -14,6 +14,7 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug @@ -69,7 +70,7 @@ if [ "$FLAVOR" == "MDS" ]; then else SERVERS=$OSTS fi - + if [ "$SLOW" = "no" ]; then DURATION=${DURATION:-$((60 * 30))} SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))} @@ -119,7 +120,7 @@ summary_and_cleanup () { # the one we are really interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" + echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load stdout and debug files : ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} @@ -127,7 +128,7 @@ summary_and_cleanup () { fi rc=1 fi - + echo $(date +'%F %H:%M:%S') Terminating clients loads ... echo "$0" >> $END_RUN_FILE local result=PASS @@ -172,7 +173,7 @@ Status: $result: rc=$rc" } # -# MAIN +# MAIN # log "-----============= $0 starting =============-----" @@ -204,21 +205,21 @@ CURRENT_TS=$START_TS while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do - # In order to perform the + # In order to perform the # expected number of failovers, we need to account the following : # 1) the time that has elapsed during the client load checking # 2) time takes for failover it_time_start=$(date +%s) - + SERVERFACET=$(get_random_entry $SERVERS) var=${SERVERFACET}_numfailovers - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" if ! check_client_loads $NODES_TO_USE; then exit 4 @@ -234,7 +235,7 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do log "Checking clients are in FULL state before doing next failover" if ! wait_clients_import_state $NODES_TO_USE $SERVERFACET FULL; then echo "Clients import not FULL, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD !" - + fi log "Starting failover on $SERVERFACET" @@ -252,14 +253,14 @@ while [ $ELAPSED -lt $DURATION -a ! 
-e $END_RUN_FILE ]; do # Increment the number of failovers val=$((${!var} + 1)) eval $var=$val - + CURRENT_TS=$(date +%s) ELAPSED=$((CURRENT_TS - START_TS)) - + sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start))) # keep count the number of itterations when - # time spend to failover and two client loads check exceeded + # time spend to failover and two client loads check exceeded # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) if [ $sleep -lt $MINSLEEP ]; then reqfail=$((reqfail +1)) @@ -269,8 +270,8 @@ This iteration, the load was only applied for sleep=$sleep seconds. Estimated max recovery time : $max_recov_time Probably the hardware is taking excessively long to boot. Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi + [ $reqfail -gt $REQFAIL ] && exit 6 + fi log "$SERVERFACET has failed over ${!var} times, and counting..." @@ -278,7 +279,7 @@ Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug break fi - if [ $sleep -gt 0 ]; then + if [ $sleep -gt 0 ]; then echo "sleeping $sleep seconds ... " sleep $sleep fi diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh index 2fced26..fb281e1 100644 --- a/lustre/tests/recovery-random-scale.sh +++ b/lustre/tests/recovery-random-scale.sh @@ -18,6 +18,7 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug @@ -123,7 +124,7 @@ summary_and_cleanup () { # the one we are really interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" + echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load stdout and debug files : ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} @@ -179,7 +180,7 @@ Status: $result: rc=$rc" } # -# MAIN +# MAIN # log "-----============= $0 starting =============-----" @@ -213,13 +214,13 @@ sleep=0 ERRORS_OK="yes" while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do - # In order to perform the + # In order to perform the # expected number of failovers, we need to account the following : # 1) the time that has elapsed during the client load checking # 2) time takes for failover it_time_start=$(date +%s) - + FAIL_CLIENT=$(get_random_entry $NODES_TO_USE) client_var=$(node_var_name $FAIL_CLIENT)_nums @@ -230,11 +231,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do SERVERFACET=$(get_random_entry $MDTS) var=${SERVERFACET}_nums - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" if ! check_client_loads $NODES_TO_USE; then exit 4 @@ -246,11 +247,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do log "Starting failover on $SERVERFACET" facet_failover "$SERVERFACET" || exit 1 - if ! wait_recovery_complete $SERVERFACET ; then + if ! wait_recovery_complete $SERVERFACET ; then echo "$SERVERFACET recovery is not completed!" 
exit 7 fi - + boot_node $FAIL_CLIENT echo "Reintegrating $FAIL_CLIENT" zconf_mount $FAIL_CLIENT $MOUNT || exit $? @@ -269,10 +270,10 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do # not for all clients. if [ -e $END_RUN_FILE ]; then read END_RUN_NODE < $END_RUN_FILE - [[ $END_RUN_NODE = $FAIL_CLIENT ]] && + [[ $END_RUN_NODE = $FAIL_CLIENT ]] && rm -f $END_RUN_FILE || exit 13 fi - + restart_client_loads $FAIL_CLIENT $ERRORS_OK || exit $? # Check that not failed clients loads are still running. @@ -286,11 +287,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do CURRENT_TS=$(date +%s) ELAPSED=$((CURRENT_TS - START_TS)) - + sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start))) # keep count the number of itterations when - # time spend to failover and two client loads check exceeded + # time spend to failover and two client loads check exceeded # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) if [ $sleep -lt $MINSLEEP ]; then reqfail=$((reqfail +1)) @@ -300,8 +301,8 @@ This iteration, the load was only applied for sleep=$sleep seconds. Estimated max recovery time : $max_recov_time Probably the hardware is taking excessively long to boot. Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi + [ $reqfail -gt $REQFAIL ] && exit 6 + fi log " Number of failovers: $(numfailovers) and counting..." @@ -310,7 +311,7 @@ $(numfailovers) and counting..." break fi - if [ $sleep -gt 0 ]; then + if [ $sleep -gt 0 ]; then echo "sleeping $sleep seconds ... " sleep $sleep fi diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 0897f01..e3558c1 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -10,8 +10,9 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 # also long tests: 19, 21a, 21e, 21f, 23, 27 # 1 2.5 2.5 4 4 (min)" @@ -136,7 +137,7 @@ run_test 11 "wake up a thread waiting for completion after eviction (b=2460)" #b=2494 test_12(){ - $LCTL mark multiop $DIR/$tfile OS_c + $LCTL mark multiop $DIR/$tfile OS_c do_facet mds "lctl set_param fail_loc=0x115" clear_failloc mds $((TIMEOUT * 2)) & multiop_bg_pause $DIR/$tfile OS_c || return 1 @@ -262,7 +263,7 @@ test_18a() { local osc2dev=`lctl get_param -n devices | grep ${ost2_svc}-osc- | awk '{print $1}'` $LCTL --device $osc2dev deactivate || return 3 # my understanding is that there should be nothing in the page - # cache after the client reconnects? + # cache after the client reconnects? rc=0 pgcache_empty || rc=2 $LCTL --device $osc2dev activate @@ -383,7 +384,7 @@ test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup rc=$? [ $rc -eq 0 ] && error "multiop didn't fail enqueue: rc $rc" || true } -run_test 20a "ldlm_handle_enqueue error (should return error)" +run_test 20a "ldlm_handle_enqueue error (should return error)" test_20b() { # bug 2986 - ldlm_handle_enqueue error during open remote_ost_nodsh && skip "remote OST with nodsh" && return 0 @@ -693,7 +694,7 @@ test_26a() { # was test_26 bug 5921 - evict dead exports by pinger echo starting with $OST_NEXP OST exports # OBD_FAIL_PTLRPC_DROP_RPC 0x505 do_facet client lctl set_param fail_loc=0x505 - # evictor takes up to 2.25x to evict. But if there's a + # evictor takes up to 2.25x to evict. 
But if there's a # race to start the evictor from various obds, the loser # might have to wait for the next ping. @@ -732,8 +733,8 @@ test_26b() { # bug 10140 - evict dead exports by pinger # PING_INTERVAL max(obd_timeout / 4, 1U) # PING_EVICT_TIMEOUT (PING_INTERVAL * 6) - # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. - # But if there's a race to start the evictor from various obds, + # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. + # But if there's a race to start the evictor from various obds, # the loser might have to wait for the next ping. # = 9 * PING_INTERVAL + PING_INTERVAL # = 10 PING_INTERVAL = 10 obd_timeout / 4 = 2.5 obd_timeout @@ -762,7 +763,7 @@ test_27() { facet_failover mds #no crashes allowed! kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID true FAILURE_MODE=$save_FAILURE_MODE } @@ -802,7 +803,7 @@ test_50() { # client process should see no problems even though MDS went down sleep $TIMEOUT kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID rc=$? echo writemany returned $rc #these may fail because of eviction due to slow AST response. @@ -833,7 +834,7 @@ test_51() { # and recovery was interrupted sleep $TIMEOUT kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID rc=$? echo writemany returned $rc [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true @@ -931,8 +932,8 @@ test_55() { count=0 echo "step2: testing ......" while [ $count -le 64 ]; do - dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`" - if [ -z $dd_name ]; then + dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`" + if [ -z $dd_name ]; then ls -l $DIR/$tdir echo "debug: (dd_name=$dd_name, dd_pid=$DDPID, time=$count)" error "dd shouldn't be finished!" @@ -971,7 +972,7 @@ test_56() { # b=11277 run_test 56 "do not allow reconnect to busy exports" test_57_helper() { - # no oscs means no client or mdt + # no oscs means no client or mdt while lctl get_param osc.*.* > /dev/null 2>&1; do : # loop until proc file is removed done @@ -1038,7 +1039,7 @@ test_61() $LFS setstripe -c 1 --index 0 $DIR/d61 replay_barrier mds - createmany -o $DIR/d61/$tfile-%d 10 + createmany -o $DIR/d61/$tfile-%d 10 local oid=`do_facet ost1 "lctl get_param -n obdfilter.${ost1_svc}.last_id"` fail_abort mds diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 449c4ab..871ecd5 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -14,10 +14,10 @@ MOUNT_2=${MOUNT_2:-"yes"} . $LUSTRE/tests/test-framework.sh init_test_env $@ - . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 5 14" diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 563a27a..40afe70 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -8,20 +8,21 @@ CLEANUP=${CLEANUP:-""} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # While we do not use OSTCOUNT=1 setup anymore, # ost1failover_HOST is used #ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} #failover= must be defined in OST_MKFS_OPTIONS if ostfailover_HOST != ost_HOST -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_ost || exit 0 # Tests that fail on uml CPU=`awk '/model/ {print $4}' /proc/cpuinfo` [ "$CPU" = "UML" ] && EXCEPT="$EXCEPT 6" # Skip these tests -# BUG NUMBER: +# BUG NUMBER: ALWAYS_EXCEPT="$REPLAY_OST_SINGLE_EXCEPT" # @@ -34,7 +35,7 @@ assert_DIR rm -rf $DIR/[df][0-9]* TDIR=$DIR/d0.${TESTSUITE} -mkdir -p $TDIR +mkdir -p $TDIR $LFS setstripe $TDIR -i 0 -c 1 $LFS getstripe $TDIR @@ -67,11 +68,11 @@ run_test 1 "touch" test_2() { for i in `seq 10`; do echo "tag-$i" > $TDIR/$tfile-$i - done + done fail ost1 for i in `seq 10`; do grep -q "tag-$i" $TDIR/$tfile-$i || error "f2-$i" - done + done rm -f $TDIR/$tfile-* } run_test 2 "|x| 10 open(O_CREAT)s" diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index ad7b1e3..eacbecb 100644 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -13,10 +13,11 @@ CLEANUP=${CLEANUP:-} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging CHECK_GRANT=${CHECK_GRANT:-"yes"} GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} -remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 # Skip these tests # bug number: @@ -906,7 +907,7 @@ test_45() { [ "$mdcdev" ] || return 2 [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; } - $LCTL --device $mdcdev recover || return 6 + $LCTL --device $mdcdev recover || return 6 multiop_bg_pause $DIR/$tfile O_c || return 1 pid=$! 
@@ -2041,7 +2042,7 @@ test_80b() { { skip "sync journal is not implemeted" && return; } do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0" - + replay_barrier ost1 lfs setstripe -i 0 -c 1 $DIR/$tfile dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 || error "Cannot write" @@ -2131,14 +2132,14 @@ test_85a() { #bug 16774 createmany -o $DIR/$tfile- 100 ls -l $DIR/ > /dev/null - lov_id=`lctl dl | grep "clilov"` + lov_id=`lctl dl | grep "clilov"` addr=`echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}'` count=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count` echo "before recovery: unused locks count = $count" [ $count -ne 0 ] || error "unused locks should not be zero before recovery" fail mds - + count2=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count` echo "after recovery: unused locks count = $count2" @@ -2161,13 +2162,13 @@ test_85b() { #bug 16774 dd if=$DIR/$tfile-$i of=/dev/null bs=4096 count=32 >/dev/null 2>&1 done - lov_id=`lctl dl | grep "clilov"` + lov_id=`lctl dl | grep "clilov"` addr=`echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}'` count=`lctl get_param -n ldlm.namespaces.*OST0000*$addr.lock_unused_count` echo "before recovery: unused locks count = $count" fail ost1 - + count2=`lctl get_param -n ldlm.namespaces.*OST0000*$addr.lock_unused_count` echo "after recovery: unused locks count = $count2" @@ -2202,7 +2203,7 @@ test_87() { #bug 17485 local mdtosc=$(get_mdtosc_proc_path $OST) local last_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) local next_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) - echo "before test: last_id = $last_id, next_id = $next_id" + echo "before test: last_id = $last_id, next_id = $next_id" echo "Creating to objid $last_id on ost $OST..." createmany -o $DIR/$tdir/f-%d $next_id $((last_id - next_id + 2)) @@ -2213,7 +2214,7 @@ test_87() { #bug 17485 last_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) next_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) - echo "before recovery: last_id = $last_id2, next_id = $next_id2" + echo "before recovery: last_id = $last_id2, next_id = $next_id2" # if test uses shutdown_facet && reboot_facet instead of facet_failover () # it has to take care about the affected facets, bug20407 @@ -2237,9 +2238,9 @@ test_87() { #bug 17485 last_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) next_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) - echo "after recovery: last_id = $last_id2, next_id = $next_id2" + echo "after recovery: last_id = $last_id2, next_id = $next_id2" - # create new files, which should use new objids, and ensure the orphan + # create new files, which should use new objids, and ensure the orphan # cleanup phase for ost1 is completed at the same time for i in `seq 8`; do file_id=$(($last_id + 10 + $i)) diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh index 7a3c8f9..4c09fc6 100644 --- a/lustre/tests/replay-vbr.sh +++ b/lustre/tests/replay-vbr.sh @@ -13,13 +13,14 @@ CLEANUP=${CLEANUP:-""} . $LUSTRE/tests/test-framework.sh init_test_env $@ - . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging [ -n "$CLIENTS" ] || { skip_env "Need two or more clients" && exit 0; } [ $CLIENTCOUNT -ge 2 ] || \ { skip_env "Need two or more remote clients, have $CLIENTCOUNT" && exit 0; } -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 + +require_dsh_mds || exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="" diff --git a/lustre/tests/rpc.sh b/lustre/tests/rpc.sh index 15e960a..79c1327 100755 --- a/lustre/tests/rpc.sh +++ b/lustre/tests/rpc.sh @@ -3,12 +3,19 @@ export PATH=`dirname $0`/../utils:$PATH NAME=${NAME:-local} LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} + +if [ ! -f $LUSTRE/tests/rpc.sh ]; then + LUSTRE=$(cd $(dirname $(which $0))/..; echo $PWD) +fi + . $LUSTRE/tests/test-framework.sh init_test_env . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -cmd=$1 -shift -$cmd $@ +# Reset the trap on ERR set by the framework. Noticing this failure is the +# framework's job. +trap - ERR + +# Execute the command +"$@" -exit $? diff --git a/lustre/tests/runtests b/lustre/tests/runtests index f99f69f..1416303 100755 --- a/lustre/tests/runtests +++ b/lustre/tests/runtests @@ -13,6 +13,7 @@ export NAME=${NAME:-local} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging RUNTESTS_SRC=${RUNTESTS_SRC:-"/etc /bin"} [ "$COUNT" ] || COUNT=1000 diff --git a/lustre/tests/sanity-benchmark.sh b/lustre/tests/sanity-benchmark.sh index 4c19a53..2ea5b3d 100644 --- a/lustre/tests/sanity-benchmark.sh +++ b/lustre/tests/sanity-benchmark.sh @@ -12,6 +12,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # bug number: ALWAYS_EXCEPT="$SANITY_BENCHMARK_EXCEPT" @@ -58,7 +59,7 @@ test_dbench() { local SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` DB_THREADS=$((SPACE / 50000)) [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS - + $DEBUG_OFF myUID=$RUNAS_ID myGID=$RUNAS_GID @@ -113,7 +114,7 @@ test_iozone() { fi export O_DIRECT - + local IOZDIR=$DIR/d0.iozone mkdir -p $IOZDIR $LFS setstripe -c -1 $IOZDIR @@ -138,7 +139,7 @@ test_iozone() { { error "iozone (1) failed" && return 1; } rm -f $IOZLOG $DEBUG_ON - + # check if O_DIRECT support is implemented in kernel if [ -z "$O_DIRECT" ]; then touch $DIR/f.iozone @@ -245,7 +246,7 @@ space_check () { local num_runs=$(echo ${pios_THREADCOUNT//,/ } | wc -w) size=$(( size * $num_runs)) space=$((space * 1024)) - echo size=$size space=$space + echo size=$size space=$space if [ $space -le $size ]; then local ratio=$(( size / space + 1 )) echo "Need free space atleast $size, available $space, ratio=$ratio" @@ -260,7 +261,7 @@ space_check () { fi } -pios_setup() { +pios_setup() { local testdir=$DIR/$tdir mkdir -p $testdir @@ -285,8 +286,8 @@ run_pios () { local cmd="$PIOSBIN -t $pios_THREADCOUNT -n $pios_REGIONCOUNT \ -c $pios_CHUNKSIZE -s $pios_REGIONSIZE \ -o $pios_OFFSET $@ -p $testdir" - - if [ ! -d $testdir ]; then + + if [ ! -d $testdir ]; then error "No test directory created, setup_pios must have failed" return 20 fi @@ -314,7 +315,7 @@ test_pios_ssf() { return 0 fi run_pios || return - run_pios --verify || rc=$? + run_pios --verify || rc=$? pios_cleanup $rc return $rc } diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index 23b3523..5d15058 100755 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -53,12 +53,13 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging DIRECTIO=${DIRECTIO:-$LUSTRE/tests/directio} unset ENABLE_QUOTA -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 18b 21" @@ -1089,7 +1090,7 @@ test_11() { echo "" PROCS=$(ps -ef | grep -v grep | grep "dd if /dev/zero of $TESTDIR" | wc -l) LAST_USED=0 - while [ $PROCS -gt 0 ]; do + while [ $PROCS -gt 0 ]; do sleep 20 SECS=$((SECS + sleep)) PROCS=$(ps -ef | grep -v grep | grep "dd if /dev/zero of $TESTDIR" | wc -l) @@ -1867,7 +1868,6 @@ test_24() { set_blk_unitsz $((128 * 1024)) set_blk_tunesz $((128 * 1024 / 2)) - } run_test_with_stat 24 "test if lfs draws an asterix when limit is reached (16646) ===========" diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 637e0a6..17c87a1 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -65,7 +65,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} - +init_logging [ "$SLOW" = "no" ] && EXCEPT_SLOW="24o 27m 36f 36g 36h 51b 51c 60c 63 64b 68 71 73 77f 78 101 103 115 120g 124b" FAIL_ON_ERROR=${FAIL_ON_ERROR:-false} @@ -6856,7 +6856,7 @@ test_201c() { do_facet mgs $LCTL pool_destroy $FSNAME.$POOL - sleep 2 + sleep 2 # striping on an empty/nonexistant pool should fall back to "pool of everything" touch ${POOL_DIR}/$tfile || error "failed to use fallback striping for missing pool" # setstripe on an empty pool should fail @@ -6940,4 +6940,4 @@ check_and_cleanup_lustre if [ "$I_MOUNTED" != "yes" ]; then lctl set_param debug="$OLDDEBUG" 2> /dev/null || true fi -exit_status +exit_status diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index f0521bb..9a909ac 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -38,6 +38,7 @@ CLEANUP=${CLEANUP:-:} SETUP=${SETUP:-:} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging [ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16" @@ -64,6 +65,9 @@ check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS build_test_filter +mkdir -p $MOUNT2 +mount_client $MOUNT2 + test_1a() { touch $DIR1/f1 [ -f $DIR2/f1 ] || error diff --git a/lustre/tests/sgpdd-survey.sh b/lustre/tests/sgpdd-survey.sh index ca9b3d6..0f6d2e5 100644 --- a/lustre/tests/sgpdd-survey.sh +++ b/lustre/tests/sgpdd-survey.sh @@ -5,11 +5,12 @@ set -e LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ +init_logging # QE uses the following parameters: # size=128 crghi=16 thrhi=32 crghi=${crghi:-2} -thrhi=${thrhi:-16} +thrhi=${thrhi:-16} size=${size:-1024} . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index fae3a3a7..ea93c40 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -16,6 +16,7 @@ export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe} # function used by scripts run on remote nodes LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/functions.sh +. 
$LUSTRE/tests/yaml.sh LUSTRE_TESTS_CFG_DIR=${LUSTRE_TESTS_CFG_DIR:-${LUSTRE}/tests/cfg} @@ -48,14 +49,15 @@ usage() { print_summary () { trap 0 - [ "$TESTSUITE" == "lfscktest" ] && return 0 + [ "$TESTSUITE" == "lfsck" ] && return 0 [ -n "$ONLY" ] && echo "WARNING: ONLY is set to $(echo $ONLY)" local details local form="%-13s %-17s %-9s %s %s\n" printf "$form" "status" "script" "Total(sec)" "E(xcluded) S(low)" echo "------------------------------------------------------------------------------------" - for O in $TESTSUITE_LIST; do + for O in $DEFAULT_SUITES; do [ "${!O}" = "no" ] && continue || true + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") local o=$(echo $O | tr "[:upper:]" "[:lower:]") o=${o//_/-} local log=${TMP}/${o}.log @@ -82,23 +84,25 @@ print_summary () { printf "$form" "-" "-" "-" "S=$(echo $slow)" done - for O in $TESTSUITE_LIST; do + for O in $DEFAULT_SUITES; do + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") if [ "${!O}" = "no" ]; then # FIXME. # only for those tests suits which are run directly from acc-sm script: # bonnie, iozone, etc. if [ -f "$TESTSUITELOG" ] && grep FAIL $TESTSUITELOG | grep -q ' '$O ; then - printf "$form" "UNFINISHED" "$O" "" + printf "$form" "UNFINISHED" "$O" "" else printf "$form" "Skipped" "$O" "" fi fi done - # print the detailed tests durations if DDETAILS=true - if $DDETAILS; then - echo "$details" - fi + for O in $DEFAULT_SUITES; do + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") + [ "${!O}" = "done" -o "${!O}" = "no" ] || \ + printf "$form" "UNFINISHED" "$O" "" + done } init_test_env() { @@ -134,12 +138,16 @@ init_test_env() { #[ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export TESTSUITELOG=${TMP}/${TESTSUITE}.log + if [[ -z $LOGDIRSET ]]; then + export LOGDIR=${LOGDIR:-${TMP}/test_logs/}/$(date +%s) + export LOGDIRSET=true + fi export HOSTNAME=${HOSTNAME:-`hostname`} if ! echo $PATH | grep -q $LUSTRE/utils; then - export PATH=$PATH:$LUSTRE/utils + export PATH=$PATH:$LUSTRE/utils fi if ! echo $PATH | grep -q $LUSTRE/test; then - export PATH=$PATH:$LUSTRE/tests + export PATH=$PATH:$LUSTRE/tests fi if ! echo $PATH | grep -q $LUSTRE/../lustre-iokit/sgpdd-survey; then export PATH=$PATH:$LUSTRE/../lustre-iokit/sgpdd-survey @@ -154,7 +162,7 @@ init_test_env() { export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mpi/mdsrate"} [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate 2> /dev/null) if ! echo $PATH | grep -q $LUSTRE/tests/racer; then - export PATH=$PATH:$LUSTRE/tests/racer + export PATH=$LUSTRE/tests/racer:$PATH: fi if ! 
echo $PATH | grep -q $LUSTRE/tests/mpi; then export PATH=$PATH:$LUSTRE/tests/mpi @@ -353,7 +361,7 @@ load_modules () { if $LOAD_MODULES_REMOTE ; then local list=$(comma_list $(remote_nodes_list)) echo loading modules on $list - do_rpc_nodes $list load_modules + do_rpc_nodes $list load_modules fi } @@ -534,7 +542,7 @@ quota_save_version() { $LFS quotaoff -ug $MOUNT # just in case [ -n "$ver" ] && quota_set_version $ver else - echo mds running $lustre_version + echo mds running $lustre_version [ -n "$ver" -a "$ver" != "3" ] && error "wrong quota version specifier" fi @@ -682,7 +690,7 @@ fi" } sanity_mount_check_servers () { - [ "$CLIENTONLY" ] && + [ "$CLIENTONLY" ] && { echo "CLIENTONLY mode, skip mount_check_servers"; return 0; } || true echo Checking servers environments @@ -1575,12 +1583,12 @@ do_node() { if [ "$myPDSH" = "rsh" ]; then # we need this because rsh does not return exit code of an executed command - local command_status="$TMP/cs" - rsh $HOST ":> $command_status" - rsh $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; - cd $RPWD; sh -c \"$@\") || - echo command failed >$command_status" - [ -n "$($myPDSH $HOST cat $command_status)" ] && return 1 || true + local command_status="$TMP/cs" + rsh $HOST ":> $command_status" + rsh $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; + cd $RPWD; sh -c \"$@\") || + echo command failed >$command_status" + [ -n "$($myPDSH $HOST cat $command_status)" ] && return 1 || true return 0 fi @@ -1616,7 +1624,7 @@ do_nodes() { local rnodes=$1 shift - if $(single_local_node $rnodes); then + if single_local_node $rnodes; then if $verbose; then do_nodev $rnodes "$@" else @@ -1714,7 +1722,7 @@ stopall() { rm -f $TMP/ost${num}active done if ! combined_mgs_mds ; then - stop mgs + stop mgs fi return 0 @@ -1753,12 +1761,12 @@ mkfs_opts () { [[ $facet = mgs ]] && echo $opt && return # 1. - # --failnode options + # --failnode options local var=${facet}failover_HOST if [ x"${!var}" != x ] && [ x"${!var}" != x$(facet_host $facet) ] ; then local failnode=$(h2$NETTYPE ${!var}) failnode="--failnode=$failnode" - # options does not contain + # options does not contain # or contains wrong --failnode= if [[ $opt != *${failnode}* ]]; then opt=$(echo $opt | sed 's/--failnode=.* / /') @@ -1824,8 +1832,8 @@ mount_client() { remount_client() { - zconf_umount `hostname` $1 || error "umount failed" - zconf_mount `hostname` $1 || error "mount failed" + zconf_umount `hostname` $1 || error "umount failed" + zconf_mount `hostname` $1 || error "mount failed" } writeconf_facet () { @@ -1894,7 +1902,7 @@ setupall() { } mounted_lustre_filesystems() { - awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts + awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts } init_facet_vars () { @@ -1930,7 +1938,7 @@ init_facet_vars () { # get mount point of already mounted device # is facet_dev is already mounted then use the real # mount point of this facet; otherwise use $(facet_mntpt $facet) - # i.e. ${facet}_MOUNT if specified by user or default + # i.e. ${facet}_MOUNT if specified by user or default local mntpt=$(do_facet ${facet} cat /proc/mounts | \ awk '"'${!dev}'" == $1 && $3 == "lustre" { print $2 }') if [ -z $mntpt ]; then @@ -1981,7 +1989,7 @@ nfs_client_mode () { declare -a nfsexport=(`grep ' '$MOUNT' ' /proc/mounts | awk '{print $1}' | awk -F: '{print $1 " " $2}'`) if [[ ${#nfsexport[@]} -eq 0 ]]; then error_exit NFSCLIENT=$NFSCLIENT mode, but no NFS export found! 
- fi + fi do_nodes ${nfsexport[0]} "echo \\\$(hostname); df -T ${nfsexport[1]}" return fi @@ -1999,7 +2007,7 @@ check_config_client () { # in theory someone could create a new, # client-only config file that assumed lustre was already # configured and didn't set the MGSNID. If MGSNID is not set, - # then we should use the mgs nid currently being used + # then we should use the mgs nid currently being used # as the default value. bug 18021 [[ x$MGSNID = x ]] && MGSNID=${mgc//MGC/} @@ -2109,7 +2117,7 @@ check_and_setup_lustre() { restore_mount $MOUNT2 export I_MOUNTED2=yes fi - fi + fi # 5. # MOUNT is mounted MOUNT2 is not mounted @@ -2145,7 +2153,7 @@ cleanup_mount () { local clients=${CLIENTS:-$HOSTNAME} local mntpt=$1 - zconf_umount_clients $clients $mntpt + zconf_umount_clients $clients $mntpt } cleanup_and_setup_lustre() { @@ -2153,7 +2161,7 @@ cleanup_and_setup_lustre() { lctl set_param debug=0 || true cleanupall if [ "$ONLY" == "cleanup" ]; then - exit 0 + exit 0 fi fi check_and_setup_lustre @@ -2219,7 +2227,7 @@ generate_db() { local dev local tmp_file - tmp_file=$(mktemp -p $SHARED_DIRECTORY || + tmp_file=$(mktemp -p $SHARED_DIRECTORY || error_exit "fail to create file in $SHARED_DIRECTORY") # make sure everything gets to the backing store @@ -2299,7 +2307,6 @@ wait_for_function () { if [ "$1" = "--quiet" ]; then shift quiet=" > /dev/null 2>&1" - fi local fn=$1 @@ -2347,7 +2354,7 @@ comma_list() { list_member () { local list=$1 local item=$2 - echo $list | grep -qw $item + echo $list | grep -qw $item } # list, excluded are the comma separated lists @@ -2599,7 +2606,6 @@ debugrestore() { error_noexit() { local TYPE=${TYPE:-"FAIL"} - local ERRLOG local tmp=$TMP [ -d "$SHARED_DIR_LOGS" ] && tmp=$SHARED_DIR_LOGS @@ -2612,17 +2618,14 @@ error_noexit() { log " ${TESTSUITE} ${TESTNAME}: @@@@@@ ${TYPE}: $@ " + # We need to dump the logs on all nodes if $dump; then - ERRLOG=$tmp/lustre_${TESTSUITE}_${TESTNAME}.$(date +%s) - [[ $cntlog -eq 0 ]] || ERRLOG=$ERRLOG.$cntlog - (( cntlog+=1 )) - echo "Dumping lctl log to $ERRLOG" - # We need to dump the logs on all nodes - do_nodes $(comma_list $(nodes_list)) $NODE $LCTL dk $ERRLOG + gather_logs $(comma_list $(nodes_list)) fi + debugrestore [ "$TESTSUITELOG" ] && echo "$0: ${TYPE}: $TESTNAME $@" >> $TESTSUITELOG - TEST_FAILED=true + echo "$@" > $LOGDIR/err } exit_status () { @@ -2684,7 +2687,7 @@ build_test_filter() { done for G in $GRANT_CHECK_LIST; do eval GCHECK_ONLY_${G}=true - done + done } basetest() { @@ -2705,13 +2708,13 @@ run_test() { testname=ONLY_$1 if [ ${!testname}x != x ]; then [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED= - run_one $1 "$2" + run_one_logged $1 "$2" return $? fi testname=ONLY_$base if [ ${!testname}x != x ]; then [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED= - run_one $1 "$2" + run_one_logged $1 "$2" return $? fi LAST_SKIPPED="y" @@ -2744,7 +2747,7 @@ run_test() { fi LAST_SKIPPED= - run_one $1 "$2" + run_one_logged $1 "$2" return $? 
} @@ -2790,9 +2793,13 @@ complete () { } pass() { - local status=PASS - $TEST_FAILED && status=FAIL - echo "$status $testnum $@" 2>&1 | tee -a $TESTSUITELOG + # Set TEST_STATUS here; will be used for logging the result + if [ -f $LOGDIR/err ]; then + TEST_STATUS="FAIL" + else + TEST_STATUS="PASS" + fi + echo $TEST_STATUS " " $@ } check_mds() { @@ -2812,28 +2819,48 @@ run_one() { message=$2 tfile=f${testnum} export tdir=d0.${TESTSUITE}/d${base} - + export TESTNAME=test_$testnum local SAVE_UMASK=`umask` umask 0022 - local BEFORE=`date +%s` echo - log "== test $testnum: $message == `date +%H:%M:%S` ($BEFORE)" - export TESTNAME=test_$testnum - TEST_FAILED=false - cntlog=0 + log "== test $testnum: $message == `date +%H:%M:%S`" test_${testnum} || error "test_$testnum failed with $?" cd $SAVE_PWD reset_fail_loc - check_grant ${testnum} || $TEST_FAILED || error "check_grant $testnum failed" - check_catastrophe || $TEST_FAILED || error "LBUG/LASSERT detected" - ps auxww | grep -v grep | grep -q multiop && ($TEST_FAILED || error "multiop still running") - pass "($((`date +%s` - $BEFORE))s)" - TEST_FAILED=false - cntlog=0 + check_grant ${testnum} || error "check_grant $testnum failed with $?" + check_catastrophe || error "LBUG/LASSERT detected" + ps auxww | grep -v grep | grep -q multiop && error "multiop still running" unset TESTNAME unset tdir umask $SAVE_UMASK + return 0 +} + +run_one_logged() { + local BEFORE=`date +%s` + local TEST_ERROR + local name=${TESTSUITE}.test_${1}.test_log.$(hostname).log + local test_log=$LOGDIR/$name + rm -rf $LOGDIR/err + + log_sub_test_begin test_${1} + (run_one $1 "$2") 2>&1 | tee $test_log + local RC=${PIPESTATUS[0]} + + [ $RC -ne 0 ] && [ ! -f $LOGDIR/err ] && \ + echo "test_$1 returned $RC" | tee $LOGDIR/err + + duration=$((`date +%s` - $BEFORE)) + pass "(${duration}s)" + [ -f $LOGDIR/err ] && TEST_ERROR=$(cat $LOGDIR/err) + log_sub_test_end $TEST_STATUS $duration "$RC" "$TEST_ERROR" + + if [ -f $LOGDIR/err ]; then + $FAIL_ON_ERROR && exit $RC + fi + + return 0 } canonical_path() { @@ -2906,6 +2933,13 @@ remote_mds_nodsh() remote_mds && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$mds_HOST" ] } +require_dsh_mds() +{ + remote_mds_nodsh && echo "SKIP: $TESTSUITE: remote MDS with nodsh" && \ + MSKIPPED=1 && return 1 + return 0 +} + remote_ost () { local node @@ -2917,10 +2951,17 @@ remote_ost () remote_ost_nodsh() { - [ "$CLIENTONLY" ] && return 0 || true + [ "$CLIENTONLY" ] && return 0 || true remote_ost && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$ost_HOST" ] } +require_dsh_ost() +{ + remote_ost_nodsh && echo "SKIP: $TESTSUITE: remote OST with nodsh" && \ + OSKIPPED=1 && return 1 + return 0 +} + remote_mgs_nodsh() { local MGS @@ -3140,7 +3181,7 @@ do_and_time () { SECONDS=0 eval '$cmd' - + [ ${PIPESTATUS[0]} -eq 0 ] || rc=1 echo $SECONDS @@ -3210,19 +3251,19 @@ exit \\\$rc;" # $2 file # $3 $RUNAS get_stripe_info() { - local tmp_file + local tmp_file - stripe_size=0 - stripe_count=0 - stripe_index=0 - tmp_file=$(mktemp) + stripe_size=0 + stripe_count=0 + stripe_index=0 + tmp_file=$(mktemp) - do_facet $1 $3 lfs getstripe -v $2 > $tmp_file + do_facet $1 $3 lfs getstripe -v $2 > $tmp_file - stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file` - stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file` - stripe_index=`awk '$1 ~ /stripe_offset/ {print $2}' $tmp_file` - rm -f $tmp_file + stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file` + stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file` + stripe_index=`awk '$1 ~ /stripe_offset/ {print $2}' $tmp_file` + rm -f 
$tmp_file } mdsrate_cleanup () { @@ -3341,7 +3382,7 @@ get_md_name () { ######################## -convert_facet2label() { +convert_facet2label() { local facet=$1 if [ x$facet = xost ]; then @@ -3352,7 +3393,7 @@ convert_facet2label() { if [ -n ${!varsvc} ]; then echo ${!varsvc} - else + else error "No lablel for $facet!" fi } @@ -3420,10 +3461,10 @@ wait_osc_import_state() { CONN_PROC="osc.${ost}.ost_server_uuid" CONN_STATE=$(do_facet $facet lctl get_param -n $CONN_PROC 2>/dev/null | cut -f2) while [ "${CONN_STATE}" != "${expected}" ]; do - if [ "${expected}" == "DISCONN" ]; then + if [ "${expected}" == "DISCONN" ]; then # for disconn we can check after proc entry is removed [ "x${CONN_STATE}" == "x" ] && return 0 - # with AT enabled, we can have connect request timeout near of + # with AT enabled, we can have connect request timeout near of # reconnect timeout and test can't see real disconnect [ "${CONN_STATE}" == "CONNECTING" ] && return 0 fi @@ -3438,7 +3479,6 @@ wait_osc_import_state() { log "${ost_facet} now in ${CONN_STATE} state" return 0 } - get_clientmdc_proc_path() { echo "${1}-mdc-*" } @@ -3447,7 +3487,8 @@ do_rpc_nodes () { local list=$1 shift - local RPATH="PATH=$LUSTRE/tests/:$PATH" + # Add paths to lustre tests for 32 and 64 bit systems. + local RPATH="PATH=$RLUSTRE/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:$PATH" do_nodesv $list "${RPATH} NAME=${NAME} sh rpc.sh $@ " } @@ -3545,27 +3586,30 @@ gather_logs () { # of writing the file to an NFS directory so it doesn't need to be copied. local tmp=$TMP local docp=true - [ -d "$SHARED_DIR_LOGS" ] && tmp=$SHARED_DIR_LOGS && docp=false + [ -f $LOGDIR/shared ] && docp=false # dump lustre logs, dmesg - do_nodes $list "log=$tmp/\\\$(hostname)-debug-$ts.log ; -lctl dk \\\$log >/dev/null; -log=$tmp/\\\$(hostname)-dmesg-$ts.log; -dmesg > \\\$log; " - # FIXME: does it make sense to collect the logs for $ts only, but all - # TESTSUITE logs? - # rsync $TMP/*${TESTSUITE}* to gather the logs dumped by error fn - local logs=$TMP/'*'${TESTSUITE}'*' - if $docp; then - logs=$logs' '$tmp/'*'$ts'*' + prefix="$LOGDIR/${TESTSUITE}.${TESTNAME}" + suffix="$ts.log" + echo "Dumping lctl log to ${prefix}.*.${suffix}" + + if [ "$CLIENTONLY" -o "$PDSH" == "no_dsh" ]; then + echo "Dumping logs only on local client." + $LCTL dk > ${prefix}.debug_log.$(hostname).${suffix} + dmesg > ${prefix}.dmesg.$(hostname).${suffix} + return fi - for node in ${list//,/ }; do - rsync -az $node:"$logs" $TMP - done - local archive=$TMP/${TESTSUITE}-$ts.tar.bz2 - tar -jcf $archive $tmp/*$ts* $TMP/*${TESTSUITE}* + do_nodes --verbose $list \ + "$LCTL dk > ${prefix}.debug_log.\\\$(hostname).${suffix}; + dmesg > ${prefix}.dmesg.\\\$(hostname).${suffix}" + if [ ! -f $LOGDIR/shared ]; then + do_nodes $list rsync -az "${prefix}.*.${suffix}" $HOSTNAME:$LOGDIR + fi + + local archive=$LOGDIR/${TESTSUITE}-$ts.tar.bz2 + tar -jcf $archive $LOGDIR/*$ts* $LOGDIR/*${TESTSUITE}* echo $archive } @@ -3610,11 +3654,11 @@ do_ls () { max_recovery_time () { local init_connect_timeout=$(( TIMEOUT / 20 )) - [[ $init_connect_timeout > 5 ]] || init_connect_timeout=5 + [[ $init_connect_timeout > 5 ]] || init_connect_timeout=5 local service_time=$(( $(at_max_get client) + $(( 2 * $(( 25 + 1 + init_connect_timeout)) )) )) - echo $service_time + echo $service_time } remove_mdt_files() { @@ -3708,3 +3752,65 @@ min_ost_size () { $LCTL get_param -n osc.*.kbytesavail | sort -n | head -n1 } +check_logdir() { + local dir=$1 + # Checking for shared logdir + if [ ! -d $dir ]; then + # Not found. 
Create local logdir + mkdir -p $dir + else + touch $dir/node.$(hostname).yml + fi + return 0 +} + +check_write_access() { + local dir=$1 + for node in $(nodes_list); do + if [ ! -f "$dir/node.${node}.yml" ]; then + # Logdir not accessible/writable from this node. + return 1 + fi + done + return 0 +} + +init_logging() { + if [[ -n $YAML_LOG ]]; then + return + fi + export YAML_LOG=${LOGDIR}/results.yml + mkdir -p $LOGDIR + init_clients_lists + + do_rpc_nodes $(comma_list $(nodes_list)) check_logdir $LOGDIR + if check_write_access $LOGDIR; then + touch $LOGDIR/shared + echo "Logging to shared log directory: $LOGDIR" + else + echo "Logging to local directory: $LOGDIR" + fi + + yml_nodes_file $LOGDIR >> $YAML_LOG + yml_results_file >> $YAML_LOG +} + +log_test() { + yml_log_test $1 >> $YAML_LOG +} + +log_sub_test() { + yml_log_sub_test $@ >> $YAML_LOG +} + +log_test_status() { + yml_log_test_status $@ >> $YAML_LOG +} + +log_sub_test_begin() { + yml_log_sub_test_begin $@ >> $YAML_LOG +} + +log_sub_test_end() { + yml_log_sub_test_end $@ >> $YAML_LOG +} diff --git a/lustre/tests/test-groups/regression b/lustre/tests/test-groups/regression new file mode 100644 index 0000000..1c79bc8 --- /dev/null +++ b/lustre/tests/test-groups/regression @@ -0,0 +1,20 @@ +sanity +metadata-updates +sanity-benchmark +sanityn +lfsck +liblustre +racer +replay-single +conf-sanity +recovery-small +replay-ost-single +replay-dual +replay-vbr +insanity +sanity-quota +ost-pools +lnet-selftest +mmp +obdfilter-survey +sgpdd-survey diff --git a/lustre/tests/test-groups/regression-mpi b/lustre/tests/test-groups/regression-mpi new file mode 100644 index 0000000..fd44302 --- /dev/null +++ b/lustre/tests/test-groups/regression-mpi @@ -0,0 +1,3 @@ +performance-sanity +large-scale +parallel-scale diff --git a/lustre/tests/yaml.sh b/lustre/tests/yaml.sh new file mode 100644 index 0000000..f5803e2 --- /dev/null +++ b/lustre/tests/yaml.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: + +# +# Shell routines for logging results to a yaml file. +# + +split_output() { + while read line; do + host=${line%%:*}; + echo "$line" | sed "s/^${host}: //" | sed "s/^${host}://" \ + >> $logdir/node.$host.yml; + done +} + +yml_nodes_file() { + export logdir=$1 + + if [ -f $logdir/shared ]; then + do_rpc_nodes $(comma_list $(nodes_list)) \ + "yml_node >> $logdir/node.\\\$(hostname).yml" + else + do_rpc_nodes $(comma_list $(nodes_list)) yml_node | split_output + fi + yml_entities +} + +yml_results_file() { + export logdir=$1 + + #TestGroup + yml_test_group + + # Tests + printf "Tests:\n" +} + +# Called on the node for which we the info is needed. 
+yml_node() { + local node=$(hostname) + logdir=$1 + + printf "Build:\n" + yml_build_info + printf "\n" + + printf "Node:\n" + yml_node_info + printf "\n" + + printf "LustreEntities:\n" +} + +yml_test_group() { + TEST_GROUP=${TEST_GROUP:-"acc-sm-$(hostname)"} + TEST_HOST=${TEST_HOST:-$(hostname)} + TEST_USER=${TEST_USER:-$USER} + + # TestGroup information + cat < /dev/null | \ + sed -e 's/\/etc\///' -e 's/-release//' | head -1) + else + dist="UNKNOWN" + fi + + echo $dist +} + +yml_build_info() { + TEST_DISTRO=$(release) + LUSTRE_VERSION=$(lctl lustre_build_version | awk '/Lustre version:/ {print $3}') + LUSTRE_BUILD=$(sed 's/-.*//' <<<$LUSTRE_VERSION) + +cat <> $logdir/node.$host.yml + done + + for num in $(seq $OSTCOUNT); do + host=$(facet_active_host ost$num) + yml_entity "OST $num" $host >> $logdir/node.$host.yml + done + + i=1 + for host in ${CLIENTS//,/ }; do + yml_entity "Client $i" $host >> $logdir/node.$host.yml + i=$((i+1)) + done +} + +yml_log_test() { + if [ $1 != "FINISHED" ]; then + cat <