From 30d9df1a69d325c416ed7027ddd34464f097396f Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 21 Dec 2010 14:00:06 +0000
Subject: [PATCH] LU-123 Port yaml and auster to b1_8

Add the YAML data logging from the 2.0 branch to the 1.8 branch. This
patch was created by applying the 2.0 YAML patch to 1.8 and then
resolving the resulting issues; the larger changes are taken to be
correct because they have already been accepted on the 2.0 master.

This patch also adds the auster.sh script, which allows test results to
be logged to the Maloo database; the actual upload is carried out by the
maloo_upload.sh script. For instructions on how to use auster, run
auster -?; for instructions on Maloo, refer to the Whamcloud wiki, where
a resource of information will be built up.

Change-Id: I602a3534f17544d857aa0a9f9f82d2873fb73a39
Signed-off-by: Chris Gearing
Signed-off-by: Bobi Jam
Reviewed-on: http://review.whamcloud.com/421
Tested-by: Hudson
Reviewed-by: Yu Jian
Reviewed-by: Johann Lombardi
---
 lustre/tests/Makefile.am | 7 +-
 lustre/tests/acceptance-small.sh | 154 ++++++++-------
 lustre/tests/auster.sh | 320 ++++++++++++++++++++++++++++
 lustre/tests/conf-sanity.sh | 22 +--
 lustre/tests/insanity.sh | 12 +-
 lustre/tests/large-scale.sh | 13 +-
 lustre/tests/lfsck.sh | 3 +-
 lustre/tests/liblustre.sh | 1 +
 lustre/tests/lnet-selftest.sh | 9 +-
 lustre/tests/maloo_upload.sh | 31 ++++
 lustre/tests/metadata-updates.sh | 17 +-
 lustre/tests/mmp.sh | 15 +-
 lustre/tests/obdfilter-survey.sh | 7 +-
 lustre/tests/ost-pools.sh | 15 +-
 lustre/tests/parallel-scale.sh | 27 +--
 lustre/tests/performance-sanity.sh | 7 +-
 lustre/tests/racer.sh | 137 ++++++++++++--
 lustre/tests/recovery-double-scale.sh | 19 +-
 lustre/tests/recovery-mds-scale.sh | 33 ++--
 lustre/tests/recovery-random-scale.sh | 33 ++--
 lustre/tests/recovery-small.sh | 29 +--
 lustre/tests/replay-dual.sh | 4 +-
 lustre/tests/replay-ost-single.sh | 11 +-
 lustre/tests/replay-single.sh | 23 +--
 lustre/tests/replay-vbr.sh | 5 +-
 lustre/tests/rpc.sh | 15 +-
 lustre/tests/runtests | 1 +
 lustre/tests/sanity-benchmark.sh | 17 +-
 lustre/tests/sanity-quota.sh | 8 +-
 lustre/tests/sanity.sh | 6 +-
 lustre/tests/sanityn.sh | 4 +
 lustre/tests/sgpdd-survey.sh | 3 +-
 lustre/tests/test-framework.sh | 304 ++++++++++++++++++++----------
 lustre/tests/test-groups/regression | 20 ++
 lustre/tests/test-groups/regression-mpi | 3 +
 lustre/tests/yaml.sh | 191 +++++++++++++++++++
 36 files changed, 1165 insertions(+), 361 deletions(-)
 create mode 100755 lustre/tests/auster.sh
 create mode 100755 lustre/tests/maloo_upload.sh
 create mode 100644 lustre/tests/test-groups/regression
 create mode 100644 lustre/tests/test-groups/regression-mpi
 create mode 100644 lustre/tests/yaml.sh

diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am
index 2262fa4..98d7398 100644
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -23,8 +23,9 @@ noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
 noinst_SCRIPTS += run_dbench.sh run_IOR.sh recovery-double-scale.sh
 noinst_SCRIPTS += recovery-random-scale.sh parallel-scale.sh metadata-updates.sh
 noinst_SCRIPTS += ost-pools.sh rpc.sh lnet-selftest.sh obdfilter-survey.sh mmp.sh
-noinst_SCRIPTS += sgpdd-survey.sh
+noinst_SCRIPTS += sgpdd-survey.sh auster.sh yaml.sh maloo_upload.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
+nobase_noinst_SCRIPTS += test-groups/regression test-groups/regression-mpi
 nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
 nobase_noinst_SCRIPTS +=
racer/dir_create.sh racer/file_create.sh racer/file_list.sh nobase_noinst_SCRIPTS += racer/file_rm.sh racer/racer.sh racer/file_concat.sh @@ -40,7 +41,7 @@ if MPITESTS SUBDIRS = mpi endif noinst_PROGRAMS = openunlink truncate directio writeme mlink utime it_test -noinst_PROGRAMS += tchmod fsx test_brw +noinst_PROGRAMS += tchmod fsx test_brw noinst_PROGRAMS += createmany chownmany statmany multifstat createtest noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany checkstat noinst_PROGRAMS += statone runas openfile rmdirmany @@ -48,7 +49,7 @@ noinst_PROGRAMS += small_write multiop ll_sparseness_verify noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify mkdirmany noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2 noinst_PROGRAMS += mmap_sanity flock_test writemany reads flocks_test -# noinst_PROGRAMS += copy_attr mkdirdeep +# noinst_PROGRAMS += copy_attr mkdirdeep bin_PROGRAMS = mcreate munlink testdir = $(libdir)/lustre/tests test_SCRIPTS = $(noinst_SCRIPTS) $(noinst_PROGRAMS) diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index 524cb9b..83a9532 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -4,26 +4,37 @@ #set -vx set -e -export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL REPLAY_VBR INSANITY SANITY_QUOTA PERFORMANCE_SANITY LARGE_SCALE RECOVERY_MDS_SCALE RECOVERY_DOUBLE_SCALE RECOVERY_RANDOM_SCALE PARALLEL_SCALE METADATA_UPDATES OST_POOLS SANITY_BENCHMARK LNET_SELFTEST MMP OBDFILTER_SURVEY SGPDD_SURVEY" +export MSKIPPED=0 +export OSKIPPED=0 + +# This is the default set of tests to run. +DEFAULT_SUITES="runtests sanity sanity-benchmark sanityn lfsck liblustre + racer replay-single conf-sanity recovery-small + replay-ost-single replay-dual replay-vbr insanity sanity-quota + performance-sanity large-scale recovery-mds-scale + recovery-double-scale recovery-random-scale parallel-scale + lustre_rsync-test metadata-updates ost-pools lnet-selftest + mmp obdfilter-survey sgpdd-survey" + +if [[ -n $@ ]]; then + ACC_SM_ONLY="${ACC_SM_ONLY} $@" +fi if [ "$ACC_SM_ONLY" ]; then - for O in $TESTSUITE_LIST; do - export ${O}="no" + for O in $DEFAULT_SUITES; do + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") + export ${O}="no" done for O in $ACC_SM_ONLY; do - O=`echo ${O%.sh} | tr "-" "_"` - O=`echo $O | tr "[:lower:]" "[:upper:]"` - export ${O}="yes" + O=`echo ${O%.sh} | tr "-" "_"` + O=`echo $O | tr "[:lower:]" "[:upper:]"` + export ${O}="yes" done fi -LIBLUSTRETESTS=${LIBLUSTRETESTS:-../liblustre/tests} - -RANTEST="" - LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . 
$LUSTRE/tests/test-framework.sh -init_test_env $@ +init_test_env SETUP=${SETUP:-setupall} FORMAT=${FORMAT:-formatall} @@ -65,57 +76,21 @@ find_in_path() { title() { # update titlebar if stdin is attached to an xterm if ${UPDATE_TITLEBAR:-false}; then - if tty -s; then - case $TERM in - xterm*) - echo -ne "\033]2; acceptance-small: $* \007" >&0 - ;; - esac - fi - fi + if tty -s; then + case $TERM in + xterm*) + echo -ne "\033]2; acceptance-small: $* \007" >&0 + ;; + esac + fi + fi log "-----============= acceptance-small: "$*" ============----- `date`" - RANTEST=${RANTEST}$*", " -} - -skip_remost() { - remote_ost_nodsh && log "SKIP: $1: remote OST with nodsh" && return 0 - return 1 -} - -skip_remmds() { - remote_mds_nodsh && log "SKIP: $1: remote MDS with nodsh" && return 0 - return 1 -} - -# cleanup the logs of all suites -cleanup_log () { - local suite - local o=$(echo $O | tr "[:upper:]" "[:lower:]") - o=${o//_/-} - - rm -f ${TMP}/${o}.log } -cleanup_logs () { - local suite - for suite in ${ACC_SM_ONLY:-$TESTSUITE_LIST}; do - cleanup_log $suite - done -} - -export NAME MOUNT START CLEAN -. $LUSTRE/tests/cfg/$NAME.sh - -assert_env mds_HOST MDS_MKFS_OPTS MDSDEV -assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT -assert_env FSNAME MOUNT MOUNT2 - -setup_if_needed - -for s in ${ACC_SM_ONLY:-$TESTSUITE_LIST}; do - suite_name=$(echo ${s%.sh} | tr "[:upper:]_" "[:lower:]-" ) - suite=$(echo ${suite_name} | tr "[:lower:]-" "[:upper:]_") - suite_only=ONLY # Change to ${suite}_ONLY after fixing YALA +run_suite() { + local suite_name=$(echo ${1%.sh} | tr "[:upper:]_" "[:lower:]-" ) + local suite=$(echo ${suite_name} | tr "[:lower:]-" "[:upper:]_") + local suite_only=ONLY # Change to ${suite}_ONLY after fixing YALA if is_sanity_benchmark ${suite_name}; then suite_only=suite_name @@ -130,34 +105,55 @@ for s in ${ACC_SM_ONLY:-$TESTSUITE_LIST}; do suite_script=${suite_name}.sh else echo "Can't find test script for $suite_name" - exit 1 + return 1 fi echo "$suite_script located." - - if [[ ${!suite} = no ]]; then + if [[ ${!suite} != no ]]; then + local rc + local status + local duration + local start_ts=$(date +%s) + rm -rf $TF_FAIL + title $suite_name + log_test $suite_name + bash $suite_script ${!suite_only} + rc=$? + duration=$(($(date +%s) - $start_ts)) + if [ -f $TF_FAIL -o $rc -ne 0 ]; then + status="FAIL" + else + status="PASS" + fi + echo "Script: $status" + log_test_status $duration $status + + $CLEANUP + [ x$suite = xSGPDD_SURVEY ] || $SETUP + + eval ${suite}="done" + else echo "Skipping $suite_name" - continue fi +} - start_ts=$(date +%s) - title $suite_name - bash $suite_script ${!suite_only} - rc=$? - duration=$(($(date +%s) - $start_ts)) - if [ $rc -ne 0 ]; then - RC=$rc - status="FAIL" - else - status="PASS" - fi - echo "Script: $status" +run_suites() { + for suite in $*; do + run_suite $suite + done +} + +export NAME MOUNT START CLEAN +. $LUSTRE/tests/cfg/$NAME.sh +assert_env mds_HOST MDS_MKFS_OPTS +assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT +assert_env FSNAME MOUNT MOUNT2 + +setup_if_needed +init_logging - $CLEANUP - [ x$suite = xSGPDD_SURVEY ] || $SETUP - eval ${suite}="done" -done +run_suites ${ACC_SM_ONLY:-$DEFAULT_SUITES} RC=$? title FINISHED diff --git a/lustre/tests/auster.sh b/lustre/tests/auster.sh new file mode 100755 index 0000000..17c60e1 --- /dev/null +++ b/lustre/tests/auster.sh @@ -0,0 +1,320 @@ +#!/bin/bash +# +# +# auster - drive lustre tests +# TODO +# 1. --time-limt add per test time limit, kill test if it runs to long +# 2. Read list of tests to run from a file. 
same syntax as cli, but one test per line +# 3. Run test on remote node +# 4. Use long opts for auster options + +set -e + +export TF_FAIL=/tmp/tf.fail + +usage() { + less -F <.sh) + -g GROUP Test group file (Overrides tests listed on command line) + -i N Repeat tests N times (default 1). A new directory + will be created under LOGDIR for each iteration. + -k Don't stop when subtests fail + -R Remount lustre between tests + -r Reformat (during initial configuration if needed) + -s SLOW=yes + -v Verbose mode + -l Send logs to the Maloo database after run + (can be done later by running maloo_upload.sh) + -h This help. + +Suite options +These are suite specific options that can be specified after each suite on +the command line. + suite-name [options] + --only LIST Run only specific list of subtests + --except LIST Skip list of subtests + --start-at SUBTEST Start testing from subtest + --stop-at SUBTEST Stop testing at subtest + --time-limit LIMIT Don't allow this suite to run longer + than LIMT seconds. [UNIMPLEMENTED] + +Example usage: +Run all of sanity and all of replay-single except for 70b with SLOW=y using +the default "local" configuration. + + auster -s sanity replay-single --except 70b + +Run all tests in the regression group 5 times using large config. + + auster -f large -g test-groups/regression -r 5 + +EOF + exit +} + +dry_run=false +do_reset=false +verbose=false +repeat_count=1 +upload_logs=false +reformat=false +test_logs_dir=/tmp/test_logs/$(date +%Y-%m-%d)/$(date +%H%M%S) +export SLOW=no +export ${NAME:=local} +while getopts "c:d:D:nkf:g:i:rRslhv" opt +do + case "$opt" in + c) CONFIG=$OPTARG;; + d) test_logs_dir=$OPTARG/$(date +%Y-%m-%d)/$(date +%H%M%S);; + D) test_logs_dir=$OPTARG;; + g) test_group_file=$OPTARG;; + k) export FAIL_ON_ERROR=false;; + n) dry_run=:;; + v) verbose=:;; + i) repeat_count=$OPTARG;; + f) NAME=$OPTARG;; + R) do_reset=:;; + r) reformat=:;; + s) SLOW=yes;; + l) upload_logs=true;; + h|\?) usage;; + esac +done + +# If a test_group_file is specified, then ignore rest of command line +if [[ $test_group_file ]]; then + export TEST_GROUP=$(basename $test_group_file) + set $(sed 's/#.*$//' $test_group_file) +else + shift $((OPTIND -1)) +fi + +reset_lustre() { + if $do_reset; then + stopall + setupall + fi +} + +STARTTIME=`date +%s` + +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} +. $LUSTRE/tests/test-framework.sh +init_test_env + +print_summary () { + trap 0 + local form="%-13s %-17s %s\n" + printf "$form" "status" "script" "skipped tests E(xcluded) S(low)" + echo "------------------------------------------------------------------------------------" + echo "Done!" +} + + +setup_if_needed() { + nfs_client_mode && return + auster_cleanup=false + + local MOUNTED=$(mounted_lustre_filesystems) + if $(echo $MOUNTED | grep -w -q $MOUNT); then + check_config_clients $MOUNT + # init_facets_vars + # init_param_vars + return + fi + + echo "Lustre is not mounted, trying to do setup ... " + $reformat && formatall + setupall + + MOUNTED=$(mounted_lustre_filesystems) + if ! $(echo $MOUNTED | grep -w -q $MOUNT); then + echo "Lustre is not mounted after setup! 
" + exit 1 + fi + auster_cleanup=true +} + +cleanup_if_needed() { + if $auster_cleanup; then + cleanupall + fi +} + +find_script_in_path() { + target=$1 + path=$2 + for dir in $(tr : " " <<< $path); do + if [ -e $dir/$target ]; then + echo $dir/$target + return 0 + fi + if [ -e $dir/$target.sh ]; then + echo $dir/$target.sh + return 0 + fi + done + return 1 +} + +title() { + log "-----============= acceptance-small: "$*" ============----- `date`" +} + +doit() { + if $dry_run; then + printf "Would have run: %s\n" "$*" + return 0 + fi + if $verbose; then + printf "Running: %s\n" "$*" + fi + "$@" +} + + +run_suite() { + suite_name=$1 + suite_script=$2 + title $suite_name + log_test $suite_name + + rm -f $TF_FAIL + local start_ts=$(date +%s) + doit bash $suite_script + rc=$? + duration=$(($(date +%s) - $start_ts)) + if [ -f $TF_FAIL -o $rc -ne 0 ]; then + status="FAIL" + else + status="PASS" + fi + log_test_status $duration $status + + reset_lustre +} + +run_suite_logged() { + local suite_name=${1%.sh} + local suite=$(echo ${suite_name} | tr "[:lower:]-" "[:upper:]_") + + suite_script=$(find_script_in_path $suite_name $PATH:$LUSTRE/tests) + + if [[ -z $suite_script ]]; then + echo "Can't find test script for $suite_name" + return 1 + fi + + echo "run_suite $suite_name $suite_script" + local log_name=${suite_name}.suite_log.$(hostname).log + if $verbose; then + run_suite $suite_name $suite_script 2>&1 |tee $LOGDIR/$log_name + else + run_suite $suite_name $suite_script > $LOGDIR/$log_name 2>&1 + fi + +} + +# +# Add this to test-framework somewhere. +reset_logging() { + export LOGDIR=$1 + unset YAML_LOG + init_logging +} + +split_commas() { + echo "${*//,/ }" +} + +run_suites() { + local n=0 + local argv=("$@") + while ((n < repeat_count)); do + local RC=0 + local logdir=${test_logs_dir} + ((repeat_count > 1)) && logdir="$logdir/$n" + reset_logging $logdir + set -- "${argv[@]}" + while [[ -n $1 ]]; do + unset ONLY EXCEPT START_AT STOP_AT + local opts="" + local time_limit="" +# echo "argv: $*" + suite=$1 + shift; + while [[ -n $1 ]]; do + case "$1" in + --only) + shift; + export ONLY=$(split_commas $1) + opts+="ONLY=$ONLY ";; + --except) + shift; + export EXCEPT=$(split_commas $1) + opts+="EXCEPT=$EXCEPT ";; + --start-at) + shift; + export START_AT=$1 + opts+="START_AT=$START_AT ";; + --stop-at) + shift; + export STOP_AT=$1 + opts+="STOP_AT=$STOP_AT ";; + --time-limit) + shift; + time_limit=$1;; + *) + break;; + esac + shift + done + echo "running: $suite $opts" + run_suite_logged $suite || RC=$? + echo $suite returned $RC + done + if $upload_logs; then + $upload_script $LOGDIR + fi + n=$((n + 1)) + done +} + +if [ $upload_logs = true ] ; then + upload_script=$(find_script_in_path maloo_upload.sh $PATH:$LUSTRE/tests) + if [[ -z $upload_script ]]; then + echo "Can't find maloo_upload.sh script" + exit 1 + fi + + if [ ! -r ~/.maloorc ] ; then + echo "A ~/.maloorc file is required in order to upload results." + echo "Visit your maloo web interface to download your .maloorc file" + exit 1 + fi +fi + +export NAME MOUNT START CLEAN +. ${CONFIG:-$LUSTRE/tests/cfg/$NAME.sh} + +assert_env mds_HOST MDS_MKFS_OPTS +assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT +assert_env FSNAME MOUNT MOUNT2 + +echo "Started at `date`" +setup_if_needed + +run_suites "$@" +RC=$? 
+ +if [[ $RC -eq 0 ]]; then + cleanup_if_needed +fi + +echo "Finished at `date` in $((`date +%s` - $STARTTIME))s" +echo "$0: completed with rc $RC" && exit $RC diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index d2c8f148..f2e34b4 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -31,6 +31,7 @@ HOSTNAME=`hostname` . $LUSTRE/tests/test-framework.sh init_test_env $@ +init_logging # STORED_MDSSIZE is used in test_18 if [ -n "$MDSSIZE" ]; then STORED_MDSSIZE=$MDSSIZE @@ -40,15 +41,14 @@ MDSSIZE=40000 OSTSIZE=40000 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 + if ! combined_mgs_mds; then # bug number for skipped test: 23954 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 24b" fi -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 - -# [ "$SLOW" = "no" ] && EXCEPT_SLOW="30 31 45" assert_DIR @@ -456,7 +456,7 @@ test_5f() { sleep 5 - if ! ps -f -p $pid >/dev/null; then + if ! ps -f -p $pid >/dev/null; then wait $pid rc=$? grep " $MOUNT " /etc/mtab && echo "test 5f: mtab after mount" @@ -469,7 +469,7 @@ test_5f() { # start mds start_mds - # mount should succeed after start mds + # mount should succeed after start mds wait $pid rc=$? [ $rc -eq 0 ] || error "mount returned $rc" @@ -649,7 +649,7 @@ test_18() { echo "mount mds with large journal..." local OLD_MDS_MKFS_OPTS=$MDS_MKFS_OPTS - local opts="--mdt --fsname=$FSNAME --device-size=$myMDSSIZE --param sys.timeout=$TIMEOUT $MDSOPT" + local opts="--mdt --fsname=$FSNAME --device-size=$myMDSSIZE --param sys.timeout=$TIMEOUT $MDSOPT" if combined_mgs_mds ; then MDS_MKFS_OPTS="--mgs $opts" @@ -983,7 +983,7 @@ test_27b() { setup # interop 1.8 <-> 2.0: - # 1.8: group_acquire_expire, 2.0: identity_acquire_expire + # 1.8: group_acquire_expire, 2.0: identity_acquire_expire local acquire_expire=$(do_facet mds lctl get_param md*.$FSNAME-MDT0000.*acquire_expire | \ cut -d= -f1 | cut -d. -f3) facet_failover mds @@ -1511,7 +1511,7 @@ test_35b() { # bug 18674 return 1 local at_max_saved=0 - # adaptive timeouts may prevent seeing the issue + # adaptive timeouts may prevent seeing the issue if at_is_enabled; then at_max_saved=$(at_max_get mds) at_max_set 0 mds client @@ -1869,7 +1869,7 @@ cleanup_46a() { stop ost${count} -f || rc=$? let count=count-1 done - stop_mds || rc=$? + stop_mds || rc=$? cleanup_nocli || rc=$? #writeconf to remove all ost2 traces for subsequent tests writeconf @@ -1887,7 +1887,7 @@ test_46a() { mount_client $MOUNT || return 3 trap "cleanup_46a $OSTCOUNT" EXIT ERR - local i + local i for (( i=2; i<=$OSTCOUNT; i++ )); do start ost$i `ostdevname $i` $OST_MOUNT_OPTS || return $((i+2)) done diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 8f40d52..8206a85 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -9,7 +9,7 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} - +init_logging ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" if [ "$FAILURE_MODE" = "HARD" ]; then @@ -33,8 +33,8 @@ assert_env mds_HOST MDS_MKFS_OPTS MDSDEV assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT assert_env LIVE_CLIENT FSNAME -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 # FAIL_CLIENTS list should not contain the LIVE_CLIENT FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g") @@ -64,9 +64,9 @@ fail_clients() { log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM" if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then - num=$((FAIL_NUM - DOWN_NUM)) + num=$((FAIL_NUM - DOWN_NUM)) fi - + if [ -z "$num" ] || [ "$num" -le 0 ]; then log "No clients failed!" return @@ -505,7 +505,7 @@ run_test 8 "Eighth Failure Mode: CLIENT/OST `date`" ############### Ninth Failure Mode ############### test_9() { - echo + echo #Create files echo "Verify Lustre filesystem is up and running" diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh index 51b8777..d7b6ce2 100644 --- a/lustre/tests/large-scale.sh +++ b/lustre/tests/large-scale.sh @@ -15,8 +15,9 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 [ -n "$CLIENTS" ] || { skip_env "$0: Need two or more clients" && exit 0; } [ $CLIENTCOUNT -ge 2 ] || \ @@ -35,7 +36,7 @@ rm -rf $DIR/[df][0-9]* # VBR scale tests check_vbr () { - do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery" + do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery" } check_vbr || \ @@ -119,7 +120,7 @@ test_1c() { replay_barrier mds do_nodes $CLIENTS "createmany -o $DIR/$tfile-\\\$(hostname)" 25 # XXX For FAILURE_MODE=HARD it is better to exclude - # shutdown_facet and reboot_facet time + # shutdown_facet and reboot_facet time fail_mds local current_ts=`date +%s` @@ -178,7 +179,7 @@ test_3a() { local -a nodes=(${CLIENTS//,/ }) - # INCREMENT is a number of clients + # INCREMENT is a number of clients # a half of clients by default increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))} @@ -205,7 +206,7 @@ test_3a() { local num=$increment while [ $num -le $CLIENTCOUNT ]; do - list=$(comma_list ${nodes[@]:0:$num}) + list=$(comma_list ${nodes[@]:0:$num}) generate_machine_file $list $machinefile || { error "can not generate machinefile"; exit 1; } @@ -231,7 +232,7 @@ test_3a() { fi duration=$(do_facet mds lctl get_param -n $procfile | grep recovery_duration) - + res=( "${res[@]}" "$num" ) res=( "${res[@]}" "$duration" ) echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration" diff --git a/lustre/tests/lfsck.sh b/lustre/tests/lfsck.sh index b23559d..926949e 100644 --- a/lustre/tests/lfsck.sh +++ b/lustre/tests/lfsck.sh @@ -9,6 +9,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging NUMFILES=${NUMFILES:-10} NUMDIRS=${NUMDIRS:-4} @@ -156,7 +157,7 @@ get_files() { esac local files="" - local f + local f for f in $(seq -f testfile.%g $first $last); do test_file=$test_dir/$f files="$files $test_file" diff --git a/lustre/tests/liblustre.sh b/lustre/tests/liblustre.sh index 12af4d7..0ad8c35 100644 --- a/lustre/tests/liblustre.sh +++ b/lustre/tests/liblustre.sh @@ -8,6 +8,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging LIBLUSTRETESTS=${LIBLUSTRETESTS:-$LUSTRE/liblustre/tests} diff --git a/lustre/tests/lnet-selftest.sh b/lustre/tests/lnet-selftest.sh index f4dd5b3..be4b2e8 100755 --- a/lustre/tests/lnet-selftest.sh +++ b/lustre/tests/lnet-selftest.sh @@ -4,6 +4,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # ALWAYS_EXCEPT="$ALWAYS_EXCEPT $LNET_SELFTEST_EXCEPT" @@ -104,7 +105,6 @@ test_smoke_sub () { echo 'trap "cleanup $pid" INT TERM' echo sleep $smoke_DURATION echo 'cleanup $pid' - } run_lst () { @@ -137,24 +137,23 @@ test_smoke () { local log=$TMP/$tfile.log local rc=0 - test_smoke_sub $servers $clients 2>&1 > $runlst + test_smoke_sub $servers $clients 2>&1 > $runlst cat $runlst run_lst $runlst | tee $log rc=${PIPESTATUS[0]} [ $rc = 0 ] || error "$runlst failed: $rc" - + lst_end_session --verbose | tee -a $log # error counters in "lst show_error" should be checked check_lst_err $log - } run_test smoke "lst regression test" complete $(basename $0) $SECONDS if [ "$RESTORE_MOUNT" = yes ]; then setupall -fi +fi exit_status diff --git a/lustre/tests/maloo_upload.sh b/lustre/tests/maloo_upload.sh new file mode 100755 index 0000000..dc81ed0 --- /dev/null +++ b/lustre/tests/maloo_upload.sh @@ -0,0 +1,31 @@ +#!/bin/sh + +FILENAME=$1 + +if [ -r ~/.maloorc ] ; then + source ~/.maloorc +else + echo "Error: ~/.maloorc not found. Please obtain this file from the maloo web interface, under 'Upload results'" + exit 1 +fi + +if [ -z $FILENAME ] ; then + echo "Usage: ${0} " + exit 2 +fi + + +if [ ! -r $FILENAME ] ; then + echo "Input file '$FILENAME' not found" + exit 3 +fi + +echo Uploading $FILENAME to $MALOO_URL +if [ -d $FILENAME ] ; then + pushd $FILENAME + tar czf - * | curl -F "user_id=${MALOO_USER_ID}" -F "upload=@-" -F "user_upload_token=${MALOO_UPLOAD_TOKEN}" ${MALOO_URL} > /dev/null + popd +else + curl -F "user_id=${MALOO_USER_ID}" -F "upload=@${FILENAME}" -F "user_upload_token=${MALOO_UPLOAD_TOKEN}" ${MALOO_URL} > /dev/null +fi +echo Complete. diff --git a/lustre/tests/metadata-updates.sh b/lustre/tests/metadata-updates.sh index 9ef46ee..a698981 100755 --- a/lustre/tests/metadata-updates.sh +++ b/lustre/tests/metadata-updates.sh @@ -10,6 +10,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TRACE=${TRACE:-"+x"} @@ -71,7 +72,7 @@ do_write () { do_nodes $NODES_TO_USE "set $TRACE; TESTFILE=$TESTDIR/\\\$(hostname)/$FILE; dd if=/dev/zero of=\\\$TESTFILE bs=$FILE_SIZE count=1 2>/dev/null || exit 54; -echo \\\$(hostname) | dd of=\\\$TESTFILE conv=notrunc 2>/dev/null || exit 55; +echo \\\$(hostname) | dd of=\\\$TESTFILE conv=notrunc 2>/dev/null || exit 55; md5sum \\\$TESTFILE >> $SUMFILE; " || return ${PIPESTATUS[0]} return 0 } @@ -90,7 +91,7 @@ do_truncate () { do_nodes $NODES_TO_USE "set $TRACE; TESTFILE=$TESTDIR/\\\$(hostname)/$FILE; -$TRUNCATE \\\$TESTFILE 0" || return ${PIPESTATUS[0]} +$TRUNCATE \\\$TESTFILE 0" || return ${PIPESTATUS[0]} FILE_SIZE=0 return 0 @@ -103,7 +104,7 @@ get_stat () { echo "Checking file(s) attributes ... " do_nodesv $NODES_TO_USE "set $TRACE; -for HOST in ${HOSTS//,/ } ; do +for HOST in ${HOSTS//,/ } ; do TESTFILE=$TESTDIR/\\\$HOST/$FILE; tmp=\\\$(stat -c \\\"%u %g %s 0%a\\\" \\\$TESTFILE); echo \\\"\\\$TESTFILE [ uid gid size mode ] expected : $attr ; got : \\\$tmp \\\"; @@ -112,7 +113,7 @@ for HOST in ${HOSTS//,/ } ; do exit 56; fi; done " || return ${PIPESTATUS[0]} - return 0 + return 0 } do_chmod () { @@ -121,7 +122,7 @@ do_chmod () { do_nodes $NODES_TO_USE "set $TRACE; TESTFILE=$TESTDIR/\\\$(hostname)/$FILE; chmod $NEW_MODE \\\$TESTFILE" || return ${PIPESTATUS[0]} - + CURRENT_MODE=$NEW_MODE return 0 } @@ -146,7 +147,7 @@ do_check_timestamps () { echo "Checking atime, mtime ... " do_nodesv $NODES_TO_USE "set $TRACE; -for HOST in ${HOSTS//,/ } ; do +for HOST in ${HOSTS//,/ } ; do TESTFILE=$TESTDIR/\\\$HOST/$FILE; tmp=\\\$(stat -c \\\"%X %Y\\\" \\\$TESTFILE); if [ x\\\"\\\$tmp\\\" != x\\\"$times\\\" ] ; then @@ -155,7 +156,7 @@ for HOST in ${HOSTS//,/ } ; do fi; done; exit \\\$RC" || return ${PIPESTATUS[0]} - return 0 + return 0 } do_fill_dir () { @@ -176,7 +177,7 @@ check_dir_contents () { echo "Checking dir contents ... (should exist files : f$num_files ... f$NUM_FILES) ... " do_nodes $NODES_TO_USE "set $TRACE; -for HOST in ${HOSTS//,/ } ; do +for HOST in ${HOSTS//,/ } ; do DIR=$TESTDIR/\\\$HOST; for i in \\\$(seq $NUM_FILES -1 $num_files) ; do if ! [ -f \\\$DIR/f\\\$i ] ; then diff --git a/lustre/tests/mmp.sh b/lustre/tests/mmp.sh index 6b7c256..4eca25c 100755 --- a/lustre/tests/mmp.sh +++ b/lustre/tests/mmp.sh @@ -22,9 +22,10 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 # unmount and cleanup the Lustre filesystem MMP_RESTORE_MOUNT=false @@ -164,7 +165,7 @@ mmp_init() { fi local var=${MMP_OSS}failover_HOST - + if [ -z "${!var}" ]; then log "Failover is not used on OSS, enabling MMP manually..." enable_mmp $MMP_OSS $MMP_OSTDEV || \ @@ -204,7 +205,7 @@ mmp_fini() { return 0 } -# Mount the shared target on the failover server after some interval it's +# Mount the shared target on the failover server after some interval it's # mounted on the primary server. mount_after_interval_sub() { local interval=$1 @@ -269,7 +270,7 @@ mount_after_interval() { return 0 } -# Mount the shared target on the failover server +# Mount the shared target on the failover server # during unmounting it on the primary server. 
mount_during_unmount() { local device=$1 @@ -309,7 +310,7 @@ mount_during_unmount() { return 0 } -# Mount the shared target on the failover server +# Mount the shared target on the failover server # after clean unmounting it on the primary server. mount_after_unmount() { local device=$1 @@ -323,7 +324,7 @@ mount_after_unmount() { start $facet $device $mnt_opts || return ${PIPESTATUS[0]} log "Unmounting $device on $facet..." - stop $facet || return ${PIPESTATUS[0]} + stop $facet || return ${PIPESTATUS[0]} log "Mounting $device on $failover_facet..." start $failover_facet $device $mnt_opts || return ${PIPESTATUS[0]} diff --git a/lustre/tests/obdfilter-survey.sh b/lustre/tests/obdfilter-survey.sh index 043883f..cc84b4d 100644 --- a/lustre/tests/obdfilter-survey.sh +++ b/lustre/tests/obdfilter-survey.sh @@ -5,12 +5,13 @@ set -e LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ +init_logging nobjhi=${nobjhi:-1} -thrhi=${thrhi:-16} +thrhi=${thrhi:-16} size=${size:-1024} -# the summary file a bit smaller than OSTSIZE +# the summary file a bit smaller than OSTSIZE . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} [ "$SLOW" = no ] && { nobjhi=1; thrhi=4; } @@ -85,7 +86,7 @@ print_jbd () { local varsvc=${facet}_svc local dev=$(ldiskfs_canon "*.${!varsvc}.mntdev" $facet) - # ext4: /proc/fs/jbd2/sda1:8/history + # ext4: /proc/fs/jbd2/sda1:8/history # ext3: /proc/fs/jbd/sdb1/history do_facet $facet cat /proc/fs/jbd*/${dev}*/$file diff --git a/lustre/tests/ost-pools.sh b/lustre/tests/ost-pools.sh index c47dd3e..79e9d9c 100644 --- a/lustre/tests/ost-pools.sh +++ b/lustre/tests/ost-pools.sh @@ -25,6 +25,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging check_and_setup_lustre @@ -142,7 +143,7 @@ check_file_in_osts() { local ost_count=$($GETSTRIPE $file | grep 0x | wc -l) [[ -n "$count" ]] && [[ $ost_count -ne $count ]] && \ { error "Stripe count $count expected; got $ost_count" && return 1; } - + return 0 } @@ -681,10 +682,10 @@ test_12() { add_pool $POOL2 $FSNAME-OST[$TGT_FIRST] "$FIRST_UUID " do_facet $SINGLEMDS lctl pool_list $FSNAME.$POOL2 - echo Checking the files again + echo Checking the files again check_dir_in_pool $POOL_ROOT/dir1 $POOL check_dir_in_pool $POOL_ROOT/dir2 $POOL2 - check_file_in_osts $POOL_ROOT/file1 "$TGT_LIST2" + check_file_in_osts $POOL_ROOT/file1 "$TGT_LIST2" check_file_in_osts $POOL_ROOT/file2 "$(seq $start 2 $TGT_MAX)" echo Creating some more files @@ -693,14 +694,14 @@ test_12() { create_file $POOL_ROOT/file3 $POOL create_file $POOL_ROOT/file4 $POOL2 - echo Checking the new files + echo Checking the new files check_file_in_pool $POOL_ROOT/file3 $POOL check_file_in_pool $POOL_ROOT/file4 $POOL2 destroy_pool $POOL destroy_pool $POOL2 - return 0 + return 0 } run_test 12 "OST Pool Membership" @@ -786,7 +787,7 @@ test_14() { create_dir $POOL_ROOT/dir1 $POOL 1 create_file $POOL_ROOT/dir1/file $POOL 1 - local OST=$($GETSTRIPE $POOL_ROOT/dir1/file | grep 0x | cut -f2) + local OST=$($GETSTRIPE $POOL_ROOT/dir1/file | grep 0x | cut -f2) i=0 while [[ $i -lt $numfiles ]]; do @@ -1297,7 +1298,7 @@ test_24() { error "Stripe count ($count) not inherited in $file ($count1)" [[ "$size" != "$size1" ]] && [[ "$size" != "0" ]] && \ error "Stripe size ($size) not inherited in $file ($size1)" - done + done done rm -rf $POOL_ROOT diff --git a/lustre/tests/parallel-scale.sh b/lustre/tests/parallel-scale.sh index 061db6d..73e0040 100644 --- 
a/lustre/tests/parallel-scale.sh +++ b/lustre/tests/parallel-scale.sh @@ -6,6 +6,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # bug number: ALWAYS_EXCEPT="$PARALLEL_SCALE_EXCEPT" @@ -139,7 +140,7 @@ test_compilebench() { mkdir -p $testdir local savePWD=$PWD - cd $cbench_DIR + cd $cbench_DIR local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej" log "$cmd" @@ -147,7 +148,7 @@ test_compilebench() { local rc=0 eval $cmd rc=$? - + cd $savePWD [ $rc = 0 ] || error "compilebench failed: $rc" rm -rf $testdir @@ -260,9 +261,9 @@ test_connectathon() { # -s special # -l lock # -a all of the above - # + # # -f a quick functionality test - # + # tests="-b -g -s" # Include lock tests unless we're running on nfsv4 @@ -306,7 +307,7 @@ test_ior() { echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)" fi - + generate_machine_file $clients $MACHINEFILE || return $? print_opts IOR ior_THREADS ior_DURATION MACHINEFILE @@ -316,13 +317,13 @@ test_ior() { # mpi_run uses mpiuser chmod 0777 $testdir if [ "$NFSCLIENT" ]; then - setstripe_nfsserver $testdir -c -1 || - { error "setstripe on nfsserver failed" && return 1; } + setstripe_nfsserver $testdir -c -1 || + { error "setstripe on nfsserver failed" && return 1; } else $LFS setstripe $testdir -c -1 || { error "setstripe failed" && return 2; } fi - # + # # -b N blockSize -- contiguous bytes to write per task (e.g.: 8, 4k, 2m, 1g)" # -o S testFileName # -t N transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)" @@ -342,7 +343,7 @@ test_ior() { rm -rf $testdir } run_test ior "ior" - + test_cascading_rw() { if [ "$NFSCLIENT" ]; then skip "skipped for NFSCLIENT mode" @@ -369,7 +370,7 @@ test_cascading_rw() { # mpi_run uses mpiuser chmod 0777 $testdir - # -g: debug mode + # -g: debug mode # -n: repeat test # times local cmd="$CASC_RW -g -d $testdir -n $casc_REP" @@ -391,7 +392,7 @@ test_write_append_truncate() { return fi - # location is lustre/tests dir + # location is lustre/tests dir if ! which write_append_truncate > /dev/null 2>&1 ; then skip_env "write_append_truncate not found" return @@ -578,9 +579,9 @@ test_statahead () { cancel_lru_locks mdc - local cmd="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir $testdir --nfiles $num_files --filefmt 'f%%d'" + local cmd="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir $testdir --nfiles $num_files --filefmt 'f%%d'" echo "+ $cmd" - + mpi_run -np $((num_clients * 32)) -machinefile ${MACHINEFILE} $cmd local rc=$? diff --git a/lustre/tests/performance-sanity.sh b/lustre/tests/performance-sanity.sh index 918b891..b217d0d 100644 --- a/lustre/tests/performance-sanity.sh +++ b/lustre/tests/performance-sanity.sh @@ -11,13 +11,14 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging [ -x "$MDSRATE" ] || FAIL_ON_ERROR=true error "No mdsrate program. Aborting." which mpirun > /dev/null 2>&1 || \ - FAIL_ON_ERROR=true error "No mpirun program. Aborting." + FAIL_ON_ERROR=true error "No mpirun program. Aborting." 
# Skip these tests -# bug number: 15266 15266 +# bug number: 15266 15266 ALWAYS_EXCEPT="1 2 $PERFORMANCE_SANITY_EXCEPT" build_test_filter @@ -28,7 +29,7 @@ test_1() { } run_test 1 "single-client IO perf =====" -# parallel-IOR-rates +# parallel-IOR-rates test_2() { echo "MPI coordinated test of parallel filesystem system calls and library functions" } diff --git a/lustre/tests/racer.sh b/lustre/tests/racer.sh index 3567ebd..327f051 100644 --- a/lustre/tests/racer.sh +++ b/lustre/tests/racer.sh @@ -1,4 +1,7 @@ #!/bin/bash +# -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*- +# vim:autoindent:shiftwidth=4:tabstop=4: + #set -vx set -e @@ -7,9 +10,11 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging racer=$LUSTRE/tests/racer/racer.sh echo racer: $racer +[ -z "$racer" ] && echo racer is not installed && exit 1 CLIENTS=${CLIENTS:-$HOSTNAME} RACERDIRS=${RACERDIRS:-$DIR} @@ -23,38 +28,140 @@ done DURATION=${DURATION:-900} [ "$SLOW" = "no" ] && DURATION=300 +PIDFILE=$TMP/racer.$$ + +assert_env CLIENTS + +timer_on () { + sleep $1 && kill -s ALRM $$ & + TIMERPID=$! + echo TIMERPID=$TIMERPID +} + +do_racer_cleanup () { + trap 0 + + local WAIT=0 + local INTERVAL=5 + local pids + local rc=0 + local TMAX + + local RDIR=$1 + + echo "DOING RACER CLEANUP ... " + + # Check if all processes are killed + + local clients=$CLIENTS + local num_clients=$(get_node_count ${clients//,/ }) + + if at_is_enabled; then + TMAX=$(at_max_get mds) + else + TMAX=$(lctl get_param -n timeout) + fi + + [ $TMAX -gt $((num_clients * 60)) ] || TMAX=$((num_clients * 60)) + # 1.Let chance to racer to kill all it's processes + # FIXME: not sure how long does it take for racer to kill all processes + # 80 is sometimes are enough for 2 clients; sometimes it takes more than 150 sec + while [ $WAIT -lt $TMAX ]; do + running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true) + [ -z "$running" ] && rc=0 && break + echo "clients $clients are still running the racer processes. Waited $WAIT secs" + echo $running + rc=1 + [ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL)) + sleep $INTERVAL + WAIT=$((WAIT + INTERVAL)) + done + + # 2. Kill the remaining processes + if [ $rc -ne 0 ]; then + for C in ${clients//,/ } ; do + pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true) + if [ ! -z "$pids" ]; then + echo "client $C still running racer processes after $WAIT seconds. Killing $pids" + do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" + do_node $C kill -TERM $pids || true + # let processes to be killed, there maybe many threads to be killed, so give 20 sec gap + sleep 20 + # 3. Check if the processes were killed + # exit error if the processes still exist + for pid in $pids; do + do_node $C "ps -P $pid" && RC=1 || true + done + else + echo "All processes on client $C exited after $WAIT seconds. OK." + fi + done + else + echo "No racer processes running after $WAIT seconds. OK." + wait_remote_prog $racer 10 + fi +} + +racer_cleanup () { + if [ "$timeout" == "timeout" ]; then + echo $timeout killing RACERPID=$RACERPID + kill $RACERPID || true + sleep 2 # give chance racer to kill it's processes + local dir + for dir in $RDIRS; do + do_racer_cleanup $dir + done + else + echo "Racer completed before DURATION=$DURATION expired. Cleaning up..." 
+ kill $TIMERPID || true + for dir in $RDIRS; do + do_racer_cleanup $dir + done + fi +} + +racer_timeout () { + timeout="timeout" + RACERPID=$(cat $PIDFILE) + rm -f $PIDFILE + racer_cleanup + echo "$0: completed $RC" + return $RC +} + build_test_filter check_and_setup_lustre +trap racer_timeout ALRM # run racer test_1() { - local rrc=0 - local rc=0 - local clients=${CLIENTS:-$(hostname)} + RC=0 - check_progs_installed $clients $racer || \ - { skip_env "$racer not found" && return 0; } + timer_on $((DURATION + 5)) - local rpids="" + RACERPID="" for rdir in $RDIRS; do - do_nodes $clients "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" & + do_nodes $CLIENTS "DURATION=$DURATION $racer $rdir $NUM_RACER_THREADS" & pid=$! - rpids="$rpids $pid" + RACERPID="$RACERPID $pid" done - echo racers pids: $rpids - for pid in $rpids; do - wait $pid + echo RACERPID=$RACERPID + echo $RACERPID > $PIDFILE + for rpid in $RACERPID; do + wait $rpid rc=$? - echo "pid=$pid rc=$rc" + echo "rpid=$rpid rc=$rc" if [ $rc != 0 ]; then - rrc=$((rrc + 1)) + RC=$((RC + 1)) fi done - return $rrc + racer_cleanup + + return $RC } -run_test 1 "racer on clients: ${CLIENTS:-$(hostname)} DURATION=$DURATION" +run_test 1 "racer on clients: $CLIENTS DURATION=$DURATION" complete $(basename $0) $SECONDS check_and_cleanup_lustre diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index 3f83867..4dff18f 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -17,6 +17,7 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug @@ -84,7 +85,7 @@ reboot_recover_node () { shutdown_client $c boot_node $c echo "Reintegrating $c" - # one client fails; need dk logs from this client only + # one client fails; need dk logs from this client only zconf_mount $c $MOUNT || NODES="$c $(facet_host mds) $(osts_nodes)" error_exit "zconf_mount failed" done start_client_loads $item @@ -166,7 +167,7 @@ failover_pair() { reboot_recover_node $item1 $type1 - # Hendrix test17 description: + # Hendrix test17 description: # Introduce a failure, wait at # least 5 minutes (for recovery), # introduce a 2nd @@ -178,13 +179,13 @@ failover_pair() { # We have a "double failures" if SERIAL is not set, # do not need a sleep between failures for "double failures" - log " Failing type2=$type2 item2=$item2 ... " + log " Failing type2=$type2 item2=$item2 ... " reboot_recover_node $item2 $type2 # Client loads are allowed to die while in recovery, so we just # restart them. log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK" - restart_client_loads $NODES_TO_USE $ERRORS_OK || return $? + restart_client_loads $NODES_TO_USE $ERRORS_OK || return $? log "Done checking / re-Starting client loads. PASS" return 0 } @@ -209,7 +210,7 @@ summary_and_cleanup () { echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load debug output :" - local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug + local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug do_node ${END_RUN_NODE} "set -x; [ -e $logfile ] && cat $logfile " || true fi rc=1 @@ -260,11 +261,11 @@ START_TS=$(date +%s) CURRENT_TS=$START_TS ELAPSED=0 -# Set SERIAL to serialize the failure through a recovery of the first failure. +# Set SERIAL to serialize the failure through a recovery of the first failure. 
SERIAL=${SERIAL:-""} ERRORS_OK="yes" -[ "$SERIAL" ] && ERRORS_OK="" +[ "$SERIAL" ] && ERRORS_OK="" FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes @@ -275,7 +276,7 @@ if ! do_nodesv $NODES_TO_USE "cat $TMP/client-load.pid"; then exit 3 fi -# FIXME: Do we want to have an initial sleep period where the clients +# FIXME: Do we want to have an initial sleep period where the clients # just run before introducing a failure? sleep $FAILOVER_PERIOD @@ -296,7 +297,7 @@ if [ $OSTCOUNT -gt 1 ]; then sleep $FAILOVER_PERIOD else skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped" -fi +fi #CMD_TEST_NUM=17.5 failover_pair OST clients "test 5: failover OST, then 2 clients ====" diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh index 82a5507..496c71c 100644 --- a/lustre/tests/recovery-mds-scale.sh +++ b/lustre/tests/recovery-mds-scale.sh @@ -14,6 +14,7 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug @@ -69,7 +70,7 @@ if [ "$FLAVOR" == "MDS" ]; then else SERVERS=$OSTS fi - + if [ "$SLOW" = "no" ]; then DURATION=${DURATION:-$((60 * 30))} SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 5))} @@ -119,7 +120,7 @@ summary_and_cleanup () { # the one we are really interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" + echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load stdout and debug files : ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} @@ -127,7 +128,7 @@ summary_and_cleanup () { fi rc=1 fi - + echo $(date +'%F %H:%M:%S') Terminating clients loads ... echo "$0" >> $END_RUN_FILE local result=PASS @@ -172,7 +173,7 @@ Status: $result: rc=$rc" } # -# MAIN +# MAIN # log "-----============= $0 starting =============-----" @@ -204,21 +205,21 @@ CURRENT_TS=$START_TS while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do - # In order to perform the + # In order to perform the # expected number of failovers, we need to account the following : # 1) the time that has elapsed during the client load checking # 2) time takes for failover it_time_start=$(date +%s) - + SERVERFACET=$(get_random_entry $SERVERS) var=${SERVERFACET}_numfailovers - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" if ! check_client_loads $NODES_TO_USE; then exit 4 @@ -234,7 +235,7 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do log "Checking clients are in FULL state before doing next failover" if ! wait_clients_import_state $NODES_TO_USE $SERVERFACET FULL; then echo "Clients import not FULL, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD !" - + fi log "Starting failover on $SERVERFACET" @@ -252,14 +253,14 @@ while [ $ELAPSED -lt $DURATION -a ! 
-e $END_RUN_FILE ]; do # Increment the number of failovers val=$((${!var} + 1)) eval $var=$val - + CURRENT_TS=$(date +%s) ELAPSED=$((CURRENT_TS - START_TS)) - + sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start))) # keep count the number of itterations when - # time spend to failover and two client loads check exceeded + # time spend to failover and two client loads check exceeded # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) if [ $sleep -lt $MINSLEEP ]; then reqfail=$((reqfail +1)) @@ -269,8 +270,8 @@ This iteration, the load was only applied for sleep=$sleep seconds. Estimated max recovery time : $max_recov_time Probably the hardware is taking excessively long to boot. Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi + [ $reqfail -gt $REQFAIL ] && exit 6 + fi log "$SERVERFACET has failed over ${!var} times, and counting..." @@ -278,7 +279,7 @@ Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug break fi - if [ $sleep -gt 0 ]; then + if [ $sleep -gt 0 ]; then echo "sleeping $sleep seconds ... " sleep $sleep fi diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh index 2fced26..fb281e1 100644 --- a/lustre/tests/recovery-random-scale.sh +++ b/lustre/tests/recovery-random-scale.sh @@ -18,6 +18,7 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug @@ -123,7 +124,7 @@ summary_and_cleanup () { # the one we are really interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" + echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load stdout and debug files : ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} @@ -179,7 +180,7 @@ Status: $result: rc=$rc" } # -# MAIN +# MAIN # log "-----============= $0 starting =============-----" @@ -213,13 +214,13 @@ sleep=0 ERRORS_OK="yes" while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do - # In order to perform the + # In order to perform the # expected number of failovers, we need to account the following : # 1) the time that has elapsed during the client load checking # 2) time takes for failover it_time_start=$(date +%s) - + FAIL_CLIENT=$(get_random_entry $NODES_TO_USE) client_var=$(node_var_name $FAIL_CLIENT)_nums @@ -230,11 +231,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do SERVERFACET=$(get_random_entry $MDTS) var=${SERVERFACET}_nums - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" + ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" if ! check_client_loads $NODES_TO_USE; then exit 4 @@ -246,11 +247,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do log "Starting failover on $SERVERFACET" facet_failover "$SERVERFACET" || exit 1 - if ! wait_recovery_complete $SERVERFACET ; then + if ! wait_recovery_complete $SERVERFACET ; then echo "$SERVERFACET recovery is not completed!" 
exit 7 fi - + boot_node $FAIL_CLIENT echo "Reintegrating $FAIL_CLIENT" zconf_mount $FAIL_CLIENT $MOUNT || exit $? @@ -269,10 +270,10 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do # not for all clients. if [ -e $END_RUN_FILE ]; then read END_RUN_NODE < $END_RUN_FILE - [[ $END_RUN_NODE = $FAIL_CLIENT ]] && + [[ $END_RUN_NODE = $FAIL_CLIENT ]] && rm -f $END_RUN_FILE || exit 13 fi - + restart_client_loads $FAIL_CLIENT $ERRORS_OK || exit $? # Check that not failed clients loads are still running. @@ -286,11 +287,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do CURRENT_TS=$(date +%s) ELAPSED=$((CURRENT_TS - START_TS)) - + sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start))) # keep count the number of itterations when - # time spend to failover and two client loads check exceeded + # time spend to failover and two client loads check exceeded # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) if [ $sleep -lt $MINSLEEP ]; then reqfail=$((reqfail +1)) @@ -300,8 +301,8 @@ This iteration, the load was only applied for sleep=$sleep seconds. Estimated max recovery time : $max_recov_time Probably the hardware is taking excessively long to boot. Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi + [ $reqfail -gt $REQFAIL ] && exit 6 + fi log " Number of failovers: $(numfailovers) and counting..." @@ -310,7 +311,7 @@ $(numfailovers) and counting..." break fi - if [ $sleep -gt 0 ]; then + if [ $sleep -gt 0 ]; then echo "sleeping $sleep seconds ... " sleep $sleep fi diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 0897f01..e3558c1 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -10,8 +10,9 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 # also long tests: 19, 21a, 21e, 21f, 23, 27 # 1 2.5 2.5 4 4 (min)" @@ -136,7 +137,7 @@ run_test 11 "wake up a thread waiting for completion after eviction (b=2460)" #b=2494 test_12(){ - $LCTL mark multiop $DIR/$tfile OS_c + $LCTL mark multiop $DIR/$tfile OS_c do_facet mds "lctl set_param fail_loc=0x115" clear_failloc mds $((TIMEOUT * 2)) & multiop_bg_pause $DIR/$tfile OS_c || return 1 @@ -262,7 +263,7 @@ test_18a() { local osc2dev=`lctl get_param -n devices | grep ${ost2_svc}-osc- | awk '{print $1}'` $LCTL --device $osc2dev deactivate || return 3 # my understanding is that there should be nothing in the page - # cache after the client reconnects? + # cache after the client reconnects? rc=0 pgcache_empty || rc=2 $LCTL --device $osc2dev activate @@ -383,7 +384,7 @@ test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup rc=$? [ $rc -eq 0 ] && error "multiop didn't fail enqueue: rc $rc" || true } -run_test 20a "ldlm_handle_enqueue error (should return error)" +run_test 20a "ldlm_handle_enqueue error (should return error)" test_20b() { # bug 2986 - ldlm_handle_enqueue error during open remote_ost_nodsh && skip "remote OST with nodsh" && return 0 @@ -693,7 +694,7 @@ test_26a() { # was test_26 bug 5921 - evict dead exports by pinger echo starting with $OST_NEXP OST exports # OBD_FAIL_PTLRPC_DROP_RPC 0x505 do_facet client lctl set_param fail_loc=0x505 - # evictor takes up to 2.25x to evict. But if there's a + # evictor takes up to 2.25x to evict. 
But if there's a # race to start the evictor from various obds, the loser # might have to wait for the next ping. @@ -732,8 +733,8 @@ test_26b() { # bug 10140 - evict dead exports by pinger # PING_INTERVAL max(obd_timeout / 4, 1U) # PING_EVICT_TIMEOUT (PING_INTERVAL * 6) - # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. - # But if there's a race to start the evictor from various obds, + # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. + # But if there's a race to start the evictor from various obds, # the loser might have to wait for the next ping. # = 9 * PING_INTERVAL + PING_INTERVAL # = 10 PING_INTERVAL = 10 obd_timeout / 4 = 2.5 obd_timeout @@ -762,7 +763,7 @@ test_27() { facet_failover mds #no crashes allowed! kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID true FAILURE_MODE=$save_FAILURE_MODE } @@ -802,7 +803,7 @@ test_50() { # client process should see no problems even though MDS went down sleep $TIMEOUT kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID rc=$? echo writemany returned $rc #these may fail because of eviction due to slow AST response. @@ -833,7 +834,7 @@ test_51() { # and recovery was interrupted sleep $TIMEOUT kill -USR1 $CLIENT_PID - wait $CLIENT_PID + wait $CLIENT_PID rc=$? echo writemany returned $rc [ $rc -eq 0 ] || error_ignore 13652 "writemany returned rc $rc" || true @@ -931,8 +932,8 @@ test_55() { count=0 echo "step2: testing ......" while [ $count -le 64 ]; do - dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`" - if [ -z $dd_name ]; then + dd_name="`ps x | awk '$1 == '$DDPID' { print $5 }'`" + if [ -z $dd_name ]; then ls -l $DIR/$tdir echo "debug: (dd_name=$dd_name, dd_pid=$DDPID, time=$count)" error "dd shouldn't be finished!" @@ -971,7 +972,7 @@ test_56() { # b=11277 run_test 56 "do not allow reconnect to busy exports" test_57_helper() { - # no oscs means no client or mdt + # no oscs means no client or mdt while lctl get_param osc.*.* > /dev/null 2>&1; do : # loop until proc file is removed done @@ -1038,7 +1039,7 @@ test_61() $LFS setstripe -c 1 --index 0 $DIR/d61 replay_barrier mds - createmany -o $DIR/d61/$tfile-%d 10 + createmany -o $DIR/d61/$tfile-%d 10 local oid=`do_facet ost1 "lctl get_param -n obdfilter.${ost1_svc}.last_id"` fail_abort mds diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 449c4ab..871ecd5 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -14,10 +14,10 @@ MOUNT_2=${MOUNT_2:-"yes"} . $LUSTRE/tests/test-framework.sh init_test_env $@ - . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 5 14" diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 563a27a..40afe70 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -8,20 +8,21 @@ CLEANUP=${CLEANUP:-""} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # While we do not use OSTCOUNT=1 setup anymore, # ost1failover_HOST is used #ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} #failover= must be defined in OST_MKFS_OPTIONS if ostfailover_HOST != ost_HOST -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_ost || exit 0 # Tests that fail on uml CPU=`awk '/model/ {print $4}' /proc/cpuinfo` [ "$CPU" = "UML" ] && EXCEPT="$EXCEPT 6" # Skip these tests -# BUG NUMBER: +# BUG NUMBER: ALWAYS_EXCEPT="$REPLAY_OST_SINGLE_EXCEPT" # @@ -34,7 +35,7 @@ assert_DIR rm -rf $DIR/[df][0-9]* TDIR=$DIR/d0.${TESTSUITE} -mkdir -p $TDIR +mkdir -p $TDIR $LFS setstripe $TDIR -i 0 -c 1 $LFS getstripe $TDIR @@ -67,11 +68,11 @@ run_test 1 "touch" test_2() { for i in `seq 10`; do echo "tag-$i" > $TDIR/$tfile-$i - done + done fail ost1 for i in `seq 10`; do grep -q "tag-$i" $TDIR/$tfile-$i || error "f2-$i" - done + done rm -f $TDIR/$tfile-* } run_test 2 "|x| 10 open(O_CREAT)s" diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index ad7b1e3..eacbecb 100644 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -13,10 +13,11 @@ CLEANUP=${CLEANUP:-} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging CHECK_GRANT=${CHECK_GRANT:-"yes"} GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} -remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 # Skip these tests # bug number: @@ -906,7 +907,7 @@ test_45() { [ "$mdcdev" ] || return 2 [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; } - $LCTL --device $mdcdev recover || return 6 + $LCTL --device $mdcdev recover || return 6 multiop_bg_pause $DIR/$tfile O_c || return 1 pid=$! 
@@ -2041,7 +2042,7 @@ test_80b() { { skip "sync journal is not implemeted" && return; } do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0" - + replay_barrier ost1 lfs setstripe -i 0 -c 1 $DIR/$tfile dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 || error "Cannot write" @@ -2131,14 +2132,14 @@ test_85a() { #bug 16774 createmany -o $DIR/$tfile- 100 ls -l $DIR/ > /dev/null - lov_id=`lctl dl | grep "clilov"` + lov_id=`lctl dl | grep "clilov"` addr=`echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}'` count=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count` echo "before recovery: unused locks count = $count" [ $count -ne 0 ] || error "unused locks should not be zero before recovery" fail mds - + count2=`lctl get_param -n ldlm.namespaces.*MDT0000*$addr.lock_unused_count` echo "after recovery: unused locks count = $count2" @@ -2161,13 +2162,13 @@ test_85b() { #bug 16774 dd if=$DIR/$tfile-$i of=/dev/null bs=4096 count=32 >/dev/null 2>&1 done - lov_id=`lctl dl | grep "clilov"` + lov_id=`lctl dl | grep "clilov"` addr=`echo $lov_id | awk '{print $4}' | awk -F '-' '{print $3}'` count=`lctl get_param -n ldlm.namespaces.*OST0000*$addr.lock_unused_count` echo "before recovery: unused locks count = $count" fail ost1 - + count2=`lctl get_param -n ldlm.namespaces.*OST0000*$addr.lock_unused_count` echo "after recovery: unused locks count = $count2" @@ -2202,7 +2203,7 @@ test_87() { #bug 17485 local mdtosc=$(get_mdtosc_proc_path $OST) local last_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) local next_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) - echo "before test: last_id = $last_id, next_id = $next_id" + echo "before test: last_id = $last_id, next_id = $next_id" echo "Creating to objid $last_id on ost $OST..." createmany -o $DIR/$tdir/f-%d $next_id $((last_id - next_id + 2)) @@ -2213,7 +2214,7 @@ test_87() { #bug 17485 last_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) next_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) - echo "before recovery: last_id = $last_id2, next_id = $next_id2" + echo "before recovery: last_id = $last_id2, next_id = $next_id2" # if test uses shutdown_facet && reboot_facet instead of facet_failover () # it has to take care about the affected facets, bug20407 @@ -2237,9 +2238,9 @@ test_87() { #bug 17485 last_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) next_id2=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) - echo "after recovery: last_id = $last_id2, next_id = $next_id2" + echo "after recovery: last_id = $last_id2, next_id = $next_id2" - # create new files, which should use new objids, and ensure the orphan + # create new files, which should use new objids, and ensure the orphan # cleanup phase for ost1 is completed at the same time for i in `seq 8`; do file_id=$(($last_id + 10 + $i)) diff --git a/lustre/tests/replay-vbr.sh b/lustre/tests/replay-vbr.sh index 7a3c8f9..4c09fc6 100644 --- a/lustre/tests/replay-vbr.sh +++ b/lustre/tests/replay-vbr.sh @@ -13,13 +13,14 @@ CLEANUP=${CLEANUP:-""} . $LUSTRE/tests/test-framework.sh init_test_env $@ - . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging [ -n "$CLIENTS" ] || { skip_env "Need two or more clients" && exit 0; } [ $CLIENTCOUNT -ge 2 ] || \ { skip_env "Need two or more remote clients, have $CLIENTCOUNT" && exit 0; } -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 + +require_dsh_mds || exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="" diff --git a/lustre/tests/rpc.sh b/lustre/tests/rpc.sh index 15e960a..79c1327 100755 --- a/lustre/tests/rpc.sh +++ b/lustre/tests/rpc.sh @@ -3,12 +3,19 @@ export PATH=`dirname $0`/../utils:$PATH NAME=${NAME:-local} LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} + +if [ ! -f $LUSTRE/tests/rpc.sh ]; then + LUSTRE=$(cd $(dirname $(which $0))/..; echo $PWD) +fi + . $LUSTRE/tests/test-framework.sh init_test_env . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -cmd=$1 -shift -$cmd $@ +# Reset the trap on ERR set by the framework. Noticing this failure is the +# framework's job. +trap - ERR + +# Execute the command +"$@" -exit $? diff --git a/lustre/tests/runtests b/lustre/tests/runtests index f99f69f..1416303 100755 --- a/lustre/tests/runtests +++ b/lustre/tests/runtests @@ -13,6 +13,7 @@ export NAME=${NAME:-local} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging RUNTESTS_SRC=${RUNTESTS_SRC:-"/etc /bin"} [ "$COUNT" ] || COUNT=1000 diff --git a/lustre/tests/sanity-benchmark.sh b/lustre/tests/sanity-benchmark.sh index 4c19a53..2ea5b3d 100644 --- a/lustre/tests/sanity-benchmark.sh +++ b/lustre/tests/sanity-benchmark.sh @@ -12,6 +12,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging # bug number: ALWAYS_EXCEPT="$SANITY_BENCHMARK_EXCEPT" @@ -58,7 +59,7 @@ test_dbench() { local SPACE=`df -P $MOUNT | tail -n 1 | awk '{ print $4 }'` DB_THREADS=$((SPACE / 50000)) [ $THREADS -lt $DB_THREADS ] && DB_THREADS=$THREADS - + $DEBUG_OFF myUID=$RUNAS_ID myGID=$RUNAS_GID @@ -113,7 +114,7 @@ test_iozone() { fi export O_DIRECT - + local IOZDIR=$DIR/d0.iozone mkdir -p $IOZDIR $LFS setstripe -c -1 $IOZDIR @@ -138,7 +139,7 @@ test_iozone() { { error "iozone (1) failed" && return 1; } rm -f $IOZLOG $DEBUG_ON - + # check if O_DIRECT support is implemented in kernel if [ -z "$O_DIRECT" ]; then touch $DIR/f.iozone @@ -245,7 +246,7 @@ space_check () { local num_runs=$(echo ${pios_THREADCOUNT//,/ } | wc -w) size=$(( size * $num_runs)) space=$((space * 1024)) - echo size=$size space=$space + echo size=$size space=$space if [ $space -le $size ]; then local ratio=$(( size / space + 1 )) echo "Need free space atleast $size, available $space, ratio=$ratio" @@ -260,7 +261,7 @@ space_check () { fi } -pios_setup() { +pios_setup() { local testdir=$DIR/$tdir mkdir -p $testdir @@ -285,8 +286,8 @@ run_pios () { local cmd="$PIOSBIN -t $pios_THREADCOUNT -n $pios_REGIONCOUNT \ -c $pios_CHUNKSIZE -s $pios_REGIONSIZE \ -o $pios_OFFSET $@ -p $testdir" - - if [ ! -d $testdir ]; then + + if [ ! -d $testdir ]; then error "No test directory created, setup_pios must have failed" return 20 fi @@ -314,7 +315,7 @@ test_pios_ssf() { return 0 fi run_pios || return - run_pios --verify || rc=$? + run_pios --verify || rc=$? pios_cleanup $rc return $rc } diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index 23b3523..5d15058 100755 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -53,12 +53,13 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging DIRECTIO=${DIRECTIO:-$LUSTRE/tests/directio} unset ENABLE_QUOTA -remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +require_dsh_mds || exit 0 +require_dsh_ost || exit 0 [ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 18b 21" @@ -1089,7 +1090,7 @@ test_11() { echo "" PROCS=$(ps -ef | grep -v grep | grep "dd if /dev/zero of $TESTDIR" | wc -l) LAST_USED=0 - while [ $PROCS -gt 0 ]; do + while [ $PROCS -gt 0 ]; do sleep 20 SECS=$((SECS + sleep)) PROCS=$(ps -ef | grep -v grep | grep "dd if /dev/zero of $TESTDIR" | wc -l) @@ -1867,7 +1868,6 @@ test_24() { set_blk_unitsz $((128 * 1024)) set_blk_tunesz $((128 * 1024 / 2)) - } run_test_with_stat 24 "test if lfs draws an asterix when limit is reached (16646) ===========" diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 637e0a6..17c87a1 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -65,7 +65,7 @@ LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} - +init_logging [ "$SLOW" = "no" ] && EXCEPT_SLOW="24o 27m 36f 36g 36h 51b 51c 60c 63 64b 68 71 73 77f 78 101 103 115 120g 124b" FAIL_ON_ERROR=${FAIL_ON_ERROR:-false} @@ -6856,7 +6856,7 @@ test_201c() { do_facet mgs $LCTL pool_destroy $FSNAME.$POOL - sleep 2 + sleep 2 # striping on an empty/nonexistant pool should fall back to "pool of everything" touch ${POOL_DIR}/$tfile || error "failed to use fallback striping for missing pool" # setstripe on an empty pool should fail @@ -6940,4 +6940,4 @@ check_and_cleanup_lustre if [ "$I_MOUNTED" != "yes" ]; then lctl set_param debug="$OLDDEBUG" 2> /dev/null || true fi -exit_status +exit_status diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index f0521bb..9a909ac 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -38,6 +38,7 @@ CLEANUP=${CLEANUP:-:} SETUP=${SETUP:-:} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging [ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16" @@ -64,6 +65,9 @@ check_runas_id $RUNAS_ID $RUNAS_GID $RUNAS build_test_filter +mkdir -p $MOUNT2 +mount_client $MOUNT2 + test_1a() { touch $DIR1/f1 [ -f $DIR2/f1 ] || error diff --git a/lustre/tests/sgpdd-survey.sh b/lustre/tests/sgpdd-survey.sh index ca9b3d6..0f6d2e5 100644 --- a/lustre/tests/sgpdd-survey.sh +++ b/lustre/tests/sgpdd-survey.sh @@ -5,11 +5,12 @@ set -e LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ +init_logging # QE uses the following parameters: # size=128 crghi=16 thrhi=32 crghi=${crghi:-2} -thrhi=${thrhi:-16} +thrhi=${thrhi:-16} size=${size:-1024} . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index fae3a3a7..ea93c40 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -16,6 +16,7 @@ export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe} # function used by scripts run on remote nodes LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/functions.sh +. 
$LUSTRE/tests/yaml.sh LUSTRE_TESTS_CFG_DIR=${LUSTRE_TESTS_CFG_DIR:-${LUSTRE}/tests/cfg} @@ -48,14 +49,15 @@ usage() { print_summary () { trap 0 - [ "$TESTSUITE" == "lfscktest" ] && return 0 + [ "$TESTSUITE" == "lfsck" ] && return 0 [ -n "$ONLY" ] && echo "WARNING: ONLY is set to $(echo $ONLY)" local details local form="%-13s %-17s %-9s %s %s\n" printf "$form" "status" "script" "Total(sec)" "E(xcluded) S(low)" echo "------------------------------------------------------------------------------------" - for O in $TESTSUITE_LIST; do + for O in $DEFAULT_SUITES; do [ "${!O}" = "no" ] && continue || true + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") local o=$(echo $O | tr "[:upper:]" "[:lower:]") o=${o//_/-} local log=${TMP}/${o}.log @@ -82,23 +84,25 @@ print_summary () { printf "$form" "-" "-" "-" "S=$(echo $slow)" done - for O in $TESTSUITE_LIST; do + for O in $DEFAULT_SUITES; do + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") if [ "${!O}" = "no" ]; then # FIXME. # only for those tests suits which are run directly from acc-sm script: # bonnie, iozone, etc. if [ -f "$TESTSUITELOG" ] && grep FAIL $TESTSUITELOG | grep -q ' '$O ; then - printf "$form" "UNFINISHED" "$O" "" + printf "$form" "UNFINISHED" "$O" "" else printf "$form" "Skipped" "$O" "" fi fi done - # print the detailed tests durations if DDETAILS=true - if $DDETAILS; then - echo "$details" - fi + for O in $DEFAULT_SUITES; do + O=$(echo $O | tr "-" "_" | tr "[:lower:]" "[:upper:]") + [ "${!O}" = "done" -o "${!O}" = "no" ] || \ + printf "$form" "UNFINISHED" "$O" "" + done } init_test_env() { @@ -134,12 +138,16 @@ init_test_env() { #[ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export TESTSUITELOG=${TMP}/${TESTSUITE}.log + if [[ -z $LOGDIRSET ]]; then + export LOGDIR=${LOGDIR:-${TMP}/test_logs/}/$(date +%s) + export LOGDIRSET=true + fi export HOSTNAME=${HOSTNAME:-`hostname`} if ! echo $PATH | grep -q $LUSTRE/utils; then - export PATH=$PATH:$LUSTRE/utils + export PATH=$PATH:$LUSTRE/utils fi if ! echo $PATH | grep -q $LUSTRE/test; then - export PATH=$PATH:$LUSTRE/tests + export PATH=$PATH:$LUSTRE/tests fi if ! echo $PATH | grep -q $LUSTRE/../lustre-iokit/sgpdd-survey; then export PATH=$PATH:$LUSTRE/../lustre-iokit/sgpdd-survey @@ -154,7 +162,7 @@ init_test_env() { export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mpi/mdsrate"} [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate 2> /dev/null) if ! echo $PATH | grep -q $LUSTRE/tests/racer; then - export PATH=$PATH:$LUSTRE/tests/racer + export PATH=$LUSTRE/tests/racer:$PATH: fi if ! 
echo $PATH | grep -q $LUSTRE/tests/mpi; then export PATH=$PATH:$LUSTRE/tests/mpi @@ -353,7 +361,7 @@ load_modules () { if $LOAD_MODULES_REMOTE ; then local list=$(comma_list $(remote_nodes_list)) echo loading modules on $list - do_rpc_nodes $list load_modules + do_rpc_nodes $list load_modules fi } @@ -534,7 +542,7 @@ quota_save_version() { $LFS quotaoff -ug $MOUNT # just in case [ -n "$ver" ] && quota_set_version $ver else - echo mds running $lustre_version + echo mds running $lustre_version [ -n "$ver" -a "$ver" != "3" ] && error "wrong quota version specifier" fi @@ -682,7 +690,7 @@ fi" } sanity_mount_check_servers () { - [ "$CLIENTONLY" ] && + [ "$CLIENTONLY" ] && { echo "CLIENTONLY mode, skip mount_check_servers"; return 0; } || true echo Checking servers environments @@ -1575,12 +1583,12 @@ do_node() { if [ "$myPDSH" = "rsh" ]; then # we need this because rsh does not return exit code of an executed command - local command_status="$TMP/cs" - rsh $HOST ":> $command_status" - rsh $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; - cd $RPWD; sh -c \"$@\") || - echo command failed >$command_status" - [ -n "$($myPDSH $HOST cat $command_status)" ] && return 1 || true + local command_status="$TMP/cs" + rsh $HOST ":> $command_status" + rsh $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; + cd $RPWD; sh -c \"$@\") || + echo command failed >$command_status" + [ -n "$($myPDSH $HOST cat $command_status)" ] && return 1 || true return 0 fi @@ -1616,7 +1624,7 @@ do_nodes() { local rnodes=$1 shift - if $(single_local_node $rnodes); then + if single_local_node $rnodes; then if $verbose; then do_nodev $rnodes "$@" else @@ -1714,7 +1722,7 @@ stopall() { rm -f $TMP/ost${num}active done if ! combined_mgs_mds ; then - stop mgs + stop mgs fi return 0 @@ -1753,12 +1761,12 @@ mkfs_opts () { [[ $facet = mgs ]] && echo $opt && return # 1. - # --failnode options + # --failnode options local var=${facet}failover_HOST if [ x"${!var}" != x ] && [ x"${!var}" != x$(facet_host $facet) ] ; then local failnode=$(h2$NETTYPE ${!var}) failnode="--failnode=$failnode" - # options does not contain + # options does not contain # or contains wrong --failnode= if [[ $opt != *${failnode}* ]]; then opt=$(echo $opt | sed 's/--failnode=.* / /') @@ -1824,8 +1832,8 @@ mount_client() { remount_client() { - zconf_umount `hostname` $1 || error "umount failed" - zconf_mount `hostname` $1 || error "mount failed" + zconf_umount `hostname` $1 || error "umount failed" + zconf_mount `hostname` $1 || error "mount failed" } writeconf_facet () { @@ -1894,7 +1902,7 @@ setupall() { } mounted_lustre_filesystems() { - awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts + awk '($3 ~ "lustre" && $1 ~ ":") { print $2 }' /proc/mounts } init_facet_vars () { @@ -1930,7 +1938,7 @@ init_facet_vars () { # get mount point of already mounted device # is facet_dev is already mounted then use the real # mount point of this facet; otherwise use $(facet_mntpt $facet) - # i.e. ${facet}_MOUNT if specified by user or default + # i.e. ${facet}_MOUNT if specified by user or default local mntpt=$(do_facet ${facet} cat /proc/mounts | \ awk '"'${!dev}'" == $1 && $3 == "lustre" { print $2 }') if [ -z $mntpt ]; then @@ -1981,7 +1989,7 @@ nfs_client_mode () { declare -a nfsexport=(`grep ' '$MOUNT' ' /proc/mounts | awk '{print $1}' | awk -F: '{print $1 " " $2}'`) if [[ ${#nfsexport[@]} -eq 0 ]]; then error_exit NFSCLIENT=$NFSCLIENT mode, but no NFS export found! 
- fi + fi do_nodes ${nfsexport[0]} "echo \\\$(hostname); df -T ${nfsexport[1]}" return fi @@ -1999,7 +2007,7 @@ check_config_client () { # in theory someone could create a new, # client-only config file that assumed lustre was already # configured and didn't set the MGSNID. If MGSNID is not set, - # then we should use the mgs nid currently being used + # then we should use the mgs nid currently being used # as the default value. bug 18021 [[ x$MGSNID = x ]] && MGSNID=${mgc//MGC/} @@ -2109,7 +2117,7 @@ check_and_setup_lustre() { restore_mount $MOUNT2 export I_MOUNTED2=yes fi - fi + fi # 5. # MOUNT is mounted MOUNT2 is not mounted @@ -2145,7 +2153,7 @@ cleanup_mount () { local clients=${CLIENTS:-$HOSTNAME} local mntpt=$1 - zconf_umount_clients $clients $mntpt + zconf_umount_clients $clients $mntpt } cleanup_and_setup_lustre() { @@ -2153,7 +2161,7 @@ cleanup_and_setup_lustre() { lctl set_param debug=0 || true cleanupall if [ "$ONLY" == "cleanup" ]; then - exit 0 + exit 0 fi fi check_and_setup_lustre @@ -2219,7 +2227,7 @@ generate_db() { local dev local tmp_file - tmp_file=$(mktemp -p $SHARED_DIRECTORY || + tmp_file=$(mktemp -p $SHARED_DIRECTORY || error_exit "fail to create file in $SHARED_DIRECTORY") # make sure everything gets to the backing store @@ -2299,7 +2307,6 @@ wait_for_function () { if [ "$1" = "--quiet" ]; then shift quiet=" > /dev/null 2>&1" - fi local fn=$1 @@ -2347,7 +2354,7 @@ comma_list() { list_member () { local list=$1 local item=$2 - echo $list | grep -qw $item + echo $list | grep -qw $item } # list, excluded are the comma separated lists @@ -2599,7 +2606,6 @@ debugrestore() { error_noexit() { local TYPE=${TYPE:-"FAIL"} - local ERRLOG local tmp=$TMP [ -d "$SHARED_DIR_LOGS" ] && tmp=$SHARED_DIR_LOGS @@ -2612,17 +2618,14 @@ error_noexit() { log " ${TESTSUITE} ${TESTNAME}: @@@@@@ ${TYPE}: $@ " + # We need to dump the logs on all nodes if $dump; then - ERRLOG=$tmp/lustre_${TESTSUITE}_${TESTNAME}.$(date +%s) - [[ $cntlog -eq 0 ]] || ERRLOG=$ERRLOG.$cntlog - (( cntlog+=1 )) - echo "Dumping lctl log to $ERRLOG" - # We need to dump the logs on all nodes - do_nodes $(comma_list $(nodes_list)) $NODE $LCTL dk $ERRLOG + gather_logs $(comma_list $(nodes_list)) fi + debugrestore [ "$TESTSUITELOG" ] && echo "$0: ${TYPE}: $TESTNAME $@" >> $TESTSUITELOG - TEST_FAILED=true + echo "$@" > $LOGDIR/err } exit_status () { @@ -2684,7 +2687,7 @@ build_test_filter() { done for G in $GRANT_CHECK_LIST; do eval GCHECK_ONLY_${G}=true - done + done } basetest() { @@ -2705,13 +2708,13 @@ run_test() { testname=ONLY_$1 if [ ${!testname}x != x ]; then [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED= - run_one $1 "$2" + run_one_logged $1 "$2" return $? fi testname=ONLY_$base if [ ${!testname}x != x ]; then [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED= - run_one $1 "$2" + run_one_logged $1 "$2" return $? fi LAST_SKIPPED="y" @@ -2744,7 +2747,7 @@ run_test() { fi LAST_SKIPPED= - run_one $1 "$2" + run_one_logged $1 "$2" return $? 
} @@ -2790,9 +2793,13 @@ complete () { } pass() { - local status=PASS - $TEST_FAILED && status=FAIL - echo "$status $testnum $@" 2>&1 | tee -a $TESTSUITELOG + # Set TEST_STATUS here; will be used for logging the result + if [ -f $LOGDIR/err ]; then + TEST_STATUS="FAIL" + else + TEST_STATUS="PASS" + fi + echo $TEST_STATUS " " $@ } check_mds() { @@ -2812,28 +2819,48 @@ run_one() { message=$2 tfile=f${testnum} export tdir=d0.${TESTSUITE}/d${base} - + export TESTNAME=test_$testnum local SAVE_UMASK=`umask` umask 0022 - local BEFORE=`date +%s` echo - log "== test $testnum: $message == `date +%H:%M:%S` ($BEFORE)" - export TESTNAME=test_$testnum - TEST_FAILED=false - cntlog=0 + log "== test $testnum: $message == `date +%H:%M:%S`" test_${testnum} || error "test_$testnum failed with $?" cd $SAVE_PWD reset_fail_loc - check_grant ${testnum} || $TEST_FAILED || error "check_grant $testnum failed" - check_catastrophe || $TEST_FAILED || error "LBUG/LASSERT detected" - ps auxww | grep -v grep | grep -q multiop && ($TEST_FAILED || error "multiop still running") - pass "($((`date +%s` - $BEFORE))s)" - TEST_FAILED=false - cntlog=0 + check_grant ${testnum} || error "check_grant $testnum failed with $?" + check_catastrophe || error "LBUG/LASSERT detected" + ps auxww | grep -v grep | grep -q multiop && error "multiop still running" unset TESTNAME unset tdir umask $SAVE_UMASK + return 0 +} + +run_one_logged() { + local BEFORE=`date +%s` + local TEST_ERROR + local name=${TESTSUITE}.test_${1}.test_log.$(hostname).log + local test_log=$LOGDIR/$name + rm -rf $LOGDIR/err + + log_sub_test_begin test_${1} + (run_one $1 "$2") 2>&1 | tee $test_log + local RC=${PIPESTATUS[0]} + + [ $RC -ne 0 ] && [ ! -f $LOGDIR/err ] && \ + echo "test_$1 returned $RC" | tee $LOGDIR/err + + duration=$((`date +%s` - $BEFORE)) + pass "(${duration}s)" + [ -f $LOGDIR/err ] && TEST_ERROR=$(cat $LOGDIR/err) + log_sub_test_end $TEST_STATUS $duration "$RC" "$TEST_ERROR" + + if [ -f $LOGDIR/err ]; then + $FAIL_ON_ERROR && exit $RC + fi + + return 0 } canonical_path() { @@ -2906,6 +2933,13 @@ remote_mds_nodsh() remote_mds && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$mds_HOST" ] } +require_dsh_mds() +{ + remote_mds_nodsh && echo "SKIP: $TESTSUITE: remote MDS with nodsh" && \ + MSKIPPED=1 && return 1 + return 0 +} + remote_ost () { local node @@ -2917,10 +2951,17 @@ remote_ost () remote_ost_nodsh() { - [ "$CLIENTONLY" ] && return 0 || true + [ "$CLIENTONLY" ] && return 0 || true remote_ost && [ "$PDSH" = "no_dsh" -o -z "$PDSH" -o -z "$ost_HOST" ] } +require_dsh_ost() +{ + remote_ost_nodsh && echo "SKIP: $TESTSUITE: remote OST with nodsh" && \ + OSKIPPED=1 && return 1 + return 0 +} + remote_mgs_nodsh() { local MGS @@ -3140,7 +3181,7 @@ do_and_time () { SECONDS=0 eval '$cmd' - + [ ${PIPESTATUS[0]} -eq 0 ] || rc=1 echo $SECONDS @@ -3210,19 +3251,19 @@ exit \\\$rc;" # $2 file # $3 $RUNAS get_stripe_info() { - local tmp_file + local tmp_file - stripe_size=0 - stripe_count=0 - stripe_index=0 - tmp_file=$(mktemp) + stripe_size=0 + stripe_count=0 + stripe_index=0 + tmp_file=$(mktemp) - do_facet $1 $3 lfs getstripe -v $2 > $tmp_file + do_facet $1 $3 lfs getstripe -v $2 > $tmp_file - stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file` - stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file` - stripe_index=`awk '$1 ~ /stripe_offset/ {print $2}' $tmp_file` - rm -f $tmp_file + stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file` + stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file` + stripe_index=`awk '$1 ~ /stripe_offset/ {print $2}' $tmp_file` + rm -f 
$tmp_file } mdsrate_cleanup () { @@ -3341,7 +3382,7 @@ get_md_name () { ######################## -convert_facet2label() { +convert_facet2label() { local facet=$1 if [ x$facet = xost ]; then @@ -3352,7 +3393,7 @@ convert_facet2label() { if [ -n ${!varsvc} ]; then echo ${!varsvc} - else + else error "No lablel for $facet!" fi } @@ -3420,10 +3461,10 @@ wait_osc_import_state() { CONN_PROC="osc.${ost}.ost_server_uuid" CONN_STATE=$(do_facet $facet lctl get_param -n $CONN_PROC 2>/dev/null | cut -f2) while [ "${CONN_STATE}" != "${expected}" ]; do - if [ "${expected}" == "DISCONN" ]; then + if [ "${expected}" == "DISCONN" ]; then # for disconn we can check after proc entry is removed [ "x${CONN_STATE}" == "x" ] && return 0 - # with AT enabled, we can have connect request timeout near of + # with AT enabled, we can have connect request timeout near of # reconnect timeout and test can't see real disconnect [ "${CONN_STATE}" == "CONNECTING" ] && return 0 fi @@ -3438,7 +3479,6 @@ wait_osc_import_state() { log "${ost_facet} now in ${CONN_STATE} state" return 0 } - get_clientmdc_proc_path() { echo "${1}-mdc-*" } @@ -3447,7 +3487,8 @@ do_rpc_nodes () { local list=$1 shift - local RPATH="PATH=$LUSTRE/tests/:$PATH" + # Add paths to lustre tests for 32 and 64 bit systems. + local RPATH="PATH=$RLUSTRE/tests:/usr/lib/lustre/tests:/usr/lib64/lustre/tests:$PATH" do_nodesv $list "${RPATH} NAME=${NAME} sh rpc.sh $@ " } @@ -3545,27 +3586,30 @@ gather_logs () { # of writing the file to an NFS directory so it doesn't need to be copied. local tmp=$TMP local docp=true - [ -d "$SHARED_DIR_LOGS" ] && tmp=$SHARED_DIR_LOGS && docp=false + [ -f $LOGDIR/shared ] && docp=false # dump lustre logs, dmesg - do_nodes $list "log=$tmp/\\\$(hostname)-debug-$ts.log ; -lctl dk \\\$log >/dev/null; -log=$tmp/\\\$(hostname)-dmesg-$ts.log; -dmesg > \\\$log; " - # FIXME: does it make sense to collect the logs for $ts only, but all - # TESTSUITE logs? - # rsync $TMP/*${TESTSUITE}* to gather the logs dumped by error fn - local logs=$TMP/'*'${TESTSUITE}'*' - if $docp; then - logs=$logs' '$tmp/'*'$ts'*' + prefix="$LOGDIR/${TESTSUITE}.${TESTNAME}" + suffix="$ts.log" + echo "Dumping lctl log to ${prefix}.*.${suffix}" + + if [ "$CLIENTONLY" -o "$PDSH" == "no_dsh" ]; then + echo "Dumping logs only on local client." + $LCTL dk > ${prefix}.debug_log.$(hostname).${suffix} + dmesg > ${prefix}.dmesg.$(hostname).${suffix} + return fi - for node in ${list//,/ }; do - rsync -az $node:"$logs" $TMP - done - local archive=$TMP/${TESTSUITE}-$ts.tar.bz2 - tar -jcf $archive $tmp/*$ts* $TMP/*${TESTSUITE}* + do_nodes --verbose $list \ + "$LCTL dk > ${prefix}.debug_log.\\\$(hostname).${suffix}; + dmesg > ${prefix}.dmesg.\\\$(hostname).${suffix}" + if [ ! -f $LOGDIR/shared ]; then + do_nodes $list rsync -az "${prefix}.*.${suffix}" $HOSTNAME:$LOGDIR + fi + + local archive=$LOGDIR/${TESTSUITE}-$ts.tar.bz2 + tar -jcf $archive $LOGDIR/*$ts* $LOGDIR/*${TESTSUITE}* echo $archive } @@ -3610,11 +3654,11 @@ do_ls () { max_recovery_time () { local init_connect_timeout=$(( TIMEOUT / 20 )) - [[ $init_connect_timeout > 5 ]] || init_connect_timeout=5 + [[ $init_connect_timeout > 5 ]] || init_connect_timeout=5 local service_time=$(( $(at_max_get client) + $(( 2 * $(( 25 + 1 + init_connect_timeout)) )) )) - echo $service_time + echo $service_time } remove_mdt_files() { @@ -3708,3 +3752,65 @@ min_ost_size () { $LCTL get_param -n osc.*.kbytesavail | sort -n | head -n1 } +check_logdir() { + local dir=$1 + # Checking for shared logdir + if [ ! -d $dir ]; then + # Not found. 
Create local logdir + mkdir -p $dir + else + touch $dir/node.$(hostname).yml + fi + return 0 +} + +check_write_access() { + local dir=$1 + for node in $(nodes_list); do + if [ ! -f "$dir/node.${node}.yml" ]; then + # Logdir not accessible/writable from this node. + return 1 + fi + done + return 0 +} + +init_logging() { + if [[ -n $YAML_LOG ]]; then + return + fi + export YAML_LOG=${LOGDIR}/results.yml + mkdir -p $LOGDIR + init_clients_lists + + do_rpc_nodes $(comma_list $(nodes_list)) check_logdir $LOGDIR + if check_write_access $LOGDIR; then + touch $LOGDIR/shared + echo "Logging to shared log directory: $LOGDIR" + else + echo "Logging to local directory: $LOGDIR" + fi + + yml_nodes_file $LOGDIR >> $YAML_LOG + yml_results_file >> $YAML_LOG +} + +log_test() { + yml_log_test $1 >> $YAML_LOG +} + +log_sub_test() { + yml_log_sub_test $@ >> $YAML_LOG +} + +log_test_status() { + yml_log_test_status $@ >> $YAML_LOG +} + +log_sub_test_begin() { + yml_log_sub_test_begin $@ >> $YAML_LOG +} + +log_sub_test_end() { + yml_log_sub_test_end $@ >> $YAML_LOG +} diff --git a/lustre/tests/test-groups/regression b/lustre/tests/test-groups/regression new file mode 100644 index 0000000..1c79bc8 --- /dev/null +++ b/lustre/tests/test-groups/regression @@ -0,0 +1,20 @@ +sanity +metadata-updates +sanity-benchmark +sanityn +lfsck +liblustre +racer +replay-single +conf-sanity +recovery-small +replay-ost-single +replay-dual +replay-vbr +insanity +sanity-quota +ost-pools +lnet-selftest +mmp +obdfilter-survey +sgpdd-survey diff --git a/lustre/tests/test-groups/regression-mpi b/lustre/tests/test-groups/regression-mpi new file mode 100644 index 0000000..fd44302 --- /dev/null +++ b/lustre/tests/test-groups/regression-mpi @@ -0,0 +1,3 @@ +performance-sanity +large-scale +parallel-scale diff --git a/lustre/tests/yaml.sh b/lustre/tests/yaml.sh new file mode 100644 index 0000000..f5803e2 --- /dev/null +++ b/lustre/tests/yaml.sh @@ -0,0 +1,191 @@ +#!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: + +# +# Shell routines for logging results to a yaml file. +# + +split_output() { + while read line; do + host=${line%%:*}; + echo "$line" | sed "s/^${host}: //" | sed "s/^${host}://" \ + >> $logdir/node.$host.yml; + done +} + +yml_nodes_file() { + export logdir=$1 + + if [ -f $logdir/shared ]; then + do_rpc_nodes $(comma_list $(nodes_list)) \ + "yml_node >> $logdir/node.\\\$(hostname).yml" + else + do_rpc_nodes $(comma_list $(nodes_list)) yml_node | split_output + fi + yml_entities +} + +yml_results_file() { + export logdir=$1 + + #TestGroup + yml_test_group + + # Tests + printf "Tests:\n" +} + +# Called on the node for which we the info is needed. 
+yml_node() { + local node=$(hostname) + logdir=$1 + + printf "Build:\n" + yml_build_info + printf "\n" + + printf "Node:\n" + yml_node_info + printf "\n" + + printf "LustreEntities:\n" +} + +yml_test_group() { + TEST_GROUP=${TEST_GROUP:-"acc-sm-$(hostname)"} + TEST_HOST=${TEST_HOST:-$(hostname)} + TEST_USER=${TEST_USER:-$USER} + + # TestGroup information + cat < /dev/null | \ + sed -e 's/\/etc\///' -e 's/-release//' | head -1) + else + dist="UNKNOWN" + fi + + echo $dist +} + +yml_build_info() { + TEST_DISTRO=$(release) + LUSTRE_VERSION=$(lctl lustre_build_version | awk '/Lustre version:/ {print $3}') + LUSTRE_BUILD=$(sed 's/-.*//' <<<$LUSTRE_VERSION) + +cat <> $logdir/node.$host.yml + done + + for num in $(seq $OSTCOUNT); do + host=$(facet_active_host ost$num) + yml_entity "OST $num" $host >> $logdir/node.$host.yml + done + + i=1 + for host in ${CLIENTS//,/ }; do + yml_entity "Client $i" $host >> $logdir/node.$host.yml + i=$((i+1)) + done +} + +yml_log_test() { + if [ $1 != "FINISHED" ]; then + cat <