From: Yu Jian Date: Mon, 5 Mar 2012 08:08:53 +0000 (+0800) Subject: LU-734 tests: save recovery-*-scale debug logs into $LOGDIR X-Git-Tag: 2.2.51~63 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=285389479125decaefa3d64a8460a7bfe0a7c889;hp=f665b5f7691740cbdae6c96c5d959dfb7659f679 LU-734 tests: save recovery-*-scale debug logs into $LOGDIR This patch changes the specific debug log names of recovery-*-scale tests and saves them into $LOGDIR. Signed-off-by: Yu Jian Change-Id: Id1ad7dd3dde41bd49a53986945d15d80f91718e5 Reviewed-on: http://review.whamcloud.com/2261 Reviewed-by: Minh Diep Reviewed-by: Cliff White Tested-by: Hudson Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index 6bf0b32..805fddc 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -18,10 +18,8 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} -DEBUGLOG=$TESTSUITELOG.debug -cleanup_logs +DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log exec 2>$DEBUGLOG echo "--- env ---" >&2 @@ -86,8 +84,9 @@ reboot_recover_node () { ;; clients) for c in ${item//,/ }; do # make sure the client loads die - do_nodes $c "set -x; test -f $TMP/client-load.pid && \ - { kill -s TERM \$(cat $TMP/client-load.pid) || true; }" + do_nodes $c "set -x; test -f $LOAD_PID_FILE && + { kill -s TERM \\\$(cat $LOAD_PID_FILE); + rm -f $LOAD_PID_FILE || true; }" shutdown_client $c boot_node $c echo "Reintegrating $c" @@ -198,6 +197,7 @@ failover_pair() { summary_and_cleanup () { local rc=$? + local var trap 0 # Having not empty END_RUN_FILE means the failed loads only @@ -207,17 +207,16 @@ summary_and_cleanup () { local END_RUN_NODE= read END_RUN_NODE < $END_RUN_FILE - # a client load will end (i.e. fail) if it finds - # the end run file. that does not mean that that client load - # actually failed though. the first node in the END_RUN_NODE is - # the one we are really interested in. + # A client load will stop if it found the END_RUN_FILE file. + # That does not mean the client load actually failed though. + # The first node in END_RUN_FILE is the one we are interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load echo "Client load failed on node $END_RUN_NODE" echo - echo "client $END_RUN_NODE load debug output :" - local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug - do_node ${END_RUN_NODE} "set -x; [ -e $logfile ] && cat $logfile " || true + echo "Client $END_RUN_NODE load stdout and debug files: + $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log + $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log" fi rc=1 fi @@ -233,8 +232,9 @@ Exited after: $ELAPSED seconds Status: $result: rc=$rc" # make sure the client loads die - do_nodes $NODES_TO_USE "set -x; test -f $TMP/client-load.pid && \ - { kill -s TERM \$(cat $TMP/client-load.pid) || true; }" + do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && + { kill -s TERM \\\$(cat $LOAD_PID_FILE); + rm -f $LOAD_PID_FILE || true; }" # and free up the pdshes that started them, if any are still around if [ -n "$CLIENT_LOAD_PIDS" ]; then @@ -249,7 +249,7 @@ Status: $result: rc=$rc" # FIXME: need ostfailover-s nodes also for FLAVOR=OST local product=$(gather_logs $(comma_list $(osts_nodes) \ $(mdts_nodes) $mdsfailover_HOST $failedclients) 1) - echo logs files $product + echo $product fi [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT @@ -277,9 +277,10 @@ FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes # Start client loads. start_client_loads $NODES_TO_USE + echo clients load pids: -if ! do_nodesv $NODES_TO_USE "cat $TMP/client-load.pid"; then - exit 3 +if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then + exit 3 fi # FIXME: Do we want to have an initial sleep period where the clients diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh index 0f88612..3b016b9 100644 --- a/lustre/tests/recovery-mds-scale.sh +++ b/lustre/tests/recovery-mds-scale.sh @@ -16,10 +16,7 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} -DEBUGLOG=$TESTSUITELOG.debug - -cleanup_logs +DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log exec 2>$DEBUGLOG echo "--- env ---" >&2 @@ -41,6 +38,7 @@ check_shared_dir $SHARED_DIRECTORY || END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} +VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid} remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 @@ -84,8 +82,6 @@ fi rm -f $END_RUN_FILE -vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat - server_numfailovers () { local facet=$1 local var=${facet}_numfailovers @@ -105,7 +101,6 @@ servers_numfailovers () { } summary_and_cleanup () { - local rc=$? local var trap 0 @@ -117,17 +112,16 @@ summary_and_cleanup () { local END_RUN_NODE= read END_RUN_NODE < $END_RUN_FILE - # a client load will end (i.e. fail) if it finds - # the end run file. that does not mean that that client load - # actually failed though. the first node in the END_RUN_NODE is - # the one we are really interested in. + # A client load will stop if it found the END_RUN_FILE file. + # That does not mean the client load actually failed though. + # The first node in END_RUN_FILE is the one we are interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load - echo "Client load failed on node $END_RUN_NODE" + echo "Client load failed on node $END_RUN_NODE" echo - echo "client $END_RUN_NODE load stdout and debug files : - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug" + echo "Client $END_RUN_NODE load stdout and debug files: + $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log + $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log" fi rc=1 fi @@ -146,14 +140,15 @@ Status: $result: rc=$rc" # stop the vmstats on the OSTs if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \ - { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \ - gzip -f9 $vmstatLOG-\$(hostname); }" + do_nodes $(comma_list $(osts_nodes)) "test -f $VMSTAT_PID_FILE && + { kill -s TERM \\\$(cat $VMSTAT_PID_FILE); + rm -f $VMSTAT_PID_FILE || true; }" fi # make sure the client loads die - do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \ - { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }" + do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && + { kill -s TERM \\\$(cat $LOAD_PID_FILE); + rm -f $LOAD_PID_FILE || true; }" # and free up the pdshes that started them, if any are still around if [ -n "$CLIENT_LOAD_PIDS" ]; then @@ -161,13 +156,14 @@ Status: $result: rc=$rc" sleep 5 kill -9 $CLIENT_LOAD_PIDS || true fi + if [ $rc -ne 0 ]; then # we are interested in only on failed clients and servers local failedclients=$(cat $END_RUN_FILE | grep -v $0) # FIXME: need ostfailover-s nodes also for FLAVOR=OST local product=$(gather_logs $(comma_list $(osts_nodes) \ $(mdts_nodes) $mdsfailover_HOST $failedclients) 1) - echo logs files $product + echo $product fi [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT @@ -186,7 +182,9 @@ ELAPSED=0 # vmstat the osts if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null /tmp/vmstat.pid" + do_nodes $(comma_list $(osts_nodes)) \ + "vmstat 1 > $TESTLOG_PREFIX.vmstat.\\\$(hostname -s).log \ + 2>/dev/null $VMSTAT_PID_FILE" fi # Start client loads. @@ -194,7 +192,7 @@ start_client_loads $NODES_TO_USE echo clients load pids: if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then - exit 3 + exit 3 fi MINSLEEP=${MINSLEEP:-120} diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh index 2ee2ea8..8b4506f 100644 --- a/lustre/tests/recovery-random-scale.sh +++ b/lustre/tests/recovery-random-scale.sh @@ -20,10 +20,7 @@ init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} -DEBUGLOG=$TESTSUITELOG.debug - -cleanup_logs +DEBUGLOG=$TESTLOG_PREFIX.suite_debug_log.$(hostname -s).log exec 2>$DEBUGLOG echo "--- env ---" >&2 @@ -45,6 +42,7 @@ check_shared_dir $SHARED_DIRECTORY || END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} +VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid} remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 @@ -80,8 +78,6 @@ fi rm -f $END_RUN_FILE -vmstatLOG=${TESTSUITELOG}_$(basename $0 .sh).vmstat - numfailovers () { local facet local var @@ -95,21 +91,7 @@ numfailovers () { done } -# list is comma separated -print_logs () { - local list=$1 - - do_nodes $list "node=\\\$(hostname) -var=\\\${node}_load -log=${TESTSUITELOG}_run_${!var}.sh-\\\$node.debug -if [ -e \\\$log ] ; then -echo Node \\\$node debug log: -cat \\\$log -fi" -} - summary_and_cleanup () { - local rc=$? local var trap 0 @@ -121,22 +103,20 @@ summary_and_cleanup () { local END_RUN_NODE= read END_RUN_NODE < $END_RUN_FILE - # a client load will end (i.e. fail) if it finds - # the end run file. that does not mean that that client load - # actually failed though. the first node in the END_RUN_NODE is - # the one we are really interested in. + # A client load will stop if it found the END_RUN_FILE file. + # That does not mean the client load actually failed though. + # The first node in END_RUN_FILE is the one we are interested in. if [ -n "$END_RUN_NODE" ]; then var=$(node_var_name $END_RUN_NODE)_load echo "Client load failed on node $END_RUN_NODE" echo - echo "client $END_RUN_NODE load stdout and debug files : - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE} - ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug" + echo "Client $END_RUN_NODE load stdout and debug files: + $TESTLOG_PREFIX.run_${!var}_stdout.$END_RUN_NODE.log + $TESTLOG_PREFIX.run_${!var}_debug.$END_RUN_NODE.log" fi rc=1 fi - echo $(date +'%F %H:%M:%S') Terminating clients loads ... echo "$0" >> $END_RUN_FILE local result=PASS @@ -151,14 +131,15 @@ Status: $result: rc=$rc" # stop the vmstats on the OSTs if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "test -f /tmp/vmstat.pid && \ - { kill -s TERM \$(cat /tmp/vmstat.pid); rm -f /tmp/vmstat.pid; \ - gzip -f9 $vmstatLOG-\$(hostname); }" + do_nodes $(comma_list $(osts_nodes)) "test -f $VMSTAT_PID_FILE && + { kill -s TERM \\\$(cat $VMSTAT_PID_FILE); + rm -f $VMSTAT_PID_FILE || true; }" fi # make sure the client loads die - do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && \ - { kill -s TERM \$(cat $LOAD_PID_FILE) || true; }" + do_nodes $NODES_TO_USE "set -x; test -f $LOAD_PID_FILE && + { kill -s TERM \\\$(cat $LOAD_PID_FILE); + rm -f $LOAD_PID_FILE || true; }" # and free up the pdshes that started them, if any are still around if [ -n "$CLIENT_LOAD_PIDS" ]; then @@ -168,13 +149,12 @@ Status: $result: rc=$rc" fi if [ $rc -ne 0 ]; then - print_logs $NODES_TO_USE # we are interested in only on failed clients and servers local failedclients=$(cat $END_RUN_FILE | grep -v $0) # FIXME: need ostfailover-s nodes also for FLAVOR=OST local product=$(gather_logs $(comma_list $(osts_nodes) \ $(mdts_nodes) $mdsfailover_HOST $failedclients) 1) - echo logs files $product + echo $product fi [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT @@ -193,7 +173,9 @@ ELAPSED=0 # vmstat the osts if [ "$VMSTAT" ]; then - do_nodes $(comma_list $(osts_nodes)) "vmstat 1 > $vmstatLOG-\$(hostname) 2>/dev/null /tmp/vmstat.pid" + do_nodes $(comma_list $(osts_nodes)) \ + "vmstat 1 > $TESTLOG_PREFIX.vmstat.\\\$(hostname -s).log \ + 2>/dev/null $VMSTAT_PID_FILE" fi # Start client loads. @@ -201,7 +183,7 @@ start_client_loads $NODES_TO_USE echo clients load pids: if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then - exit 3 + exit 3 fi START_TS=$(date +%s) diff --git a/lustre/tests/run_IOR.sh b/lustre/tests/run_IOR.sh index 6da7f54..4cd6933 100755 --- a/lustre/tests/run_IOR.sh +++ b/lustre/tests/run_IOR.sh @@ -3,9 +3,9 @@ set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} diff --git a/lustre/tests/run_dbench.sh b/lustre/tests/run_dbench.sh index f1520e8..d1a4a38 100755 --- a/lustre/tests/run_dbench.sh +++ b/lustre/tests/run_dbench.sh @@ -3,9 +3,9 @@ set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} diff --git a/lustre/tests/run_dd.sh b/lustre/tests/run_dd.sh index d8151d8..0f2a1f9 100755 --- a/lustre/tests/run_dd.sh +++ b/lustre/tests/run_dd.sh @@ -3,9 +3,9 @@ set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} diff --git a/lustre/tests/run_iozone.sh b/lustre/tests/run_iozone.sh index 297142d..01eb9fe 100755 --- a/lustre/tests/run_iozone.sh +++ b/lustre/tests/run_iozone.sh @@ -3,9 +3,9 @@ set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} diff --git a/lustre/tests/run_tar.sh b/lustre/tests/run_tar.sh index 1bc47e9..9ad3a58 100755 --- a/lustre/tests/run_tar.sh +++ b/lustre/tests/run_tar.sh @@ -3,9 +3,9 @@ set -x TMP=${TMP:-/tmp} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-mds-scale} -LOG=${TESTSUITELOG}_$(basename $0)-$(hostname) -DEBUGLOG=${LOG}.debug +TESTLOG_PREFIX=${TESTLOG_PREFIX:-$TMP/recovery-mds-scale} +LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log +DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/') mkdir -p ${LOG%/*} diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 94867a5..9146697 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -133,10 +133,9 @@ init_test_env() { #[ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export TESTSUITELOG=${TMP}/${TESTSUITE}.log - if [[ -z $LOGDIRSET ]]; then - export LOGDIR=${LOGDIR:-${TMP}/test_logs/}/$(date +%s) - export LOGDIRSET=true - fi + export LOGDIR=${LOGDIR:-${TMP}/test_logs/$(date +%s)} + export TESTLOG_PREFIX=$LOGDIR/$TESTSUITE + export HOSTNAME=${HOSTNAME:-$(hostname -s)} if ! echo $PATH | grep -q $LUSTRE/utils; then export PATH=$LUSTRE/utils:$PATH @@ -1111,11 +1110,14 @@ start_client_load() { BREAK_ON_ERROR=$BREAK_ON_ERROR \ END_RUN_FILE=$END_RUN_FILE \ LOAD_PID_FILE=$LOAD_PID_FILE \ - TESTSUITELOG=$TESTSUITELOG \ + TESTLOG_PREFIX=$TESTLOG_PREFIX \ run_${load}.sh" & - CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!" + local ppid=$! log "Started client load: ${load} on $client" + # get the children process IDs + local pids=$(ps --ppid $ppid -o pid= | xargs) + CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $ppid $pids" return 0 } @@ -4254,7 +4256,7 @@ gather_logs () { # dump lustre logs, dmesg - prefix="$LOGDIR/${TESTSUITE}.${TESTNAME}" + prefix="$TESTLOG_PREFIX.$TESTNAME" suffix="$ts.log" echo "Dumping lctl log to ${prefix}.*.${suffix}" @@ -4280,12 +4282,6 @@ gather_logs () { fi } -cleanup_logs () { - local list=${1:-$(comma_list $(nodes_list))} - - [ -n ${TESTSUITE} ] && do_nodes $list "rm -f $TMP/*${TESTSUITE}*" || true -} - do_ls () { local mntpt_root=$1 local num_mntpts=$2