X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-random-scale.sh;h=6cedb2bdae7c5cee929fd88aa1a3649dd241c1d4;hb=2bcb82cd77f5cb616ede36a3fabf351222f069b7;hp=878e59434cb0a70e74bd397fa6c3c6490a4f1df4;hpb=d721e68a4f81dd3e74084eae4d2a32bdacffb40b;p=fs%2Flustre-release.git diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh index 878e594..6cedb2b 100644 --- a/lustre/tests/recovery-random-scale.sh +++ b/lustre/tests/recovery-random-scale.sh @@ -18,9 +18,13 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-random-scale} +TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug + +cleanup_logs + exec 2>$DEBUGLOG echo "--- env ---" >&2 env >&2 @@ -155,13 +159,16 @@ Status: $result: rc=$rc" if [ $rc -ne 0 ]; then print_logs $NODES_TO_USE + # we are interested in only on failed clients and servers + local failedclients=$(cat $END_RUN_FILE | grep -v $0) + # FIXME: need ostfailover-s nodes also for FLAVOR=OST + local product=$(gather_logs $(comma_list $(osts_nodes) \ + $(mdts_nodes) $mdsfailover_HOST $failedclients)) + echo logs files $product fi - if [ $rc -eq 0 ]; then - zconf_mount $(hostname) $MOUNT - else - error "exited with rc=$rc" - fi + [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT + exit $rc } @@ -183,12 +190,8 @@ fi start_client_loads $NODES_TO_USE echo clients load pids: -if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $LOAD_PID_FILE"; then - if [ -e $DEBUGLOG ]; then - exec 2<&- - cat $DEBUGLOG +if ! do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE"; then exit 3 - fi fi START_TS=$(date +%s) @@ -285,9 +288,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ) if [ $sleep -lt $MINSLEEP ]; then reqfail=$((reqfail +1)) - log "WARNING: failover, client reintegration and check_client_loads time -exceeded SERVER_FAILOVER_PERIOD - MINSLEEP ! -Failed to meet interval $reqfail times ( REQFAIL=$REQFAIL ); have sleep=$sleep" + log "WARNING: failover, client reintegration and check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP ! +Failed to load the filesystem with I/O for a minimum period of $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ). +This iteration, the load was only applied for sleep=$sleep seconds. +Probably the hardware is taking excessively long to boot. +Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918" [ $reqfail -gt $REQFAIL ] && exit 6 fi