X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-double-scale.sh;h=72b6d3c7378d3631f916dc43158f654c0aa4f07f;hb=b594948509f42859565d3ac141621b0f35d806d2;hp=81757e2bb446afcc0726fc57fef1c1f6a4c05c21;hpb=3131ab26bcfe548b6f8a5ba709460af9c88ae044;p=fs%2Flustre-release.git diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index 81757e2..72b6d3c 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -17,8 +17,12 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-double-scale} +init_logging +TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug + +cleanup_logs + exec 2>$DEBUGLOG echo "--- env ---" >&2 env >&2 @@ -26,11 +30,13 @@ echo "--- env ---" >&2 set -x [ "$SHARED_DIRECTORY" ] || \ - { skip "$0: Empty SHARED_DIRECTORY" && exit 0; } + { FAIL_ON_ERROR=true skip_env "$0 Empty SHARED_DIRECTORY" && exit 0; } + +[ -n "$CLIENTS" ] || \ + { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients" && exit 0; } -[ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; } [ $CLIENTCOUNT -ge 3 ] || \ - { skip "$0 Need two or more remote clients, have $CLIENTCOUNT" && exit 0; } + { FAIL_ON_ERROR=true skip_env "$0 Need two or more remote clients, have $((CLIENTCOUNT - 1))" && exit 0; } END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} @@ -40,6 +46,9 @@ remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 check_timeout || exit 1 +[[ $FAILURE_MODE = SOFT ]] && \ + log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" + build_test_filter check_and_setup_lustre @@ -70,9 +79,12 @@ reboot_recover_node () { # MDS, OST item contains the facet case $nodetype in MDS|OST ) facet_failover $item - [ "$SERIAL" ] && wait_recovery_complete $item $((timeout * 4)) || true + [ "$SERIAL" ] && wait_recovery_complete $item || true ;; clients) for c in ${item//,/ }; do + # make sure the client loads die + do_nodes $c "set -x; test -f $TMP/client-load.pid && \ + { kill -s TERM \$(cat $TMP/client-load.pid) || true; }" shutdown_client $c boot_node $c echo "Reintegrating $c" @@ -197,7 +209,7 @@ summary_and_cleanup () { # actually failed though. the first node in the END_RUN_NODE is # the one we are really interested in. if [ -n "$END_RUN_NODE" ]; then - var=$(client_var_name $END_RUN_NODE)_load + var=$(node_var_name $END_RUN_NODE)_load echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load debug output :" @@ -227,6 +239,16 @@ Status: $result: rc=$rc" sleep 5 kill -9 $CLIENT_LOAD_PIDS || true fi + + if [ $rc -ne 0 ]; then + # we are interested in only on failed clients and servers + local failedclients=$(cat $END_RUN_FILE | grep -v $0) + # FIXME: need ostfailover-s nodes also for FLAVOR=OST + local product=$(gather_logs $(comma_list $(osts_nodes) \ + $(mdts_nodes) $mdsfailover_HOST $failedclients)) + echo logs files $product + fi + [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT exit $rc } @@ -253,12 +275,8 @@ FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes # Start client loads. start_client_loads $NODES_TO_USE echo clients load pids: -if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $TMP/client-load.pid"; then - if [ -e $DEBUGLOG ]; then - exec 2<&- - cat $DEBUGLOG +if ! do_nodesv $NODES_TO_USE "cat $TMP/client-load.pid"; then exit 3 - fi fi # FIXME: Do we want to have an initial sleep period where the clients