X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-double-scale.sh;h=4c0ea6f800209d14e77f26818eac05ecb4deedf4;hp=e77f7fb061eca5dbd80ac3f7746bbac882e52c09;hb=194fc7034057a90e881aaf4f9ee9f68fc6172406;hpb=a7a2133bfab42eba077f1b8d5c991c651c8028c3 diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index e77f7fb..4c0ea6f 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -17,19 +17,26 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-double-scale} +init_logging +TESTSUITELOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh)} DEBUGLOG=$TESTSUITELOG.debug + +cleanup_logs + exec 2>$DEBUGLOG echo "--- env ---" >&2 env >&2 echo "--- env ---" >&2 set -x +[ "$SHARED_DIRECTORY" ] || \ + { skip "$0: Empty SHARED_DIRECTORY" && exit 0; } + [ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; } [ $CLIENTCOUNT -ge 3 ] || \ { skip "$0 Need two or more remote clients, have $CLIENTCOUNT" && exit 0; } -END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY}/end_run_file} +END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 @@ -37,6 +44,9 @@ remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 check_timeout || exit 1 +[[ $FAILURE_MODE = SOFT ]] && \ + log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" + build_test_filter check_and_setup_lustre @@ -70,14 +80,20 @@ reboot_recover_node () { [ "$SERIAL" ] && wait_recovery_complete $item $((timeout * 4)) || true ;; clients) for c in ${item//,/ }; do + # make sure the client loads die + do_nodes $c "set -x; test -f $TMP/client-load.pid && \ + { kill -s TERM \$(cat $TMP/client-load.pid) || true; }" shutdown_client $c boot_node $c echo "Reintegrating $c" - zconf_mount $c $MOUNT || return $? + # one client fails; need dk logs from this client only + zconf_mount $c $MOUNT || NODES="$c $(mdts_nodes) $(osts_nodes)" error_exit "zconf_mount failed" done - start_client_loads $item || return $? + start_client_loads $item ;; - * ) error "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'." + # script failure: + # don't use error (), the logs from all nodes not needed + * ) echo "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'." exit 1;; esac } @@ -92,7 +108,9 @@ get_item_type () { OST ) list=$OSTS;; clients) list=$NODES_TO_USE ;; - * ) error "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'." + # script failure: + # don't use error (), the logs from all nodes not needed + * ) echo "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'." exit 1;; esac @@ -148,7 +166,7 @@ failover_pair() { log "Done checking client loads. Failing type1=$type1 item1=$item1 ... " - reboot_recover_node $item1 $type1 || return $? + reboot_recover_node $item1 $type1 # Hendrix test17 description: # Introduce a failure, wait at @@ -163,7 +181,7 @@ failover_pair() { # do not need a sleep between failures for "double failures" log " Failing type2=$type2 item2=$item2 ... " - reboot_recover_node $item2 $type2 || return $? + reboot_recover_node $item2 $type2 # Client loads are allowed to die while in recovery, so we just # restart them. @@ -189,7 +207,7 @@ summary_and_cleanup () { # actually failed though. the first node in the END_RUN_NODE is # the one we are really interested in. if [ -n "$END_RUN_NODE" ]; then - var=${END_RUN_NODE}_load + var=$(client_var_name $END_RUN_NODE)_load echo "Client load failed on node $END_RUN_NODE" echo echo "client $END_RUN_NODE load debug output :" @@ -219,6 +237,16 @@ Status: $result: rc=$rc" sleep 5 kill -9 $CLIENT_LOAD_PIDS || true fi + + if [ $rc -ne 0 ]; then + # we are interested in only on failed clients and servers + local failedclients=$(cat $END_RUN_FILE | grep -v $0) + # FIXME: need ostfailover-s nodes also for FLAVOR=OST + local product=$(gather_logs $(comma_list $(osts_nodes) \ + $(mdts_nodes) $mdsfailover_HOST $failedclients)) + echo logs files $product + fi + [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT exit $rc } @@ -245,12 +273,8 @@ FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60*5))} # 5 minutes # Start client loads. start_client_loads $NODES_TO_USE echo clients load pids: -if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $TMP/client-load.pid"; then - if [ -e $DEBUGLOG ]; then - exec 2<&- - cat $DEBUGLOG +if ! do_nodesv $NODES_TO_USE "cat $TMP/client-load.pid"; then exit 3 - fi fi # FIXME: Do we want to have an initial sleep period where the clients