X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-random-scale.sh;h=5fc20c8e2da11cdad61cb40eb0bbb0f3c7b013b0;hp=f1116d6a9c511d89d5482e5763f11bb6536e7198;hb=9c3142af6f7f3ec25de78d9639ed4193a8de4739;hpb=a98728e4fd673ebe7a7d1d3f15a5a06d1efec9e3 diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh index f1116d6..5fc20c8 100644 --- a/lustre/tests/recovery-random-scale.sh +++ b/lustre/tests/recovery-random-scale.sh @@ -15,7 +15,7 @@ ONLY=${ONLY:-"$*"} LUSTRE=${LUSTRE:-$(dirname $0)/..} . $LUSTRE/tests/test-framework.sh -init_test_env $@ +init_test_env "$@" init_logging # bug number for skipped test: @@ -24,29 +24,28 @@ ALWAYS_EXCEPT="$RECOVERY_RANDOM_SCALE_EXCEPT" build_test_filter -remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0 +remote_mds_nodsh && skip_env "remote MDS with nodsh" +remote_ost_nodsh && skip_env "remote OST with nodsh" [ $CLIENTCOUNT -lt 3 ] && - skip_env "need three or more clients" && exit 0 + skip_env "need three or more clients" if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then - skip_env "SHARED_DIRECTORY should be specified with a shared directory \ -which is accessable on all of the nodes" - exit 0 + skip_env "SHARED_DIRECTORY should be specified with a shared directory \ +which is accessible on all of the nodes" fi [[ $FAILURE_MODE = SOFT ]] && \ - log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" + log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" # Application failures are allowed for the failed client # but not for other clients. ERRORS_OK="yes" if [ "$SLOW" = "no" ]; then - DURATION=${DURATION:-$((60 * 30))} + DURATION=${DURATION:-$((60 * 30))} else - DURATION=${DURATION:-$((60 * 60 * 24))} + DURATION=${DURATION:-$((60 * 60 * 24))} fi SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes @@ -74,32 +73,32 @@ numfailovers () { } summary_and_cleanup () { - local rc=$? - trap 0 + local rc=$? - # Having not empty END_RUN_FILE means the failed loads only - if [ -s $END_RUN_FILE ]; then - print_end_run_file $END_RUN_FILE - rc=1 - fi + # Having not empty END_RUN_FILE means the failed loads only + if [ -s $END_RUN_FILE ]; then + print_end_run_file $END_RUN_FILE + rc=1 + fi - echo $(date +'%F %H:%M:%S') Terminating clients loads ... - echo "$0" >> $END_RUN_FILE - local result=PASS - [ $rc -eq 0 ] || result=FAIL + echo $(date +'%F %H:%M:%S') Terminating clients loads ... + echo "$0" >> $END_RUN_FILE + local result=PASS + [ $rc -eq 0 ] || result=FAIL - log "Duration: $DURATION + log "Duration: $DURATION Server failover period: $SERVER_FAILOVER_PERIOD seconds Exited after: $ELAPSED seconds Number of failovers before exit: $(numfailovers) Status: $result: rc=$rc" - # stop vmstat on OSS nodes - [ "$VMSTAT" ] && stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE + # stop vmstat on OSS nodes + [ "$VMSTAT" ] && + stop_process $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE - # stop the client loads - stop_client_loads $NODES_TO_USE $LOAD_PID_FILE + # stop the client loads + stop_client_loads $NODES_TO_USE $LOAD_PID_FILE if [ $rc -ne 0 ]; then # we are interested in only on failed clients and servers @@ -107,7 +106,7 @@ Status: $result: rc=$rc" gather_logs $(comma_list $(all_server_nodes) $failedclients) fi - exit $rc + exit $rc } ################################## Main Flow ################################### @@ -124,116 +123,121 @@ zconf_umount $HOSTNAME $MOUNT NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME) -check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} +check_progs_installed $NODES_TO_USE "${CLIENT_LOADS[@]}" MDTS=$(get_facets MDS) # Fail a random client and then failover a random MDS. test_fail_client_mds() { - local fail_client - local serverfacet - local client_var - local var - - trap summary_and_cleanup EXIT INT - - # start vmstat on OSS nodes - [ "$VMSTAT" ] && start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE - - # start client loads - rm -f $END_RUN_FILE - start_client_loads $NODES_TO_USE - - echo client loads pids: - do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 - - ELAPSED=0 - local sleep=0 - local reqfail=0 - local it_time_start - local start_ts=$(date +%s) - local current_ts=$start_ts - - while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do - # In order to perform the - # expected number of failovers, we need to account the following: - # 1) the time that has elapsed during the client load checking - # 2) time takes for failover - it_time_start=$(date +%s) - - fail_client=$(get_random_entry $NODES_TO_USE) - client_var=$(node_var_name $fail_client)_nums - - # store the list of failed clients - # lists are comma separated - FAILED_CLIENTS=$(expand_list $FAILED_CLIENTS $fail_client) - - serverfacet=$(get_random_entry $MDTS) - var=$(node_var_name $serverfacet)_nums - - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. - log "==== Checking the clients loads BEFORE failover -- failure NOT OK \ - ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" - check_client_loads $NODES_TO_USE || exit 4 - - log "FAIL CLIENT $fail_client..." - shutdown_client $fail_client - - log "Starting failover on $serverfacet" - facet_failover "$serverfacet" || exit 1 - - if ! wait_recovery_complete $serverfacet; then - echo "$serverfacet recovery is not completed!" - exit 7 - fi - - boot_node $fail_client - echo "Reintegrating $fail_client" - zconf_mount $fail_client $MOUNT || exit $? - client_up $fail_client || exit $? - - # Increment the number of failovers - val=$((${!var} + 1)) - eval $var=$val - val=$((${!client_var} + 1)) - eval $client_var=$val - - # load script on failed clients could create END_RUN_FILE - # We shuold remove it and ignore the failure if this - # file contains the failed client only. - # We can not use ERRORS_OK when start all loads at the start of - # this script because the application errors allowed for random - # failed client only, but not for all clients. - if [ -e $END_RUN_FILE ]; then - local end_run_node - read end_run_node < $END_RUN_FILE - [[ $end_run_node = $fail_client ]] && - rm -f $END_RUN_FILE || exit 13 - fi - - restart_client_loads $fail_client $ERRORS_OK || exit $? - - # Check that not failed clients loads are still running. - # No application failures should occur on clients that were not failed. - log "==== Checking the clients loads AFTER failed client reintegrated \ --- failure NOT OK" - if ! ERRORS_OK= check_client_loads \ - $(exclude_items_from_list $NODES_TO_USE $fail_client); then - log "Client load failed. Exiting..." - exit 5 - fi - - current_ts=$(date +%s) - ELAPSED=$((current_ts - start_ts)) - sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start))) - - # Keep counting the number of iterations when - # time spent to failover and two client loads check exceeded - # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ). - if [ $sleep -lt $MINSLEEP ]; then - reqfail=$((reqfail + 1)) - log "WARNING: failover, client reintegration and \ + local fail_client + local serverfacet + local client_var + local var + + stack_trap summary_and_cleanup EXIT INT + + # start vmstat on OSS nodes + [ "$VMSTAT" ] && + start_vmstat $(comma_list $(osts_nodes)) $VMSTAT_PID_FILE + + # start client loads + rm -f $END_RUN_FILE + start_client_loads $NODES_TO_USE + + echo client loads pids: + do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 + + ELAPSED=0 + local it_time_start + local sleep=0 + local reqfail=0 + local start_ts=$(date +%s) + local current_ts=$start_ts + + while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do + # In order to perform the + # expected number of failovers, we need to account the + # following: + # 1) the time that has elapsed during the client load checking + # 2) time takes for failover + it_time_start=$(date +%s) + + fail_client=$(get_random_entry $NODES_TO_USE) + client_var=$(node_var_name $fail_client)_nums + + # store the list of failed clients + # lists are comma separated + FAILED_CLIENTS=$(expand_list $FAILED_CLIENTS $fail_client) + + serverfacet=$(get_random_entry $MDTS) + var=$(node_var_name $serverfacet)_nums + + # Check that our client loads are still running. If any have + # died, that means they have died outside of recovery, which + # is unacceptable. + log "==== Checking clients loads BEFORE failover -- failure NOT OK \ + ELAPSED=$ELAPSED DURATION=$DURATION \ + PERIOD=$SERVER_FAILOVER_PERIOD" + check_client_loads $NODES_TO_USE || exit 4 + + log "FAIL CLIENT $fail_client..." + shutdown_client $fail_client + + log "Starting failover on $serverfacet" + facet_failover "$serverfacet" || exit 1 + + if ! wait_recovery_complete $serverfacet; then + echo "$serverfacet recovery is not completed!" + exit 7 + fi + + boot_node $fail_client + echo "Reintegrating $fail_client" + zconf_mount $fail_client $MOUNT || exit $? + client_up $fail_client || exit $? + + # Increment the number of failovers + val=$((${!var} + 1)) + eval $var=$val + val=$((${!client_var} + 1)) + eval $client_var=$val + + # load script on failed clients could create END_RUN_FILE + # We shuold remove it and ignore the failure if this + # file contains the failed client only. + # We can not use ERRORS_OK when start all loads at the start of + # this script because the application errors allowed for random + # failed client only, but not for all clients. + if [ -e $END_RUN_FILE ]; then + local end_run_node + read end_run_node < $END_RUN_FILE + [[ $end_run_node = $fail_client ]] && + rm -f $END_RUN_FILE || exit 13 + fi + + restart_client_loads $fail_client $ERRORS_OK || exit $? + + # Check that not failed clients loads are still running. + # No application failures should occur on clients that were + # not failed. + log "==== Checking clients loads AFTER failed client reintegrated \ + -- failure NOT OK" + if ! ERRORS_OK= check_client_loads \ + $(exclude_items_from_list $NODES_TO_USE $fail_client); then + log "Client load failed. Exiting..." + exit 5 + fi + + current_ts=$(date +%s) + ELAPSED=$((current_ts - start_ts)) + sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start))) + + # Keep counting the number of iterations when + # time spent to failover and two client loads check exceeded + # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ). + if [ $sleep -lt $MINSLEEP ]; then + reqfail=$((reqfail + 1)) + log "WARNING: failover, client reintegration and \ check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP! Failed to load the filesystem with I/O for a minimum period of \ $MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ). @@ -242,20 +246,20 @@ Estimated max recovery time : $MAX_RECOV_TIME Probably the hardware is taking excessively long time to boot. Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \ bug 20918" - [ $reqfail -gt $REQFAIL ] && exit 6 - fi + [ $reqfail -gt $REQFAIL ] && exit 6 + fi - log "Number of failovers: + log "Number of failovers: $(numfailovers) and counting..." - [ $((ELAPSED + sleep)) -ge $DURATION ] && break + [ $((ELAPSED + sleep)) -ge $DURATION ] && break - if [ $sleep -gt 0 ]; then - echo "sleeping $sleep seconds... " - sleep $sleep - fi - done - exit 0 + if [ $sleep -gt 0 ]; then + echo "sleeping $sleep seconds... " + sleep $sleep + fi + done + exit 0 } run_test fail_client_mds "fail client, then failover MDS"