X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Frecovery-double-scale.sh;h=82dce37dd732d0c6afe6a3cd42655e4638fa9603;hb=bc1d8b792ae61de6cc82175a26b097574c64eb50;hp=18ca852a35fcfd3e69f2c45d014447a00b383e22;hpb=fb05f2d177fbcfc2499008a4bb04fdf64ab19466;p=fs%2Flustre-release.git diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh index 18ca852..82dce37 100644 --- a/lustre/tests/recovery-double-scale.sh +++ b/lustre/tests/recovery-double-scale.sh @@ -1,6 +1,5 @@ #!/bin/bash -# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: - +# # All pairwise combinations of node failures. # Was cmd3-17 # @@ -13,30 +12,31 @@ set -e ONLY=${ONLY:-"$*"} +LUSTRE=${LUSTRE:-$(dirname $0)/..} +. $LUSTRE/tests/test-framework.sh +init_test_env "$@" +init_logging + # bug number for skipped test: ALWAYS_EXCEPT="$RECOVERY_DOUBLE_SCALE_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! -LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} -. $LUSTRE/tests/test-framework.sh -init_test_env $@ -. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -init_logging +build_test_filter -remote_mds_nodsh && skip_env "remote MDS with nodsh" && exit 0 -remote_ost_nodsh && skip_env "remote OST with nodsh" && exit 0 +remote_mds_nodsh && skip_env "remote MDS with nodsh" +remote_ost_nodsh && skip_env "remote OST with nodsh" [ -z "$CLIENTS" -o $CLIENTCOUNT -lt 3 ] && - skip_env "need three or more clients" && exit 0 + skip_env "need three or more clients" if [ -z "$SHARED_DIRECTORY" ] || ! check_shared_dir $SHARED_DIRECTORY; then - skip_env "SHARED_DIRECTORY should be specified with a shared directory \ + skip_env "SHARED_DIRECTORY should be specified with a shared directory \ which is accessable on all of the nodes" - exit 0 + exit 0 fi [[ $FAILURE_MODE = SOFT ]] && \ - log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" + log "WARNING: $0 is not functional with FAILURE_MODE = SOFT, bz22797" # Set SERIAL to serialize the failure through a recovery of the first failure. SERIAL=${SERIAL:-""} @@ -50,62 +50,62 @@ END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file} LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid} reboot_recover_node () { - # item var contains a pair of clients if nodetype=clients - # I would prefer to have a list here - local item=$1 - local nodetype=$2 - local c - - # MDS, OST item contains the facet - case $nodetype in - MDS|OST ) facet_failover $item - [ "$SERIAL" ] && wait_recovery_complete $item || true - ;; - clients) for c in ${item//,/ }; do - # make sure the client loads die - stop_process $c $LOAD_PID_FILE - shutdown_client $c - boot_node $c - echo "Reintegrating $c" - zconf_mount $c $MOUNT || - error "mount $MOUNT on $c failed" - client_up $c || error "start client on $c failed" - done - start_client_loads $item - ;; - * ) echo "ERROR: invalid nodetype=$nodetype." \ - "Must be one of 'MDS', 'OST', or 'clients'." - exit 1;; - esac + # item var contains a pair of clients if nodetype=clients + # I would prefer to have a list here + local item=$1 + local nodetype=$2 + local c + + # MDS, OST item contains the facet + case $nodetype in + MDS|OST ) facet_failover $item + [ "$SERIAL" ] && wait_recovery_complete $item || true + ;; + clients) for c in ${item//,/ }; do + # make sure the client loads die + stop_process $c $LOAD_PID_FILE + shutdown_client $c + boot_node $c + echo "Reintegrating $c" + zconf_mount $c $MOUNT || + error "mount $MOUNT on $c failed" + client_up $c || error "start client on $c failed" + done + start_client_loads $item + ;; + * ) echo "ERROR: invalid nodetype=$nodetype." \ + "Must be one of 'MDS', 'OST', or 'clients'." + exit 1;; + esac } get_item_type () { - local type=$1 - local excluded=${2:-""} - - local list - case $type in - MDS ) list=$MDTS;; - OST ) list=$OSTS;; - clients) list=$NODES_TO_USE;; - * ) echo "ERROR: invalid type=$type." \ + local type=$1 + local excluded=${2:-""} + + local list + case $type in + MDS ) list=$MDTS;; + OST ) list=$OSTS;; + clients) list=$NODES_TO_USE;; + * ) echo "ERROR: invalid type=$type." \ "Must be one of 'MDS', 'OST', or 'clients'." - exit 1;; - esac - - [ "$excluded" ] && list=$(exclude_items_from_list $list $excluded) - # empty list - if [ ! "$(echo $list)" ]; then - echo - return - fi - - local item=$(get_random_entry $list) - if [ "$type" = "clients" ]; then - item="$item $(get_random_entry $(exclude_items_from_list $list $item))" - item=$(comma_list $item) - fi - echo $item + exit 1;; + esac + + [ "$excluded" ] && list=$(exclude_items_from_list $list $excluded) + # empty list + if [ ! "$(echo $list)" ]; then + echo + return + fi + + local item=$(get_random_entry $list) + if [ "$type" = "clients" ]; then + item="$item $(get_random_entry $(exclude_items_from_list $list $item))" + item=$(comma_list $item) + fi + echo $item } # failover_pair @@ -114,99 +114,95 @@ get_item_type () { # class, reboots the nodes sequentially, and then restarts lustre on # the nodes. failover_pair() { - local type1=$1 - local type2=$2 - local title=$3 + local type1=$1 + local type2=$2 + local title=$3 - local client_nodes="" - local item1= - local item2= - local client1= - local client2= + local client_nodes="" + local item1= + local item2= + local client1= + local client2= - log " + log " ==== START === $title" - item1=$(get_item_type $type1) - [ "$item1" ] || \ - { echo "type1=$type1 item1 is empty" && return 0; } - item2=$(get_item_type $type2 $item1) - [ "$item2" ] || \ - { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" \ - && return 0; } - - # Check that our client loads are still running. If any have died, - # that means they have died outside of recovery, which is unacceptable. - log "==== Checking the clients loads BEFORE failover -- failure NOT OK" - # FIXME. need print summary on exit - check_client_loads $NODES_TO_USE || exit $? - - log "Done checking client loads. Failing type1=$type1 item1=$item1 ... " - reboot_recover_node $item1 $type1 || exit $? - - # Hendrix test17 description: - # Introduce a failure, wait at - # least 5 minutes (for recovery), - # introduce a 2nd - # failure, and wait another 5 - # minutes - - # reboot_recover_node waits recovery in according to - # SERIAL value. - # We have a "double failures" if SERIAL is not set, - # do not need a sleep between failures for "double failures" - - log " Failing type2=$type2 item2=$item2 ... " - reboot_recover_node $item2 $type2 || exit $? - - # Client loads are allowed to die while in recovery, so we just - # restart them. - log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK" - restart_client_loads $NODES_TO_USE $ERRORS_OK || exit $? - log "Done checking / re-starting client loads. PASS" - return 0 + item1=$(get_item_type $type1) + [ "$item1" ] || \ + { echo "type1=$type1 item1 is empty" && return 0; } + item2=$(get_item_type $type2 $item1) + [ "$item2" ] || \ + { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" \ + && return 0; } + + # Check that our client loads are still running. If any have died, + # that means they have died outside of recovery, which is unacceptable. + log "==== Checking the clients loads BEFORE failover -- failure NOT OK" + # FIXME. need print summary on exit + check_client_loads $NODES_TO_USE || exit $? + + log "Done checking client loads. Failing type1=$type1 item1=$item1 ... " + reboot_recover_node $item1 $type1 || exit $? + + # Hendrix test17 description: + # Introduce a failure, wait at + # least 5 minutes (for recovery), + # introduce a 2nd + # failure, and wait another 5 + # minutes + + # reboot_recover_node waits recovery in according to + # SERIAL value. + # We have a "double failures" if SERIAL is not set, + # do not need a sleep between failures for "double failures" + + log " Failing type2=$type2 item2=$item2 ... " + reboot_recover_node $item2 $type2 || exit $? + + # Client loads are allowed to die while in recovery, so we just + # restart them. + log "==== Checking the clients loads AFTER failovers -- ERRORS_OK=$ERRORS_OK" + restart_client_loads $NODES_TO_USE $ERRORS_OK || exit $? + log "Done checking / re-starting client loads. PASS" + return 0 } summary_and_cleanup () { - local rc=$? - trap 0 + local rc=$? + trap 0 - CURRENT_TS=$(date +%s) - ELAPSED=$((CURRENT_TS - START_TS)) + CURRENT_TS=$(date +%s) + ELAPSED=$((CURRENT_TS - START_TS)) - # Having not empty END_RUN_FILE means the failed loads only - if [ -s $END_RUN_FILE ]; then - print_end_run_file $END_RUN_FILE - rc=1 - fi + # Having not empty END_RUN_FILE means the failed loads only + if [ -s $END_RUN_FILE ]; then + print_end_run_file $END_RUN_FILE + rc=1 + fi - echo $(date +'%F %H:%M:%S') Terminating clients loads ... - echo "$0" >> $END_RUN_FILE - local result=PASS - [ $rc -eq 0 ] || result=FAIL + echo $(date +'%F %H:%M:%S') Terminating clients loads ... + echo "$0" >> $END_RUN_FILE + local result=PASS + [ $rc -eq 0 ] || result=FAIL - log " + log " Server failover period: $FAILOVER_PERIOD seconds Exited after: $ELAPSED seconds Status: $result: rc=$rc" - # stop the client loads - stop_client_loads $NODES_TO_USE $LOAD_PID_FILE + # stop the client loads + stop_client_loads $NODES_TO_USE $LOAD_PID_FILE - if [ $rc -ne 0 ]; then - # we are interested in only on failed clients and servers - local failedclients=$(cat $END_RUN_FILE | grep -v $0) - # FIXME: need ostfailover-s nodes also for FLAVOR=OST - gather_logs $(comma_list $(osts_nodes) $(mdts_nodes) \ - $mdsfailover_HOST $failedclients) - fi + if [ $rc -ne 0 ]; then + # we are interested in only on failed clients and servers + local failedclients=$(cat $END_RUN_FILE | grep -v $0) + gather_logs $(comma_list $(all_server_nodes) $failedclients) + fi - exit $rc + exit $rc } ################################## Main Flow ################################### -build_test_filter - check_and_setup_lustre rm -rf $DIR/[Rdfs][0-9]* @@ -220,7 +216,7 @@ zconf_umount $HOSTNAME $MOUNT NODES_TO_USE=${NODES_TO_USE:-$CLIENTS} NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME) -check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]} +check_progs_installed $NODES_TO_USE "${CLIENT_LOADS[@]}" MDTS=$(get_facets MDS) OSTS=$(get_facets OST) @@ -232,81 +228,81 @@ CURRENT_TS=$START_TS # Every pairwise combination of client failures (2 clients), # MDS failure, and OST failure will be tested. test_pairwise_fail() { - trap summary_and_cleanup EXIT TERM INT - - # Start client loads. - rm -f $END_RUN_FILE - start_client_loads $NODES_TO_USE - - echo clients load pids: - do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 - - # FIXME: Do we want to have an initial sleep period where the clients - # just run before introducing a failure? - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.1 - failover_pair MDS OST "test 1: failover MDS, then OST ==========" - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.2 - failover_pair MDS clients "test 2: failover MDS, then 2 clients ====" - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.3 - if [ $MDSCOUNT -gt 1 ]; then - failover_pair MDS MDS "test 3: failover MDS, then another MDS ==" - sleep $FAILOVER_PERIOD - else - skip_env "has less than 2 MDTs, test 3 skipped" - fi - - # CMD_TEST_NUM=17.4 - if [ $OSTCOUNT -gt 1 ]; then - failover_pair OST OST "test 4: failover OST, then another OST ==" - sleep $FAILOVER_PERIOD - else - skip_env "has less than 2 OSTs, test 4 skipped" - fi - - # CMD_TEST_NUM=17.5 - failover_pair OST clients "test 5: failover OST, then 2 clients ====" - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.6 - failover_pair OST MDS "test 6: failover OST, then MDS ==========" - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.7 - failover_pair clients MDS "test 7: failover 2 clients, then MDS ====" - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.8 - failover_pair clients OST "test 8: failover 2 clients, then OST ====" - sleep $FAILOVER_PERIOD - - # CMD_TEST_NUM=17.9 - if [ $CLIENTCOUNT -gt 4 ]; then - failover_pair clients clients \ - "test 9: failover 2 clients, then 2 different clients ==" - sleep $FAILOVER_PERIOD - else - skip_env "has less than 5 Clients, test 9 skipped" - fi - - log "==== Checking the clients loads AFTER all failovers -- failure NOT OK" - if ! check_client_loads $NODES_TO_USE; then - log "Client load failed after failover. Exiting..." - exit 5 - fi - - exit 0 + trap summary_and_cleanup EXIT TERM INT + + # Start client loads. + rm -f $END_RUN_FILE + start_client_loads $NODES_TO_USE + + echo clients load pids: + do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3 + + # FIXME: Do we want to have an initial sleep period where the clients + # just run before introducing a failure? + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.1 + failover_pair MDS OST "test 1: failover MDS, then OST ==========" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.2 + failover_pair MDS clients "test 2: failover MDS, then 2 clients ====" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.3 + if [ $MDSCOUNT -gt 1 ]; then + failover_pair MDS MDS "test 3: failover MDS, then another MDS ==" + sleep $FAILOVER_PERIOD + else + skip_env "has less than 2 MDTs, test 3 skipped" + fi + + # CMD_TEST_NUM=17.4 + if [ $OSTCOUNT -gt 1 ]; then + failover_pair OST OST "test 4: failover OST, then another OST ==" + sleep $FAILOVER_PERIOD + else + skip_env "has less than 2 OSTs, test 4 skipped" + fi + + # CMD_TEST_NUM=17.5 + failover_pair OST clients "test 5: failover OST, then 2 clients ====" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.6 + failover_pair OST MDS "test 6: failover OST, then MDS ==========" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.7 + failover_pair clients MDS "test 7: failover 2 clients, then MDS ====" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.8 + failover_pair clients OST "test 8: failover 2 clients, then OST ====" + sleep $FAILOVER_PERIOD + + # CMD_TEST_NUM=17.9 + if [ $CLIENTCOUNT -gt 4 ]; then + failover_pair clients clients \ + "test 9: failover 2 clients, then 2 different clients ==" + sleep $FAILOVER_PERIOD + else + skip_env "has less than 5 Clients, test 9 skipped" + fi + + log "==== Checking the clients loads AFTER all failovers -- failure NOT OK" + if ! check_client_loads $NODES_TO_USE; then + log "Client load failed after failover. Exiting..." + exit 5 + fi + + exit 0 } run_test pairwise_fail "pairwise combination of clients, MDS, and OST failures" zconf_mount $HOSTNAME $MOUNT || error "mount $MOUNT on $HOSTNAME failed" client_up || error "start client on $HOSTNAME failed" -complete $(basename $0) $SECONDS +complete $SECONDS check_and_cleanup_lustre exit_status