# lustre/tests/recovery-scale-lib.sh

   1 #!/bin/bash
   2
   3 # global variables and functions used by:
   4 # recovery-mds-scale, recovery-oss-scale and recovery-random-scale
   5
   6 if [[ "$SLOW" == "no" ]]; then
   7         DURATION=${DURATION:-$((60 * 30))}
   8 else
   9         DURATION=${DURATION:-$((60 * 60 * 24))}
  10 fi
  11
  12 SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
  13 MINSLEEP=${MINSLEEP:-120}
  14 REQFAIL_PERCENT=${REQFAIL_PERCENT:-3}    # bug17839 comment 62
  15 # round up the result of integer division: C=(A + (B - 1)) / B
  16 REQFAIL=${REQFAIL:-$(((DURATION * REQFAIL_PERCENT + (SERVER_FAILOVER_PERIOD *
  17         100 - 1 )) / SERVER_FAILOVER_PERIOD / 100))}
  18
  19 END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
  20 LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
  21 VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
  22
  23 NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
  24
  25 # The test node needs to be insulated from a Lustre failure as much as
  26 # possible. Not even loading the lustre modules is ideal.
  27 # -- umount lustre
  28 # -- remove hostname from clients list
  29 insulate_clients() {
  30         zconf_umount $HOSTNAME $MOUNT
  31         NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
  32 }
  33
  34 run_info() {
  35         local oput=echo
  36
  37         $oput "******* Run Information *******"
  38         $oput "SERVER_FAILOVER_PERIOD:$1"
  39         $oput "DURATION:$2"
  40         $oput "MINSLEEP:$3"
  41         $oput "SLOW:$4"
  42         $oput "REQFAIL:$5"
  43         $oput "SHARED_DIRECTORY:$6"
  44         $oput "END_RUN_FILE:$7"
  45         $oput "LOAD_PID_FILE:$8"
  46         $oput "VMSTAT_PID_FILE:$9"
  47         $oput "CLIENTCOUNT:${10}"
  48         $oput "MDTS:${11}"
  49         $oput "OSTS:${12}"
  50         $oput "*******************************"
  51 }
  52
  53 server_numfailovers () {
  54         local facet="$1"
  55         local var="${facet}_numfailovers"
  56         local val=0
  57
  58         if [[ -n "${!var}" ]]; then
  59                 val="${!var}"
  60         fi
  61
  62         echo "$val"
  63 }
  64
  65 servers_numfailovers () {
  66         local facet
  67         local var
  68
  69         for facet in ${MDTS//,/ } ${OSTS//,/ }; do
  70                 echo "$facet: $(server_numfailovers $facet) times"
  71         done
  72 }
  73
  74 summary_and_cleanup () {
  75         local rc=$?
  76         local result=PASS
  77
  78         # Having not empty END_RUN_FILE means the failed loads only
  79         if [[ -s "$END_RUN_FILE" ]]; then
  80                 print_end_run_file "$END_RUN_FILE"
  81                 rc=1
  82         fi
  83
  84         echo $(date +'%F %H:%M:%S') Terminating clients loads ...
  85         echo "$0" >> "$END_RUN_FILE"
  86
  87         if (( rc != 0 )); then
  88                 result=FAIL
  89         fi
  90
  91         log "Duration:               $DURATION seconds"
  92         log "Server failover period: $SERVER_FAILOVER_PERIOD seconds"
  93         log "Exited after:           $ELAPSED seconds"
  94         log "Number of failovers before exit: $(servers_numfailovers)"
  95         log "Status: $result"
  96         log "Return code: $rc"
  97
  98         # stop vmstat on OSS nodes
  99         if [[ -n "$VMSTAT" ]]; then
 100                 stop_process $(comma_list $(osts_nodes)) "$VMSTAT_PID_FILE"
 101         fi
 102
 103         # stop the client loads
 104         stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
 105
 106         if (( rc != 0 )); then
 107                 # we are interested in only on failed clients and servers
 108                 local failedclients=$(cat $END_RUN_FILE | grep -v $0)
 109
 110                 gather_logs $(comma_list $(all_server_nodes) $failedclients)
 111         fi
 112
 113         exit $rc
 114 }
 115
 116 failover_target() {
 117         local servers
 118         local serverfacet
 119         local var
 120         local flavor=${1:-"MDS"}
 121
 122         if [[ "$flavor" == "MDS" ]]; then
 123                 servers=$MDTS
 124         else
 125                 servers=$OSTS
 126         fi
 127
 128         stack_trap summary_and_cleanup EXIT INT
 129
 130         # start vmstat on OSS nodes
 131         if [[ -n "$VMSTAT" ]]; then
 132                 start_vmstat $(comma_list $(osts_nodes)) "$VMSTAT_PID_FILE"
 133         fi
 134
 135         # start client loads
 136         rm -f "$END_RUN_FILE"
 137         start_client_loads $NODES_TO_USE
 138
 139         echo client loads pids:
 140         do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
 141
 142         ELAPSED=0
 143         local sleep=0
 144         local reqfail=0
 145         local it_time_start=0
 146         local start_ts=$(date +%s)
 147         local current_ts=$start_ts
 148
 149         while (( ELAPSED < DURATION )) && [[ ! -e "$END_RUN_FILE" ]]; do
 150                 # In order to perform the
 151                 # expected number of failovers, we need to account the
 152                 # following:
 153                 # 1) the time that has elapsed during the client load checking
 154                 # 2) time takes for failover
 155                 it_time_start=$(date +%s)
 156
 157                 serverfacet=$(get_random_entry $servers)
 158                 var=${serverfacet}_numfailovers
 159
 160                 # Check that our client loads are still running. If any have
 161                 # died, that means they have died outside of recovery, which
 162                 # is unacceptable.
 163                 log "==== Check clients loads BEFORE failover: failure NOT OK \
 164                      ELAPSED=$ELAPSED DURATION=$DURATION \
 165                      PERIOD=$SERVER_FAILOVER_PERIOD"
 166                 check_client_loads $NODES_TO_USE || exit 4
 167
 168                 log "Wait $serverfacet recovery complete before next failover"
 169                 if ! wait_recovery_complete $serverfacet; then
 170                     echo "$serverfacet recovery is not completed!"
 171                     exit 7
 172                 fi
 173
 174                 log "Checking clients are in FULL or IDLE state before next \
 175                      failover"
 176                 wait_clients_import_ready $NODES_TO_USE $serverfacet ||
 177                         echo "Client import is not ready, please consider" \
 178                              "to increase SERVER_FAILOVER_PERIOD =" \
 179                              "$SERVER_FAILOVER_PERIOD!"
 180
 181                 log "Starting failover on $serverfacet"
 182                 facet_failover "$serverfacet" || exit 1
 183
 184                 # Check that our client loads are still running during failover.
 185                 # No application failures should occur.
 186                 log "==== Check clients loads AFTER failover: failure NOT OK"
 187                 if ! check_client_loads $NODES_TO_USE; then
 188                         log "Client load failed during failover. Exiting..."
 189                         exit 5
 190                 fi
 191
 192                 # Increment the number of failovers.
 193                 val=$((${!var} + 1))
 194                 eval $var=$val
 195
 196                 current_ts=$(date +%s)
 197                 ELAPSED=$((current_ts - start_ts))
 198
 199                 sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
 200
 201                 # Keep counting the number of iterations when
 202                 # time spent to failover and two client loads check exceeded
 203                 # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
 204                 if (( sleep < MINSLEEP )); then
 205                         reqfail=$((reqfail + 1))
 206
 207                         cat <<- END_WARN
 208                         WARNING: failover and two check_client_loads time
 209                         exceeded: SERVER_FAILOVER_PERIOD - MINSLEEP!
 210
 211                         Failed loading I/O for a min period of $MINSLEEP
 212                         $reqfail times (REQFAIL=$REQFAIL).
 213
 214                         This iteration, load only applied for sleep $sleep
 215                         seconds.
 216
 217                         Estimated max recovery time: $MAX_RECOV_TIME
 218                         Probably hardware is taking excessively long time
 219                         to boot.
 220
 221                         Try increasing SERVER_FAILOVER_PERIOD
 222                         (current is $SERVER_FAILOVER_PERIOD), bug 20918.
 223                         END_WARN
 224
 225                         if (( reqfail > REQFAIL )); then
 226                                 exit 6
 227                         fi
 228                 fi
 229
 230                 log "$serverfacet failed over ${!var} times, and counting..."
 231
 232                 if (( (ELAPSED + sleep) >= DURATION )); then
 233                         break
 234                 fi
 235
 236                 if (( sleep > 0 )); then
 237                         echo "sleeping $sleep seconds... "
 238                         sleep $sleep
 239                 fi
 240         done
 241
 242         exit 0
 243 }