3 # global variables and functions used by:
4 # recovery-mds-scale, recovery-oss-scale and recovery-random-scale
# Run-time tunables. Each assignment keeps a value that is already set and
# non-empty and otherwise falls back to the default computed here.
# DURATION is the total run length in seconds: 30 minutes when SLOW is "no".
6 if [[ "$SLOW" == "no" ]]; then
7 DURATION=${DURATION:-$((60 * 30))}
# NOTE(review): presumably the default for the other SLOW values (24 hours);
# the branch keyword is not visible in this excerpt -- confirm.
9 DURATION=${DURATION:-$((60 * 60 * 24))}
# Length of one failover iteration, in seconds.
12 SERVER_FAILOVER_PERIOD=${SERVER_FAILOVER_PERIOD:-$((60 * 10))} # 10 minutes
# Minimum time, in seconds, that should remain for sleeping in an iteration;
# iterations that leave less are counted in reqfail by the failover loop.
13 MINSLEEP=${MINSLEEP:-120}
14 REQFAIL_PERCENT=${REQFAIL_PERCENT:-3} # bug17839 comment 62
15 # round up the result of integer division: C=(A + (B - 1)) / B
# The two successive integer divisions act as a single division by
# SERVER_FAILOVER_PERIOD * 100, so the default is
# ceil(DURATION * REQFAIL_PERCENT / (SERVER_FAILOVER_PERIOD * 100)).
# REQFAIL is the limit that the failover loop compares reqfail against.
16 REQFAIL=${REQFAIL:-$(((DURATION * REQFAIL_PERCENT + (SERVER_FAILOVER_PERIOD *
17 100 - 1 )) / SERVER_FAILOVER_PERIOD / 100))}
# END_RUN_FILE defaults to a path under SHARED_DIRECTORY; the two pid files
# default to paths under TMP.
19 END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
20 LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
21 VMSTAT_PID_FILE=${VMSTAT_PID_FILE:-$TMP/vmstat.pid}
# Nodes the client loads are started on; defaults to the CLIENTS list.
23 NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
25 # The test node needs to be insulated from a Lustre failure as much as
26 # possible. Not even loading the lustre modules is ideal.
28 # -- remove hostname from clients list
# NOTE(review): zconf_umount and exclude_items_from_list are defined
# elsewhere; presumably the first unmounts $MOUNT on this host -- confirm.
30 zconf_umount $HOSTNAME $MOUNT
# NODES_TO_USE is replaced by the helper's output, called with the current
# list and $HOSTNAME.
31 NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
# Run-information banner: every line is emitted through the command held in
# $oput, and the reported values are taken from the positional parameters.
37 $oput "******* Run Information *******"
38 $oput "SERVER_FAILOVER_PERIOD:$1"
43 $oput "SHARED_DIRECTORY:$6"
44 $oput "END_RUN_FILE:$7"
45 $oput "LOAD_PID_FILE:$8"
46 $oput "VMSTAT_PID_FILE:$9"
# positional parameters above 9 need braces
47 $oput "CLIENTCOUNT:${10}"
50 $oput "*******************************"
# Look up the failover counter of one facet. The counter is kept in a
# variable named "<facet>_numfailovers" and is read by indirect expansion.
53 server_numfailovers () {
55 local var="${facet}_numfailovers"
# taken only when the counter variable is set and non-empty
58 if [[ -n "${!var}" ]]; then
# Print one "<facet>: <count> times" line for every facet listed in MDTS
# and OSTS, using server_numfailovers for the count.
65 servers_numfailovers () {
# commas in MDTS and OSTS are replaced by spaces so the lists word-split
69 for facet in ${MDTS//,/ } ${OSTS//,/ }; do
70 echo "$facet: $(server_numfailovers $facet) times"
# Print the run summary, stop vmstat and the client loads, and gather logs
# when the return code is non-zero. This function is the handler passed to
# stack_trap for EXIT and INT by the failover driver in this file.
74 summary_and_cleanup () {
78 # A non-empty END_RUN_FILE means that client loads have failed
79 if [[ -s "$END_RUN_FILE" ]]; then
80 print_end_run_file "$END_RUN_FILE"
84 echo $(date +'%F %H:%M:%S') Terminating clients loads ...
# Append this script's name ($0) to END_RUN_FILE; the failed-client list
# built further down filters that entry out again with grep -v.
85 echo "$0" >> "$END_RUN_FILE"
87 if (( rc != 0 )); then
91 log "Duration: $DURATION seconds"
92 log "Server failover period: $SERVER_FAILOVER_PERIOD seconds"
93 log "Exited after: $ELAPSED seconds"
94 log "Number of failovers before exit: $(servers_numfailovers)"
96 log "Return code: $rc"
98 # stop vmstat on OSS nodes
99 if [[ -n "$VMSTAT" ]]; then
100 stop_process $(comma_list $(osts_nodes)) "$VMSTAT_PID_FILE"
103 # stop the client loads
104 stop_client_loads $NODES_TO_USE $LOAD_PID_FILE
106 if (( rc != 0 )); then
107 # we are only interested in failed clients and servers
# every line of END_RUN_FILE except those matching this script's own name
108 local failedclients=$(cat $END_RUN_FILE | grep -v $0)
110 gather_logs $(comma_list $(all_server_nodes) $failedclients)
# Failover driver body. NOTE(review): the enclosing function header is not
# visible in this excerpt.
# The first argument selects the flavor and defaults to "MDS".
120 local flavor=${1:-"MDS"}
122 if [[ "$flavor" == "MDS" ]]; then
# hand summary_and_cleanup to stack_trap for EXIT and INT
128 stack_trap summary_and_cleanup EXIT INT
130 # start vmstat on OSS nodes
131 if [[ -n "$VMSTAT" ]]; then
132 start_vmstat $(comma_list $(osts_nodes)) "$VMSTAT_PID_FILE"
# The main loop below stops as soon as END_RUN_FILE exists, so remove any
# leftover file before the client loads are started.
136 rm -f "$END_RUN_FILE"
137 start_client_loads $NODES_TO_USE
139 echo client loads pids:
# exit with status 3 when do_nodesv fails to cat the pid file on the nodes
140 do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3
# it_time_start is reset at the top of every iteration; start_ts and
# current_ts are used further down to compute ELAPSED in seconds.
145 local it_time_start=0
146 local start_ts=$(date +%s)
147 local current_ts=$start_ts
# run until DURATION seconds have elapsed or END_RUN_FILE appears
149 while (( ELAPSED < DURATION )) && [[ ! -e "$END_RUN_FILE" ]]; do
150 # In order to perform the
151 # expected number of failovers, we need to account for the
153 # 1) the time that has elapsed during the client load checking
154 # 2) the time it takes for failover
155 it_time_start=$(date +%s)
# pick the facet to fail over from $servers and build the name of its
# per-facet failover counter variable
157 serverfacet=$(get_random_entry $servers)
158 var=${serverfacet}_numfailovers
160 # Check that our client loads are still running. If any have
161 # died, that means they have died outside of recovery, which
163 log "==== Check clients loads BEFORE failover: failure NOT OK \
164 ELAPSED=$ELAPSED DURATION=$DURATION \
165 PERIOD=$SERVER_FAILOVER_PERIOD"
# exit with status 4 when the pre-failover load check fails
166 check_client_loads $NODES_TO_USE || exit 4
168 log "Wait $serverfacet recovery complete before next failover"
169 if ! wait_recovery_complete $serverfacet; then
170 echo "$serverfacet recovery is not completed!"
174 log "Checking clients are in FULL or IDLE state before next \
176 wait_clients_import_ready $NODES_TO_USE $serverfacet ||
177 echo "Client import is not ready, please consider" \
178 "to increase SERVER_FAILOVER_PERIOD =" \
179 "$SERVER_FAILOVER_PERIOD!"
181 log "Starting failover on $serverfacet"
182 facet_failover "$serverfacet" || exit 1
184 # Check that our client loads are still running during failover.
185 # No application failures should occur.
186 log "==== Check clients loads AFTER failover: failure NOT OK"
187 if ! check_client_loads $NODES_TO_USE; then
188 log "Client load failed during failover. Exiting..."
192 # Increment the number of failovers.
196 current_ts=$(date +%s)
197 ELAPSED=$((current_ts - start_ts))
199 sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
201 # Keep counting the number of iterations when
202 # time spent to failover and two client loads check exceeded
203 # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
204 if (( sleep < MINSLEEP )); then
205 reqfail=$((reqfail + 1))
208 WARNING: failover and two check_client_loads time
209 exceeded: SERVER_FAILOVER_PERIOD - MINSLEEP!
211 Failed loading I/O for a min period of $MINSLEEP
212 $reqfail times (REQFAIL=$REQFAIL).
214 This iteration, load only applied for sleep $sleep
217 Estimated max recovery time: $MAX_RECOV_TIME
218 Probably hardware is taking excessively long time
221 Try increasing SERVER_FAILOVER_PERIOD
222 (current is $SERVER_FAILOVER_PERIOD), bug 20918.
225 if (( reqfail > REQFAIL )); then
230 log "$serverfacet failed over ${!var} times, and counting..."
232 if (( (ELAPSED + sleep) >= DURATION )); then
236 if (( sleep > 0 )); then
237 echo "sleeping $sleep seconds... "