Whamcloud - gitweb
LU-17452 tests: fix interop sanityn tests with b2_15
[fs/lustre-release.git] / lustre / tests / recovery-scale-lib.sh
1 #!/bin/bash
2
3 # global variables and functions used by:
4 # recovery-mds-scale, recovery-oss-scale and recovery-random-scale
5
# Overall run length in seconds: 30 minutes when SLOW is "no", otherwise
# 24 hours. A DURATION supplied by the caller always takes precedence.
case "$SLOW" in
no)
	: "${DURATION:=$((60 * 30))}"
	;;
*)
	: "${DURATION:=$((60 * 60 * 24))}"
	;;
esac

: "${SERVER_FAILOVER_PERIOD:=$((60 * 10))}"	# 10 minutes
: "${MINSLEEP:=120}"
: "${REQFAIL_PERCENT:=3}"	# bug17839 comment 62
# Ceiling of (DURATION * REQFAIL_PERCENT) / (SERVER_FAILOVER_PERIOD * 100),
# using the integer identity ceil(A / B) = (A + (B - 1)) / B.
: "${REQFAIL:=$(((DURATION * REQFAIL_PERCENT + SERVER_FAILOVER_PERIOD * 100 - 1) /
	SERVER_FAILOVER_PERIOD / 100))}"

# Files used to coordinate with the client loads and the vmstat collectors.
: "${END_RUN_FILE:=$SHARED_DIRECTORY/end_run_file}"
: "${LOAD_PID_FILE:=$TMP/client-load.pid}"
: "${VMSTAT_PID_FILE:=$TMP/vmstat.pid}"

# Nodes the client loads are started on; defaults to the CLIENTS list.
: "${NODES_TO_USE:=$CLIENTS}"
24
# Insulate the node driving the test from a Lustre failure as much as
# possible (ideally it would not even load the Lustre modules):
# -- unmount Lustre ($MOUNT) on this node ($HOSTNAME)
# -- remove this node from NODES_TO_USE
# Globals: HOSTNAME, MOUNT (read); NODES_TO_USE (read and rewritten)
insulate_clients() {
	zconf_umount "$HOSTNAME" "$MOUNT"
	# Both arguments are quoted on purpose: an unquoted empty NODES_TO_USE
	# disappears from the command line, which would shift $HOSTNAME into
	# the first (list) argument of exclude_items_from_list.
	NODES_TO_USE=$(exclude_items_from_list "$NODES_TO_USE" "$HOSTNAME")
}
33
# Print the run parameters as "NAME:value" lines framed by two banner lines.
# Arguments, in order: SERVER_FAILOVER_PERIOD DURATION MINSLEEP SLOW REQFAIL
#   SHARED_DIRECTORY END_RUN_FILE LOAD_PID_FILE VMSTAT_PID_FILE CLIENTCOUNT
#   MDTS OSTS
# Outputs: the report on stdout; a missing argument prints as an empty value.
run_info() {
	local -a names=(
		SERVER_FAILOVER_PERIOD
		DURATION
		MINSLEEP
		SLOW
		REQFAIL
		SHARED_DIRECTORY
		END_RUN_FILE
		LOAD_PID_FILE
		VMSTAT_PID_FILE
		CLIENTCOUNT
		MDTS
		OSTS
	)
	local pos

	echo "******* Run Information *******"
	# ${!pos} is the positional parameter whose number is held in pos
	for ((pos = 1; pos <= ${#names[@]}; pos++)); do
		echo "${names[pos - 1]}:${!pos}"
	done
	echo "*******************************"
}
52
# Print how many times a facet has been failed over so far.
# Arguments: $1 - facet name; the count is read from the variable named
#                 <facet>_numfailovers
# Outputs:   the count on stdout, or 0 when that variable is unset or empty
server_numfailovers () {
	local counter="${1}_numfailovers"

	echo "${!counter:-0}"
}
64
# Print one "<facet>: <count> times" line for every facet named in the
# comma-separated MDTS and OSTS lists, MDTS entries first.
# Globals: MDTS, OSTS (read)
servers_numfailovers () {
	local targets="${MDTS//,/ } ${OSTS//,/ }"
	local tgt
	local count

	# left unquoted so the space-separated list splits into facet names
	for tgt in $targets; do
		count=$(server_numfailovers $tgt)
		echo "$tgt: $count times"
	done
}
73
# Report the outcome of the run, stop the vmstat collectors and the client
# loads, gather logs on failure, and exit. failover_target registers this
# function through stack_trap for EXIT and INT.
# Globals read: END_RUN_FILE, DURATION, SERVER_FAILOVER_PERIOD, ELAPSED,
#               VMSTAT, VMSTAT_PID_FILE, NODES_TO_USE, LOAD_PID_FILE
# Exits with the status that was current on entry, or with 1 when
# END_RUN_FILE was already non-empty on entry.
summary_and_cleanup () {
	# must stay the first statement so that $? is still the entry status
	local rc=$?
	local result=PASS
	local failedclients

	# A non-empty END_RUN_FILE at this point can only have been written
	# by failed loads.
	if [[ -s "$END_RUN_FILE" ]]; then
		print_end_run_file "$END_RUN_FILE"
		rc=1
	fi

	echo $(date +'%F %H:%M:%S') Terminating clients loads ...
	# NOTE(review): presumably the loads watch this file and stop once it
	# exists -- confirm against the load scripts.
	echo "$0" >> "$END_RUN_FILE"

	if (( rc != 0 )); then
		result=FAIL
	fi

	log "Duration:               $DURATION seconds"
	log "Server failover period: $SERVER_FAILOVER_PERIOD seconds"
	log "Exited after:           $ELAPSED seconds"
	log "Number of failovers before exit: $(servers_numfailovers)"
	log "Status: $result"
	log "Return code: $rc"

	# stop vmstat on OSS nodes
	if [[ -n "$VMSTAT" ]]; then
		stop_process $(comma_list $(osts_nodes)) "$VMSTAT_PID_FILE"
	fi

	# stop the client loads
	stop_client_loads $NODES_TO_USE $LOAD_PID_FILE

	if (( rc != 0 )); then
		# Only the failed clients and the servers are of interest.
		# Drop the line this function appended above; -F and -- make
		# grep treat $0 as a literal string, so regex characters in
		# the script path (such as ".") cannot hide a client entry.
		failedclients=$(grep -vF -- "$0" "$END_RUN_FILE")

		# failedclients is left unquoted so each entry becomes its
		# own argument to comma_list
		gather_logs $(comma_list $(all_server_nodes) $failedclients)
	fi

	exit $rc
}
115
# Repeatedly fail over a randomly chosen server target while the client
# loads are running, until DURATION seconds have elapsed or END_RUN_FILE
# appears.
# Arguments: $1 - "MDS" (default) picks targets from MDTS; any other value
#                 picks them from OSTS
# Globals read:    MDTS, OSTS, VMSTAT, VMSTAT_PID_FILE, END_RUN_FILE,
#                  NODES_TO_USE, LOAD_PID_FILE, DURATION,
#                  SERVER_FAILOVER_PERIOD, MINSLEEP, REQFAIL, MAX_RECOV_TIME
# Globals written: ELAPSED, <facet>_numfailovers
# Never returns. It calls exit (which runs the summary_and_cleanup trap) with:
#   0 - the loop finished
#   1 - facet_failover failed
#   3 - the do_nodesv "cat $LOAD_PID_FILE" call failed
#   4 - check_client_loads failed before a failover
#   5 - check_client_loads failed after a failover
#   6 - more than REQFAIL iterations left less than MINSLEEP seconds of load
#   7 - wait_recovery_complete failed
failover_target() {
	local servers
	local serverfacet
	local var
	local flavor=${1:-"MDS"}

	if [[ "$flavor" == "MDS" ]]; then
		servers=$MDTS
	else
		servers=$OSTS
	fi

	stack_trap summary_and_cleanup EXIT INT

	# start vmstat on OSS nodes
	if [[ -n "$VMSTAT" ]]; then
		start_vmstat $(comma_list $(osts_nodes)) "$VMSTAT_PID_FILE"
	fi

	# start client loads
	rm -f "$END_RUN_FILE"
	start_client_loads $NODES_TO_USE

	echo client loads pids:
	do_nodesv $NODES_TO_USE "cat $LOAD_PID_FILE" || exit 3

	# ELAPSED is deliberately global: summary_and_cleanup reports it
	ELAPSED=0
	local sleep=0
	local reqfail=0
	local it_time_start=0
	local start_ts=$(date +%s)
	local current_ts=$start_ts

	while (( ELAPSED < DURATION )) && [[ ! -e "$END_RUN_FILE" ]]; do
		# In order to perform the
		# expected number of failovers, we need to account the
		# following:
		# 1) the time that has elapsed during the client load checking
		# 2) time takes for failover
		it_time_start=$(date +%s)

		serverfacet=$(get_random_entry $servers)
		var=${serverfacet}_numfailovers

		# Check that our client loads are still running. If any have
		# died, that means they have died outside of recovery, which
		# is unacceptable.
		log "==== Check clients loads BEFORE failover: failure NOT OK \
		     ELAPSED=$ELAPSED DURATION=$DURATION \
		     PERIOD=$SERVER_FAILOVER_PERIOD"
		check_client_loads $NODES_TO_USE || exit 4

		log "Wait $serverfacet recovery complete before next failover"
		if ! wait_recovery_complete $serverfacet; then
			echo "$serverfacet recovery is not completed!"
			exit 7
		fi

		# a client import that is not ready is only reported, not fatal
		log "Checking clients are in FULL or IDLE state before next \
		     failover"
		wait_clients_import_ready $NODES_TO_USE $serverfacet ||
			echo "Client import is not ready, please consider" \
			     "to increase SERVER_FAILOVER_PERIOD =" \
			     "$SERVER_FAILOVER_PERIOD!"

		log "Starting failover on $serverfacet"
		facet_failover "$serverfacet" || exit 1

		# Check that our client loads are still running during failover.
		# No application failures should occur.
		log "==== Check clients loads AFTER failover: failure NOT OK"
		if ! check_client_loads $NODES_TO_USE; then
			log "Client load failed during failover. Exiting..."
			exit 5
		fi

		# Increment the number of failovers. printf -v writes the
		# counter named by $var without eval and without leaking a
		# scratch variable into the global scope.
		printf -v "$var" '%s' "$(( ${!var:-0} + 1 ))"

		current_ts=$(date +%s)
		ELAPSED=$((current_ts - start_ts))

		# what is left of this period once the iteration's work is done
		sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))

		# Keep counting the number of iterations when
		# time spent to failover and two client loads check exceeded
		# the value ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
		if (( sleep < MINSLEEP )); then
			reqfail=$((reqfail + 1))

			# one argument per output line; "" prints a blank line
			printf '%s\n' \
				"WARNING: failover and two check_client_loads time" \
				"exceeded: SERVER_FAILOVER_PERIOD - MINSLEEP!" \
				"" \
				"Failed loading I/O for a min period of $MINSLEEP" \
				"$reqfail times (REQFAIL=$REQFAIL)." \
				"" \
				"This iteration, load only applied for sleep $sleep" \
				"seconds." \
				"" \
				"Estimated max recovery time: $MAX_RECOV_TIME" \
				"Probably hardware is taking excessively long time" \
				"to boot." \
				"" \
				"Try increasing SERVER_FAILOVER_PERIOD" \
				"(current is $SERVER_FAILOVER_PERIOD), bug 20918."

			if (( reqfail > REQFAIL )); then
				exit 6
			fi
		fi

		log "$serverfacet failed over ${!var} times, and counting..."

		# stop now if the pending sleep would reach DURATION anyway
		if (( (ELAPSED + sleep) >= DURATION )); then
			break
		fi

		if (( sleep > 0 )); then
			echo "sleeping $sleep seconds... "
			sleep $sleep
		fi
	done

	exit 0
}