2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree: unless LUSTRE is already set, use the parent of the
# directory containing this script.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source the shared test framework from that tree into this shell.
7 . $LUSTRE/tests/test-framework.sh
# ${CONFIG:=...} assigns the default path to CONFIG when it is unset or empty,
# then the resulting file is sourced.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# "10" is always part of this list, followed by whatever INSANITY_EXCEPT holds.
# NOTE(review): presumably the framework skips the test numbers listed here - confirm.
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
# Default to "setup" / "cleanup" unless the caller overrides SETUP / CLEANUP.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
# assert_env is not defined in this file.
# NOTE(review): presumably it aborts when one of the named variables is unset - confirm.
20 assert_env mds_HOST MDS_MKFS_OPTS
21 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
22 assert_env LIVE_CLIENT FSNAME
25 # This can be a regexp, to allow more clients
# Unless CLIENTS is preset, build it by passing the live, failable and extra
# clients to comma_list (a helper not defined in this file).
26 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
31 # fail clients round robin
33 # list of failable clients
# The unquoted expansion is word-split, so each whitespace-separated name in
# FAIL_CLIENTS becomes one array element.
34 FAIL_LIST=($FAIL_CLIENTS)
# Element count of FAIL_LIST. It is later used as a modulus when advancing
# FAIL_NEXT, so an empty FAIL_CLIENTS (count 0) would make that arithmetic fail.
35 FAIL_NUM=${#FAIL_LIST[*]}
38 DOWN_NUM=0 # number of nodes currently down
40 # set next client to fail
42 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
43 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
44 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
49 if [ "$FAILURE_MODE" = HARD ]; then
51 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
52 echo "waiting for node $client to fail"
55 elif [ "$FAILURE_MODE" = SOFT ]; then
56 zconf_umount $client $MOUNT -f
62 if [ "$FAILURE_MODE" = HARD ]; then
69 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
70 num=$((FAIL_NUM - DOWN_NUM))
73 if [ -z "$num" ] || [ "$num" -le 0 ]; then
79 for i in `seq $num`; do
82 DOWN_CLIENTS="$DOWN_CLIENTS $client"
83 shutdown_client $client
86 echo "down clients: $DOWN_CLIENTS"
88 for client in $DOWN_CLIENTS; do
91 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
95 reintegrate_clients() {
96 for client in $DOWN_CLIENTS; do
98 echo "Restarting $client"
99 zconf_mount $client $MOUNT || return 1
106 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
115 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
116 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
120 zconf_umount $CLIENTS $MOUNT
129 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
130 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
131 $PDSH $c touch $MOUNT/${c}_$file || return 1
137 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
138 $PDSH $c rm $MOUNT/${c}_$file
143 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
144 echo "$c mkdir $MOUNT/$c"
145 $PDSH $c "mkdir $MOUNT/$c"
146 $PDSH $c "ls -l $MOUNT/$c"
151 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
152 echo "rmdir $MOUNT/$c"
153 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
157 clients_recover_osts() {
159 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
162 if [ "$ONLY" == "cleanup" ]; then
167 if [ ! -z "$EVAL" ]; then
174 if [ "$ONLY" == "setup" ]; then
178 # 9 Different Failure Mode Combinations
179 echo "Starting Test 17 at `date`"
182 facet_failover $SINGLEMDS
183 echo "Waiting for df pid: $DFPID"
184 wait $DFPID || { echo "df returned $?" && return 1; }
186 facet_failover ost1 || return 4
187 echo "Waiting for df pid: $DFPID"
188 wait $DFPID || { echo "df returned $?" && return 2; }
190 facet_failover ost2 || return 5
191 echo "Waiting for df pid: $DFPID"
192 wait $DFPID || { echo "df returned $?" && return 3; }
195 run_test 0 "Fail all nodes, independently"
197 ############### First Failure Mode ###############
199 echo "Don't do a MDS - MDS Failure Case"
200 echo "This makes no sense"
202 run_test 1 "MDS/MDS failure"
203 ###################################################
205 ############### Second Failure Mode ###############
207 echo "Verify Lustre filesystem is up and running"
210 shutdown_facet $SINGLEMDS
211 reboot_facet $SINGLEMDS
213 # prepare for MDS failover
214 change_active $SINGLEMDS
215 reboot_facet $SINGLEMDS
223 echo "Reintegrating OST"
226 start_ost 1 || return 2
229 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
233 clients_recover_osts ost1
234 echo "Verify reintegration"
235 client_df || return 1
238 run_test 2 "Second Failure Mode: MDS/OST `date`"
239 ###################################################
242 ############### Third Failure Mode ###############
245 echo "Verify Lustre filesystem is up and running"
248 facet_failover $SINGLEMDS
249 wait $DFPID || echo df failed: $?
252 echo "Test Lustre stability after MDS failover"
256 echo "Failing 2 CLIENTS"
260 echo "Test Lustre stability after CLIENT failure"
264 echo "Reintegrating CLIENTS"
265 reintegrate_clients || return 1
267 client_df || return 3
# Run test 3 through run_test, which is not defined in this file (presumably
# it comes from the sourced test framework). The `date` in the title is
# expanded when this line is evaluated, not when the test body runs.
269 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
270 ###################################################
272 ############### Fourth Failure Mode ###############
274 echo "Fourth Failure Mode: OST/MDS `date`"
280 echo "Test Lustre stability after OST failure"
286 shutdown_facet $SINGLEMDS
287 reboot_facet $SINGLEMDS
289 # prepare for MDS failover
290 change_active $SINGLEMDS
291 reboot_facet $SINGLEMDS
298 echo "Reintegrating OST"
304 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
309 clients_recover_osts ost1
310 echo "Test Lustre stability after MDS failover"
311 client_df || return 1
313 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
314 ###################################################
316 ############### Fifth Failure Mode ###############
318 echo "Fifth Failure Mode: OST/OST `date`"
321 echo "Verify Lustre filesystem is up and running"
329 echo "Test Lustre stability after OST failure"
339 echo "Test Lustre stability after OST failure"
345 echo "Reintegrating OSTs"
351 clients_recover_osts ost1
352 clients_recover_osts ost2
357 client_df || return 2
359 run_test 5 "Fifth Failure Mode: OST/OST `date`"
360 ###################################################
362 ############### Sixth Failure Mode ###############
364 echo "Sixth Failure Mode: OST/CLIENT `date`"
367 echo "Verify Lustre filesystem is up and running"
368 client_df || return 1
369 client_touch testfile || return 2
376 echo "Test Lustre stability after OST failure"
382 echo "Failing CLIENTs"
386 echo "Test Lustre stability after CLIENTs failure"
392 echo "Reintegrating OST/CLIENTs"
400 echo "Verifying mount"
401 client_df || return 3
403 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
404 ###################################################
407 ############### Seventh Failure Mode ###############
409 echo "Seventh Failure Mode: CLIENT/MDS `date`"
412 echo "Verify Lustre filesystem is up and running"
414 client_touch testfile || return 1
417 echo "Part 1: Failing CLIENT"
421 echo "Test Lustre stability after CLIENTs failure"
423 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
424 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
427 echo "Wait 1 minutes"
431 echo "Verify Lustre filesystem is up and running"
436 facet_failover $SINGLEMDS
439 echo "Test Lustre stability after MDS failover"
# Wait for the background job whose pid is in DFPID.
# NOTE(review): echo normally succeeds, so the trailing `|| return 1` is
# effectively never reached; a failed wait is only reported here, not treated
# as a test failure. Confirm whether that is intended.
440 wait $DFPID || echo "df on down clients fails " || return 1
441 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
442 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
445 echo "Reintegrating CLIENTs"
447 client_df || return 2
450 echo "wait 1 minutes"
453 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
454 ###################################################
457 ############### Eighth Failure Mode ###############
459 echo "Eighth Failure Mode: CLIENT/OST `date`"
462 echo "Verify Lustre filesystem is up and running"
464 client_touch testfile
467 echo "Failing CLIENTs"
471 echo "Test Lustre stability after CLIENTs failure"
473 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
474 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
477 echo "Wait 1 minutes"
481 echo "Verify Lustre filesystem is up and running"
483 client_touch testfile
491 echo "Test Lustre stability after OST failure"
495 #non-failout hangs forever here
496 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
497 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
500 echo "Reintegrating CLIENTs/OST"
505 client_df || return 1
506 client_touch testfile2 || return 2
509 echo "Wait 1 minutes"
512 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
513 ###################################################
516 ############### Ninth Failure Mode ###############
521 echo "Verify Lustre filesystem is up and running"
523 client_touch testfile || return 1
526 echo "Failing CLIENTs"
530 echo "Test Lustre stability after CLIENTs failure"
532 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
533 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
536 echo "Wait 1 minutes"
540 echo "Verify Lustre filesystem is up and running"
541 $PDSH $LIVE_CLIENT df $MOUNT || return 3
542 client_touch testfile || return 4
545 echo "Failing CLIENTs"
549 echo "Test Lustre stability after CLIENTs failure"
551 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
552 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
555 echo "Reintegrating CLIENTs/CLIENTs"
557 client_df || return 7
560 echo "Wait 1 minutes"
563 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
564 ###################################################
567 #Run availability after all failures
# The default is 2 * 60 * 60 = 7200; a DURATION set by the caller wins.
# NOTE(review): that is two hours if availability.sh takes seconds (confirm),
# while the run_test title for this test still says 6 hours.
568 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 * 60 * 60 by default, not 6 hours
# LOADTEST is only given a default here; it is not passed on the command line below.
569 LOADTEST=${LOADTEST:-metadata-load.py}
# Run availability.sh from the current working directory; a non-zero exit makes
# the enclosing function return 1.
570 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
572 run_test 10 "Running Availability for 6 hours..."
# Announce completion, tagged with this script's basename. equals_msg is not
# defined in this file.
574 equals_msg `basename $0`: test complete, cleaning up
# Print the suite log when it exists as a regular file; the trailing `|| true`
# forces a zero status whether the file is missing or cat fails.
576 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true