2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree (default: the parent of this script's directory)
# and source the shared test framework from it.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
# Source the test configuration. ${CONFIG:=...} also assigns the default
# path to CONFIG when it is unset or empty, so later uses of $CONFIG see it.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Test number 10 plus whatever INSANITY_EXCEPT lists.
# NOTE(review): presumably read by the test framework to skip these tests — confirm.
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
# Names used for the setup/cleanup steps; both can be overridden from the
# environment.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
# Variables this script relies on. assert_env is not defined in this file
# (presumably provided by test-framework.sh and fails when one is unset — confirm).
20 assert_env mds_HOST MDS_MKFS_OPTS
21 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
22 assert_env LIVE_CLIENT FSNAME
25 # This can be a regexp, to allow more clients
# Default CLIENTS to the output of comma_list over the live, failable and
# extra client names; an existing non-empty CLIENTS value is kept.
26 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
31 # fail clients round robin
33 # list of failable clients
# FAIL_CLIENTS is word-split into the FAIL_LIST array; FAIL_NUM is the number
# of elements in that array.
34 FAIL_LIST=($FAIL_CLIENTS)
35 FAIL_NUM=${#FAIL_LIST[*]}
38 DOWN_NUM=0 # number of nodes currently down
40 # set next client to fail
42 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
43 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
44 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
49 if [ "$FAILURE_MODE" = HARD ]; then
51 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
52 echo "waiting for node $client to fail"
55 elif [ "$FAILURE_MODE" = SOFT ]; then
56 zconf_umount $client $MOUNT -f
62 if [ "$FAILURE_MODE" = HARD ]; then
69 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
70 num=$((FAIL_NUM - DOWN_NUM))
73 if [ -z "$num" ] || [ "$num" -le 0 ]; then
79 for i in `seq $num`; do
82 DOWN_CLIENTS="$DOWN_CLIENTS $client"
83 shutdown_client $client
86 echo "down clients: $DOWN_CLIENTS"
88 for client in $DOWN_CLIENTS; do
91 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# reintegrate_clients: bring back the clients recorded in DOWN_CLIENTS.
# For each name in DOWN_CLIENTS it prints "Restarting <client>" and calls
# zconf_mount for that client on $MOUNT; the first failed mount makes the
# function return 1 immediately.
# NOTE(review): the remainder of the body is not visible in this excerpt, so
# any bookkeeping done after the loop is not described here.
95 reintegrate_clients() {
96 for client in $DOWN_CLIENTS; do
98 echo "Restarting $client"
99 zconf_mount $client $MOUNT || return 1
106 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
115 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
116 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
120 zconf_umount $CLIENTS $MOUNT
129 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
130 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
131 $PDSH $c touch $MOUNT/${c}_$file || return 1
137 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
138 $PDSH $c rm $MOUNT/${c}_$file
143 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
144 echo "$c mkdir $MOUNT/$c"
145 $PDSH $c "mkdir $MOUNT/$c"
146 $PDSH $c "ls -l $MOUNT/$c"
151 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
152 echo "rmdir $MOUNT/$c"
153 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# clients_recover_osts: callers in this file pass a facet name (e.g. ost1).
# The only body line visible here is a commented-out do_node call that would
# have run an lctl "recover" command on $CLIENTS for the facet's OSC device.
# NOTE(review): as shown this performs no recovery action — confirm against
# the full function body.
157 clients_recover_osts() {
159 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
162 if [ "$ONLY" == "cleanup" ]; then
167 if [ ! -z "$EVAL" ]; then
174 if [ "$ONLY" == "setup" ]; then
178 # Nine different failure-mode combinations
179 echo "Starting Test 17 at `date`"
182 facet_failover $SINGLEMDS
183 echo "Waiting for df pid: $DFPID"
184 wait $DFPID || { echo "df returned $?" && return 1; }
186 facet_failover ost1 || return 4
187 echo "Waiting for df pid: $DFPID"
188 wait $DFPID || { echo "df returned $?" && return 2; }
190 facet_failover ost2 || return 5
191 echo "Waiting for df pid: $DFPID"
192 wait $DFPID || { echo "df returned $?" && return 3; }
195 run_test 0 "Fail all nodes, independently"
197 ############### First Failure Mode ###############
199 echo "Don't do a MDS - MDS Failure Case"
200 echo "This makes no sense"
202 run_test 1 "MDS/MDS failure"
203 ###################################################
205 ############### Second Failure Mode ###############
207 echo "Verify Lustre filesystem is up and running"
210 shutdown_facet $SINGLEMDS
211 reboot_facet $SINGLEMDS
213 # prepare for MDS failover
214 change_active $SINGLEMDS
215 reboot_facet $SINGLEMDS
223 echo "Reintegrating OST"
226 start_ost 1 || return 2
229 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
233 clients_recover_osts ost1
234 echo "Verify reintegration"
235 client_df || return 1
238 run_test 2 "Second Failure Mode: MDS/OST `date`"
239 ###################################################
242 ############### Third Failure Mode ###############
245 echo "Verify Lustre filesystem is up and running"
248 facet_failover $SINGLEMDS
249 wait $DFPID || echo df failed: $?
252 echo "Test Lustre stability after MDS failover"
256 echo "Failing 2 CLIENTS"
260 echo "Test Lustre stability after CLIENT failure"
264 echo "Reintegrating CLIENTS"
265 reintegrate_clients || return 1
267 client_df || return 3
268 sleep 2 # give it a little time for fully recovered before next test
# Hand test 3 and its title to run_test (not defined in this file). The
# backquoted date is expanded when this line executes, so the title carries
# the start time.
270 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
271 ###################################################
273 ############### Fourth Failure Mode ###############
275 echo "Fourth Failure Mode: OST/MDS `date`"
281 echo "Test Lustre stability after OST failure"
287 shutdown_facet $SINGLEMDS
288 reboot_facet $SINGLEMDS
290 # prepare for MDS failover
291 change_active $SINGLEMDS
292 reboot_facet $SINGLEMDS
299 echo "Reintegrating OST"
305 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
310 clients_recover_osts ost1
311 echo "Test Lustre stability after MDS failover"
312 client_df || return 1
314 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
315 ###################################################
317 ############### Fifth Failure Mode ###############
319 echo "Fifth Failure Mode: OST/OST `date`"
322 echo "Verify Lustre filesystem is up and running"
330 echo "Test Lustre stability after OST failure"
340 echo "Test Lustre stability after OST failure"
346 echo "Reintegrating OSTs"
352 clients_recover_osts ost1
353 clients_recover_osts ost2
358 client_df || return 2
360 run_test 5 "Fifth Failure Mode: OST/OST `date`"
361 ###################################################
363 ############### Sixth Failure Mode ###############
365 echo "Sixth Failure Mode: OST/CLIENT `date`"
368 echo "Verify Lustre filesystem is up and running"
369 client_df || return 1
370 client_touch testfile || return 2
377 echo "Test Lustre stability after OST failure"
383 echo "Failing CLIENTs"
387 echo "Test Lustre stability after CLIENTs failure"
393 echo "Reintegrating OST/CLIENTs"
401 echo "Verifying mount"
402 client_df || return 3
404 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
405 ###################################################
408 ############### Seventh Failure Mode ###############
410 echo "Seventh Failure Mode: CLIENT/MDS `date`"
413 echo "Verify Lustre filesystem is up and running"
415 client_touch testfile || return 1
418 echo "Part 1: Failing CLIENT"
422 echo "Test Lustre stability after CLIENTs failure"
424 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
425 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
428 echo "Wait 1 minutes"
432 echo "Verify Lustre filesystem is up and running"
437 facet_failover $SINGLEMDS
440 echo "Test Lustre stability after MDS failover"
441 wait $DFPID || echo "df on down clients fails " || return 1
442 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
443 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
446 echo "Reintegrating CLIENTs"
448 client_df || return 2
451 echo "wait 1 minutes"
454 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
455 ###################################################
458 ############### Eighth Failure Mode ###############
460 echo "Eighth Failure Mode: CLIENT/OST `date`"
463 echo "Verify Lustre filesystem is up and running"
465 client_touch testfile
468 echo "Failing CLIENTs"
472 echo "Test Lustre stability after CLIENTs failure"
474 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
475 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
478 echo "Wait 1 minutes"
482 echo "Verify Lustre filesystem is up and running"
484 client_touch testfile
492 echo "Test Lustre stability after OST failure"
496 #non-failout hangs forever here
497 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
498 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
501 echo "Reintegrating CLIENTs/OST"
506 client_df || return 1
507 client_touch testfile2 || return 2
510 echo "Wait 1 minutes"
513 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
514 ###################################################
517 ############### Ninth Failure Mode ###############
522 echo "Verify Lustre filesystem is up and running"
524 client_touch testfile || return 1
527 echo "Failing CLIENTs"
531 echo "Test Lustre stability after CLIENTs failure"
533 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
534 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
537 echo "Wait 1 minutes"
541 echo "Verify Lustre filesystem is up and running"
542 $PDSH $LIVE_CLIENT df $MOUNT || return 3
543 client_touch testfile || return 4
546 echo "Failing CLIENTs"
550 echo "Test Lustre stability after CLIENTs failure"
552 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
553 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
556 echo "Reintegrating CLIENTs/CLIENTs"
558 client_df || return 7
561 echo "Wait 1 minutes"
564 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
565 ###################################################
568 #Run availability after all failures
# NOTE(review): the default evaluates to 2 * 60 * 60 = 7200. If DURATION is in
# seconds that is 2 hours, not the 6 hours named in the test 10 title —
# confirm which value is intended.
569 DURATION=${DURATION:-$((2 * 60 * 60))} # default 2 * 60 * 60 (7200)
# LOADTEST is given a default here but is not passed on the command line below.
# NOTE(review): presumably read by availability.sh through the environment — confirm.
570 LOADTEST=${LOADTEST:-metadata-load.py}
# Run availability.sh from the current working directory with the config
# path, the duration and the client list; a non-zero exit returns 1.
571 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
573 run_test 10 "Running Availability for 6 hours..."
# Announce completion, prefixed with this script's base name.
575 equals_msg `basename $0`: test complete, cleaning up
# Print the suite log when the file exists. The trailing "|| true" keeps this
# line's status zero both when the file is absent and when cat fails.
577 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true