3 # Test different combinations of failure modes
7 LUSTRE=${LUSTRE:-$(dirname $0)/..}
8 . $LUSTRE/tests/test-framework.sh
12 # bug number for skipped test:
13 ALWAYS_EXCEPT="$INSANITY_EXCEPT"
14 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
18 if [ "$FAILURE_MODE" = "HARD" ]; then
19 skip_env "$TESTSUITE: is not functional with FAILURE_MODE = HARD, " \
20 "please use recovery-double-scale, bz20407"
23 SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME}
24 LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT}
25 FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS}
27 assert_env mds_HOST MDSCOUNT
28 assert_env ost_HOST OSTCOUNT
29 assert_env LIVE_CLIENT FSNAME
31 require_dsh_mds || exit 0
32 require_dsh_ost || exit 0
34 # FAIL_CLIENTS list should not contain the LIVE_CLIENT
35 FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g")
38 TESTDIR=$DIR/d0.$TESTSUITE
41 # fail clients round robin
43 # list of failable clients
44 FAIL_LIST=($FAIL_CLIENTS)
45 FAIL_NUM=${#FAIL_LIST[*]}
48 DOWN_NUM=0 # number of nodes currently down
50 # set next client to fail
52 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
53 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
54 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
60 log "Request fail clients: $num, to fail: $FAIL_NUM, failed: $DOWN_NUM"
61 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
62 num=$((FAIL_NUM - DOWN_NUM))
65 if [ -z "$num" ] || [ "$num" -le 0 ]; then
66 log "No clients failed!"
72 for i in `seq $num`; do
75 DOWN_CLIENTS="$DOWN_CLIENTS $client"
76 shutdown_client $client
79 echo "down clients: $DOWN_CLIENTS"
81 for client in $DOWN_CLIENTS; do
84 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
88 reintegrate_clients() {
89 for client in $DOWN_CLIENTS; do
91 echo "Restarting $client"
92 zconf_mount $client $MOUNT || return 1
100 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
104 start mds$1 $(mdsdevname $1) $MDS_MOUNT_OPTS
111 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
112 echo $DOWN_CLIENTS | grep -q $c && continue
113 $PDSH $c touch $TESTDIR/${c}_$file || return 1
119 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
120 $PDSH $c rm $TESTDIR/${c}_$file
125 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
126 echo "$c mkdir $TESTDIR/$c"
127 $PDSH $c "mkdir $TESTDIR/$c && ls -l $TESTDIR/$c"
132 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
133 echo "rmdir $TESTDIR/$c"
134 $PDSH $LIVE_CLIENT "rmdir $TESTDIR/$c"
138 clients_recover_osts() {
140 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
143 check_and_setup_lustre
149 for i in $(seq $MDSCOUNT) ; do
153 for i in $(seq $OSTCOUNT) ; do
158 run_test 0 "Fail all nodes, independently"
160 ############### First Failure Mode ###############
162 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
169 # prepare for MDS failover
179 echo "Reintegrating MDS2"
182 start_mdt 2 || return 2
185 start_mdt 1 || return $?
189 echo "Verify reintegration"
190 clients_up || return 1
192 run_test 1 "MDS/MDS failure"
193 ###################################################
195 ############### Second Failure Mode ###############
197 echo "Verify Lustre filesystem is up and running"
198 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
202 for i in $(seq $MDSCOUNT) ; do
206 # prepare for MDS failover
217 echo "Reintegrating OST"
220 start_ost 1 || return 2
222 for i in $(seq $MDSCOUNT) ; do
224 start_mdt $i || return $?
229 clients_recover_osts ost1
230 echo "Verify reintegration"
231 clients_up || return 1
234 run_test 2 "Second Failure Mode: MDS/OST `date`"
235 ###################################################
237 ############### Third Failure Mode ###############
240 echo "Verify Lustre filesystem is up and running"
241 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
244 for i in $(seq $MDSCOUNT) ; do
249 echo "Test Lustre stability after MDS failover"
253 echo "Failing 2 CLIENTS"
257 echo "Test Lustre stability after CLIENT failure"
261 echo "Reintegrating CLIENTS"
262 reintegrate_clients || return 1
264 clients_up || return 3
265 sleep 2 # give it a little time for fully recovered before next test
267 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
268 ###################################################
270 ############### Fourth Failure Mode ###############
272 echo "Fourth Failure Mode: OST/MDS `date`"
278 echo "Test Lustre stability after OST failure"
283 for i in $(seq $MDSCOUNT) ; do
287 # prepare for MDS failover
297 echo "Reintegrating OST"
302 for i in $(seq $MDSCOUNT) ; do
304 start_mdt $i || return $?
310 clients_recover_osts ost1
311 echo "Test Lustre stability after MDS failover"
312 clients_up || return 1
314 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
315 ###################################################
317 ############### Fifth Failure Mode ###############
319 [ $OSTCOUNT -lt 2 ] && skip_env "needs >= 2 OSTs"
321 echo "Fifth Failure Mode: OST/OST `date`"
324 echo "Verify Lustre filesystem is up and running"
325 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
333 echo "Test Lustre stability after OST failure"
343 echo "Test Lustre stability after OST failure"
349 echo "Reintegrating OSTs"
355 clients_recover_osts ost1
356 clients_recover_osts ost2
361 clients_up || return 2
363 run_test 5 "Fifth Failure Mode: OST/OST `date`"
364 ###################################################
366 ############### Sixth Failure Mode ###############
368 echo "Sixth Failure Mode: OST/CLIENT `date`"
371 echo "Verify Lustre filesystem is up and running"
372 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
375 client_touch testfile || return 2
382 echo "Test Lustre stability after OST failure"
389 echo "Failing CLIENTs"
393 echo "Test Lustre stability after CLIENTs failure"
400 echo "Reintegrating OST/CLIENTs"
403 reintegrate_clients || return 1
406 wait_remote_prog "stat -f" $((TIMEOUT * 3 + 20))
410 echo "Verifying mount"
411 [ -z "$(mounted_lustre_filesystems)" ] && return 3
414 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
415 ###################################################
417 ############### Seventh Failure Mode ###############
419 echo "Seventh Failure Mode: CLIENT/MDS `date`"
422 echo "Verify Lustre filesystem is up and running"
423 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
426 client_touch testfile || return 1
429 echo "Part 1: Failing CLIENT"
433 echo "Test Lustre stability after CLIENTs failure"
435 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
436 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
439 echo "Wait 1 minutes"
443 echo "Verify Lustre filesystem is up and running"
444 [ -z "$(mounted_lustre_filesystems)" ] && return 2
450 for i in $(seq $MDSCOUNT) ; do
454 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
455 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
458 echo "Reintegrating CLIENTs"
459 reintegrate_clients || return 2
463 echo "wait 1 minutes"
466 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
467 ###################################################
469 ############### Eighth Failure Mode ###############
471 echo "Eighth Failure Mode: CLIENT/OST `date`"
474 echo "Verify Lustre filesystem is up and running"
475 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
478 client_touch testfile
481 echo "Failing CLIENTs"
485 echo "Test Lustre stability after CLIENTs failure"
487 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
488 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
491 echo "Wait 1 minutes"
495 echo "Verify Lustre filesystem is up and running"
496 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
499 client_touch testfile
506 echo "Test Lustre stability after OST failure"
510 #non-failout hangs forever here
511 #$PDSH $LIVE_CLIENT "ls -l $TESTDIR"
512 #$PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
515 echo "Reintegrating CLIENTs/OST"
516 reintegrate_clients || return 3
520 clients_up || return 1
521 client_touch testfile2 || return 2
524 echo "Wait 1 minutes"
527 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
528 ###################################################
530 ############### Ninth Failure Mode ###############
533 echo "Verify Lustre filesystem is up and running"
534 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
537 client_touch testfile || return 1
540 echo "Failing CLIENTs"
544 echo "Test Lustre stability after CLIENTs failure"
546 $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 1
547 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 2
550 echo "Wait 1 minutes"
554 echo "Verify Lustre filesystem is up and running"
555 client_up $LIVE_CLIENT || return 3
556 client_touch testfile || return 4
559 echo "Failing CLIENTs"
563 echo "Test Lustre stability after CLIENTs failure"
565 $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 5
566 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 6
569 echo "Reintegrating CLIENTs/CLIENTs"
570 reintegrate_clients || return 7
574 echo "Wait 1 minutes"
577 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
578 ###################################################
580 ############### Tenth Failure Mode ###############
582 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
587 # prepare for MDS failover
597 echo "Reintegrating OST"
600 start_ost 1 || return 2
605 # prepare for MDS failover
610 start_mdt 1 || return $?
613 start_mdt 2 || return $?
617 clients_recover_osts ost1
618 echo "Verify reintegration"
619 clients_up || return 1
621 run_test 10 "Tenth Failure Mode: MDT0/OST/MDT1 `date`"
622 ###################################################
624 ############### Eleventh Failure Mode ###############
626 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
627 echo "Verify Lustre filesystem is up and running"
628 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
634 echo "Test Lustre stability after MDS failover"
638 echo "Failing 2 CLIENTS"
642 echo "Test Lustre stability after CLIENT failure"
646 echo "Reintegrating CLIENTS"
647 reintegrate_clients || return 1
651 clients_up || return 3
652 sleep 2 # give it a little time for fully recovered before next test
654 run_test 11 "Eleventh Failure Mode: MDS0/CLIENT/MDS1 `date`"
655 ###################################################
658 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
659 echo "Verify Lustre filesystem is up and running"
660 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
671 echo "Failing 2 CLIENTS"
675 echo "Test Lustre stability after CLIENT failure"
679 echo "Reintegrating CLIENTS"
680 reintegrate_clients || return 1
682 clients_up || return 3
683 sleep 2 # give it a little time for fully recovered before next test
685 run_test 12 "Twelve Failure Mode: MDS0,MDS1/OST0, OST1/CLIENTS `date`"
686 ###################################################
689 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
690 echo "Verify Lustre filesystem is up and running"
691 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
698 echo "Failing 2 CLIENTS"
702 echo "Test Lustre stability after CLIENT failure"
706 echo "Reintegrating CLIENTS"
707 reintegrate_clients || return 1
709 clients_up || return 3
710 sleep 2 # give it a little time for fully recovered before next test
714 clients_up || return 4
716 run_test 13 "Thirteen Failure Mode: MDS0,MDS1/CLIENTS/OST0,OST1 `date`"
717 ###################################################
720 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
721 echo "Verify Lustre filesystem is up and running"
722 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
729 echo "Failing 2 CLIENTS"
733 echo "Test Lustre stability after CLIENT failure"
737 echo "Reintegrating CLIENTS"
738 reintegrate_clients || return 1
740 clients_up || return 3
741 sleep 2 # give it a little time for fully recovered before next test
745 clients_up || return 4
747 run_test 14 "Fourteen Failure Mode: OST0,OST1/CLIENTS/MDS0,MDS1 `date`"
750 check_and_cleanup_lustre