3 # Test different combinations of failure modes
7 LUSTRE=${LUSTRE:-`dirname $0`/..}
8 . $LUSTRE/tests/test-framework.sh
12 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
15 # bug number for skipped test:
16 ALWAYS_EXCEPT="$INSANITY_EXCEPT"
17 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
19 if [ "$FAILURE_MODE" = "HARD" ]; then
20 skip_env "$TESTSUITE: is not functional with FAILURE_MODE = HARD, " \
21 "please use recovery-double-scale, bz20407"
24 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
27 CLEANUP=${CLEANUP:-""}
31 SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME}
32 LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT}
33 FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS}
35 assert_env mds_HOST MDSCOUNT
36 assert_env ost_HOST OSTCOUNT
37 assert_env LIVE_CLIENT FSNAME
39 require_dsh_mds || exit 0
40 require_dsh_ost || exit 0
42 # FAIL_CLIENTS list should not contain the LIVE_CLIENT
43 FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g")
46 TESTDIR=$DIR/d0.$TESTSUITE
49 # fail clients in round-robin order
51 # list of failable clients
52 FAIL_LIST=($FAIL_CLIENTS)
53 FAIL_NUM=${#FAIL_LIST[*]}
56 DOWN_NUM=0 # number of nodes currently down
58 # set next client to fail
60 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
61 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
62 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
68 log "Request fail clients: $num, to fail: $FAIL_NUM, failed: $DOWN_NUM"
69 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
70 num=$((FAIL_NUM - DOWN_NUM))
73 if [ -z "$num" ] || [ "$num" -le 0 ]; then
74 log "No clients failed!"
80 for i in `seq $num`; do
83 DOWN_CLIENTS="$DOWN_CLIENTS $client"
84 shutdown_client $client
87 echo "down clients: $DOWN_CLIENTS"
89 for client in $DOWN_CLIENTS; do
92 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
96 reintegrate_clients() {
97 for client in $DOWN_CLIENTS; do
99 echo "Restarting $client"
100 zconf_mount $client $MOUNT || return 1
108 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
112 start mds$1 $(mdsdevname $1) $MDS_MOUNT_OPTS
119 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
120 echo $DOWN_CLIENTS | grep -q $c && continue
121 $PDSH $c touch $TESTDIR/${c}_$file || return 1
127 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
128 $PDSH $c rm $TESTDIR/${c}_$file
133 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
134 echo "$c mkdir $TESTDIR/$c"
135 $PDSH $c "mkdir $TESTDIR/$c && ls -l $TESTDIR/$c"
140 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
141 echo "rmdir $TESTDIR/$c"
142 $PDSH $LIVE_CLIENT "rmdir $TESTDIR/$c"
146 clients_recover_osts() {
148 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
151 check_and_setup_lustre
157 for i in $(seq $MDSCOUNT) ; do
161 for i in $(seq $OSTCOUNT) ; do
166 run_test 0 "Fail all nodes, independently"
168 ############### First Failure Mode ###############
170 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
177 # prepare for MDS failover
187 echo "Reintegrating MDS2"
190 start_mdt 2 || return 2
193 start_mdt 1 || return $?
197 echo "Verify reintegration"
198 clients_up || return 1
200 run_test 1 "MDS/MDS failure"
201 ###################################################
203 ############### Second Failure Mode ###############
205 echo "Verify Lustre filesystem is up and running"
206 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
210 for i in $(seq $MDSCOUNT) ; do
214 # prepare for MDS failover
225 echo "Reintegrating OST"
228 start_ost 1 || return 2
230 for i in $(seq $MDSCOUNT) ; do
232 start_mdt $i || return $?
237 clients_recover_osts ost1
238 echo "Verify reintegration"
239 clients_up || return 1
242 run_test 2 "Second Failure Mode: MDS/OST `date`"
243 ###################################################
245 ############### Third Failure Mode ###############
248 echo "Verify Lustre filesystem is up and running"
249 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
252 for i in $(seq $MDSCOUNT) ; do
257 echo "Test Lustre stability after MDS failover"
261 echo "Failing 2 CLIENTS"
265 echo "Test Lustre stability after CLIENT failure"
269 echo "Reintegrating CLIENTS"
270 reintegrate_clients || return 1
272 clients_up || return 3
273 sleep 2 # give it a little time to fully recover before the next test
275 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
276 ###################################################
278 ############### Fourth Failure Mode ###############
280 echo "Fourth Failure Mode: OST/MDS `date`"
286 echo "Test Lustre stability after OST failure"
291 for i in $(seq $MDSCOUNT) ; do
295 # prepare for MDS failover
305 echo "Reintegrating OST"
310 for i in $(seq $MDSCOUNT) ; do
312 start_mdt $i || return $?
318 clients_recover_osts ost1
319 echo "Test Lustre stability after MDS failover"
320 clients_up || return 1
322 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
323 ###################################################
325 ############### Fifth Failure Mode ###############
327 [ $OSTCOUNT -lt 2 ] && skip_env "needs >= 2 OSTs"
329 echo "Fifth Failure Mode: OST/OST `date`"
332 echo "Verify Lustre filesystem is up and running"
333 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
341 echo "Test Lustre stability after OST failure"
351 echo "Test Lustre stability after OST failure"
357 echo "Reintegrating OSTs"
363 clients_recover_osts ost1
364 clients_recover_osts ost2
369 clients_up || return 2
371 run_test 5 "Fifth Failure Mode: OST/OST `date`"
372 ###################################################
374 ############### Sixth Failure Mode ###############
376 echo "Sixth Failure Mode: OST/CLIENT `date`"
379 echo "Verify Lustre filesystem is up and running"
380 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
383 client_touch testfile || return 2
390 echo "Test Lustre stability after OST failure"
397 echo "Failing CLIENTs"
401 echo "Test Lustre stability after CLIENTs failure"
408 echo "Reintegrating OST/CLIENTs"
411 reintegrate_clients || return 1
414 wait_remote_prog "stat -f" $((TIMEOUT * 3 + 20))
418 echo "Verifying mount"
419 [ -z "$(mounted_lustre_filesystems)" ] && return 3
422 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
423 ###################################################
425 ############### Seventh Failure Mode ###############
427 echo "Seventh Failure Mode: CLIENT/MDS `date`"
430 echo "Verify Lustre filesystem is up and running"
431 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
434 client_touch testfile || return 1
437 echo "Part 1: Failing CLIENT"
441 echo "Test Lustre stability after CLIENTs failure"
443 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
444 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
447 echo "Wait 1 minutes"
451 echo "Verify Lustre filesystem is up and running"
452 [ -z "$(mounted_lustre_filesystems)" ] && return 2
458 for i in $(seq $MDSCOUNT) ; do
462 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
463 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
466 echo "Reintegrating CLIENTs"
467 reintegrate_clients || return 2
471 echo "wait 1 minutes"
474 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
475 ###################################################
477 ############### Eighth Failure Mode ###############
479 echo "Eighth Failure Mode: CLIENT/OST `date`"
482 echo "Verify Lustre filesystem is up and running"
483 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
486 client_touch testfile
489 echo "Failing CLIENTs"
493 echo "Test Lustre stability after CLIENTs failure"
495 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
496 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
499 echo "Wait 1 minutes"
503 echo "Verify Lustre filesystem is up and running"
504 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
507 client_touch testfile
514 echo "Test Lustre stability after OST failure"
518 #non-failout hangs forever here
519 #$PDSH $LIVE_CLIENT "ls -l $TESTDIR"
520 #$PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
523 echo "Reintegrating CLIENTs/OST"
524 reintegrate_clients || return 3
528 clients_up || return 1
529 client_touch testfile2 || return 2
532 echo "Wait 1 minutes"
535 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
536 ###################################################
538 ############### Ninth Failure Mode ###############
541 echo "Verify Lustre filesystem is up and running"
542 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
545 client_touch testfile || return 1
548 echo "Failing CLIENTs"
552 echo "Test Lustre stability after CLIENTs failure"
554 $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 1
555 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 2
558 echo "Wait 1 minutes"
562 echo "Verify Lustre filesystem is up and running"
563 client_up $LIVE_CLIENT || return 3
564 client_touch testfile || return 4
567 echo "Failing CLIENTs"
571 echo "Test Lustre stability after CLIENTs failure"
573 $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 5
574 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 6
577 echo "Reintegrating CLIENTs/CLIENTs"
578 reintegrate_clients || return 7
582 echo "Wait 1 minutes"
585 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
586 ###################################################
588 ############### Tenth Failure Mode ###############
590 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
595 # prepare for MDS failover
605 echo "Reintegrating OST"
608 start_ost 1 || return 2
613 # prepare for MDS failover
618 start_mdt 1 || return $?
621 start_mdt 2 || return $?
625 clients_recover_osts ost1
626 echo "Verify reintegration"
627 clients_up || return 1
629 run_test 10 "Tenth Failure Mode: MDT0/OST/MDT1 `date`"
630 ###################################################
632 ############### Eleventh Failure Mode ###############
634 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
635 echo "Verify Lustre filesystem is up and running"
636 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
642 echo "Test Lustre stability after MDS failover"
646 echo "Failing 2 CLIENTS"
650 echo "Test Lustre stability after CLIENT failure"
654 echo "Reintegrating CLIENTS"
655 reintegrate_clients || return 1
659 clients_up || return 3
660 sleep 2 # give it a little time to fully recover before the next test
662 run_test 11 "Eleventh Failure Mode: MDS0/CLIENT/MDS1 `date`"
663 ###################################################
666 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
667 echo "Verify Lustre filesystem is up and running"
668 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
679 echo "Failing 2 CLIENTS"
683 echo "Test Lustre stability after CLIENT failure"
687 echo "Reintegrating CLIENTS"
688 reintegrate_clients || return 1
690 clients_up || return 3
691 sleep 2 # give it a little time to fully recover before the next test
693 run_test 12 "Twelve Failure Mode: MDS0,MDS1/OST0, OST1/CLIENTS `date`"
694 ###################################################
697 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
698 echo "Verify Lustre filesystem is up and running"
699 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
706 echo "Failing 2 CLIENTS"
710 echo "Test Lustre stability after CLIENT failure"
714 echo "Reintegrating CLIENTS"
715 reintegrate_clients || return 1
717 clients_up || return 3
718 sleep 2 # give it a little time to fully recover before the next test
722 clients_up || return 4
724 run_test 13 "Thirteen Failure Mode: MDS0,MDS1/CLIENTS/OST0,OST1 `date`"
725 ###################################################
728 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
729 echo "Verify Lustre filesystem is up and running"
730 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
737 echo "Failing 2 CLIENTS"
741 echo "Test Lustre stability after CLIENT failure"
745 echo "Reintegrating CLIENTS"
746 reintegrate_clients || return 1
748 clients_up || return 3
749 sleep 2 # give it a little time to fully recover before the next test
753 clients_up || return 4
755 run_test 14 "Fourteen Failure Mode: OST0,OST1/CLIENTS/MDS0,MDS1 `date`"
758 check_and_cleanup_lustre