2 # Test multiple failures, AKA Test 17
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
17 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
20 # Initialize all the ostN_HOST
22 if [ "$EXTRA_OSTS" ]; then
23 for host in $EXTRA_OSTS; do
24 NUMOST=$((NUMOST + 1))
26 eval ${OST}_HOST=$host
30 # This can be a regexp, to allow more clients
31 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
36 # fail clients round robin
38 # list of failable clients
39 FAIL_LIST=($FAIL_CLIENTS)
40 FAIL_NUM=${#FAIL_LIST[*]}
43 DOWN_NUM=0 # number of nodes currently down
45 # set next client to fail
47 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
48 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
49 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
54 if [ "$FAILURE_MODE" = HARD ]; then
56 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
57 echo "waiting for node $client to fail"
60 elif [ "$FAILURE_MODE" = SOFT ]; then
61 zconf_umount $client $MOUNT -f
67 if [ "$FAILURE_MODE" = HARD ]; then
74 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
75 num=$((FAIL_NUM - DOWN_NUM))
78 if [ -z "$num" ] || [ "$num" -le 0 ]; then
84 for i in `seq $num`; do
87 DOWN_CLIENTS="$DOWN_CLIENTS $client"
88 shutdown_client $client
91 echo "down clients: $DOWN_CLIENTS"
93 for client in $DOWN_CLIENTS; do
96 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# For each client listed in $DOWN_CLIENTS: call wait_for_host, log a
# "Restarting" message, then call zconf_mount with $MOUNT.
# wait_for_host and zconf_mount are helpers defined outside this excerpt.
100 reintegrate_clients() {
101 for client in $DOWN_CLIENTS; do
102 wait_for_host $client
103 echo "Restarting $client"
# Stop at the first client whose zconf_mount fails and return status 1.
104 zconf_mount $client $MOUNT || return 1
112 add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
114 if [ ! -z "$mdsfailover_HOST" ]; then
115 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
118 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
119 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
120 for i in `seq $NUMOST`; do
121 dev=`printf $OSTDEV $i`
122 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
123 --journal-size $OSTJOURNALSIZE
127 add_client client mds --lov lov1 --path $MOUNT
132 for i in `seq $NUMOST`; do
134 start ost$i ${REFORMAT} $OSTLCONFARGS
136 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
138 start mds $MDSLCONFARGS ${REFORMAT}
139 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
140 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
145 zconf_umount $CLIENTS $MOUNT
147 stop mds ${FORCE} $MDSLCONFARGS || :
148 for i in `seq $NUMOST`; do
149 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
157 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# Skip clients that are currently down. Match whole, space-delimited names
# as fixed strings so "node1" is not skipped merely because "node10" is
# down, and so "." in a hostname is not treated as a regex wildcard.
158 if echo " $DOWN_CLIENTS " | grep -qF -- " $c "; then continue; fi
159 $PDSH $c touch $MOUNT/${c}_$file || return 1
165 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
166 $PDSH $c rm $MOUNT/${c}_$file
171 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
172 echo "$c mkdir $MOUNT/$c"
173 $PDSH $c "mkdir $MOUNT/$c"
174 $PDSH $c "ls -l $MOUNT/$c"
179 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
180 echo "rmdir $MOUNT/$c"
181 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
185 clients_recover_osts() {
187 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
193 for i in `seq $NUMOST`; do
194 ostvar="ost${i}_HOST"
195 if [ "${!ostvar}" == $node ]; then
200 echo "No ost found for node; $node"
207 if [ "$ONLY" == "cleanup" ]; then
212 if [ -z "$NOSETUP" ]; then
217 if [ ! -z "$EVAL" ]; then
222 if [ "$ONLY" == "setup" ]; then
226 # Nine different failure-mode combinations
227 echo "Starting Test 17 at `date`"
232 echo "Waiting for df pid: $DFPID"
233 wait $DFPID || return 1
237 echo "Waiting for df pid: $DFPID"
238 wait $DFPID || return 2
242 echo "Waiting for df pid: $DFPID"
243 wait $DFPID || return 3
246 run_test 0 "Fail all nodes, independently"
248 ############### First Failure Mode ###############
250 echo "Don't do a MDS - MDS Failure Case"
251 echo "This makes no sense"
253 run_test 1 "MDS/MDS failure"
254 ###################################################
256 ############### Second Failure Mode ###############
258 echo "Verify Lustre filesystem is up and running"
265 # prepare for MDS failover
276 echo "Reintegrating OST"
287 clients_recover_osts ost1
288 echo "Verify reintegration"
289 client_df || return 1
292 run_test 2 "Second Failure Mode: MDS/OST `date`"
293 ###################################################
296 ############### Third Failure Mode ###############
299 echo "Verify Lustre filesystem is up and running"
303 wait $DFPID || echo df failed: $?
306 echo "Test Lustre stability after MDS failover"
310 echo "Failing 2 CLIENTS"
314 echo "Test Lustre stability after CLIENT failure"
318 echo "Reintegrating CLIENTS"
319 reintegrate_clients || return 1
321 client_df || return 3
# Title for test 3; the embedded `date` is expanded when this line runs.
323 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
324 ###################################################
326 ############### Fourth Failure Mode ###############
328 echo "Fourth Failure Mode: OST/MDS `date`"
331 echo "Failing OST ost1"
335 echo "Test Lustre stability after OST failure"
343 # prepare for MDS failover
352 echo "Reintegrating OST"
363 clients_recover_osts ost1
364 echo "Test Lustre stability after MDS failover"
365 client_df || return 1
367 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
368 ###################################################
370 ############### Fifth Failure Mode ###############
372 echo "Fifth Failure Mode: OST/OST `date`"
375 echo "Verify Lustre filesystem is up and running"
384 echo "Test Lustre stability after OST failure"
393 echo "Test Lustre stability after OST failure"
397 echo "Reintegrating OSTs"
403 clients_recover_osts ost1
404 clients_recover_osts ost2
407 client_df || return 2
409 run_test 5 "Fifth Failure Mode: OST/OST `date`"
410 ###################################################
412 ############### Sixth Failure Mode ###############
414 echo "Sixth Failure Mode: OST/CLIENT `date`"
417 echo "Verify Lustre filesystem is up and running"
418 client_df || return 1
419 client_touch testfile || return 2
427 echo "Test Lustre stability after OST failure"
431 echo "Failing CLIENTs"
435 echo "Test Lustre stability after CLIENTs failure"
439 echo "Reintegrating OST/CLIENTs"
445 echo "Verifying mount"
446 client_df || return 3
448 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
449 ###################################################
452 ############### Seventh Failure Mode ###############
454 echo "Seventh Failure Mode: CLIENT/MDS `date`"
457 echo "Verify Lustre filesystem is up and running"
459 client_touch testfile || return 1
462 echo "Part 1: Failing CLIENT"
466 echo "Test Lustre stability after CLIENTs failure"
468 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
469 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
472 echo "Wait 1 minutes"
476 echo "Verify Lustre filesystem is up and running"
485 echo "Test Lustre stability after MDS failover"
# NOTE(review): echo normally succeeds, so the trailing "|| return 1" is not
# reached; a failed wait is only reported through the message below.
486 wait $DFPID || echo "df on down clients fails " || return 1
487 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
488 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
491 echo "Reintegrating CLIENTs"
493 client_df || return 2
496 echo "wait 1 minutes"
499 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
500 ###################################################
503 ############### Eighth Failure Mode ###############
505 echo "Eighth Failure Mode: CLIENT/OST `date`"
508 echo "Verify Lustre filesystem is up and running"
510 client_touch testfile
513 echo "Failing CLIENTs"
517 echo "Test Lustre stability after CLIENTs failure"
519 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
520 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
523 echo "Wait 1 minutes"
527 echo "Verify Lustre filesystem is up and running"
529 client_touch testfile
538 echo "Test Lustre stability after OST failure"
540 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
541 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
544 echo "Reintegrating CLIENTs/OST"
548 client_df || return 1
549 client_touch testfile2 || return 2
552 echo "Wait 1 minutes"
555 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
556 ###################################################
559 ############### Ninth Failure Mode ###############
564 echo "Verify Lustre filesystem is up and running"
566 client_touch testfile || return 1
569 echo "Failing CLIENTs"
573 echo "Test Lustre stability after CLIENTs failure"
575 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
576 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
579 echo "Wait 1 minutes"
583 echo "Verify Lustre filesystem is up and running"
584 $PDSH $LIVE_CLIENT df $MOUNT || return 3
585 client_touch testfile || return 4
588 echo "Failing CLIENTs"
592 echo "Test Lustre stability after CLIENTs failure"
594 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
595 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
598 echo "Reintegrating CLIENTs/CLIENTs"
600 client_df || return 7
603 echo "Wait 1 minutes"
606 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
607 ###################################################
610 #Run availability after all failures
611 DURATION=${DURATION:-$((2 * 60 * 60))} # NOTE(review): default is 2 * 60 * 60 (2 hours if the unit is seconds), not 6 hours; confirm intended value
612 LOADTEST=${LOADTEST:-metadata-load.py}
613 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
615 run_test 10 "Running Availability for 6 hours..."
617 equals_msg "Done, cleaning up"