2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree (defaults to the parent of this script's directory)
# and source the shared test framework from it.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
# Source the test configuration. The := expansion also assigns CONFIG its
# default (cfg/insanity-local.sh) when it is unset or empty.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Overridable settings: a value supplied by the caller's environment wins.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
17 UPCALL=${UPCALL:-DEFAULT}
# assert_env is not defined in this excerpt; presumably it aborts when any of
# the listed variables is unset -- confirm in test-framework.sh.
21 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
24 # Initialize all the ostN_HOST
# For every host named in EXTRA_OSTS: increment NUMOST and define the
# variable ${OST}_HOST (built with eval) to that host.
# NOTE(review): the assignment of OST and the closing done/fi are not visible
# in this excerpt.
26 if [ "$EXTRA_OSTS" ]; then
27 for host in $EXTRA_OSTS; do
28 NUMOST=$((NUMOST + 1))
30 eval ${OST}_HOST=$host
34 # This can be a regexp, to allow more clients
# Unless CLIENTS is already set, build it with comma_list from the live,
# failable and extra client names.
35 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
40 # fail clients round robin
42 # list of failable clients
# FAIL_CLIENTS is word-split into the FAIL_LIST array; FAIL_NUM is the number
# of elements in it.
43 FAIL_LIST=($FAIL_CLIENTS)
44 FAIL_NUM=${#FAIL_LIST[*]}
47 DOWN_NUM=0 # number of nodes currently down
49 # set next client to fail
# Select the client at index FAIL_NEXT, then advance the index modulo
# FAIL_NUM so that selection wraps around the list.
# NOTE(review): the modulo is a division by zero when FAIL_CLIENTS is empty
# (FAIL_NUM=0) -- confirm callers never reach this with no failable clients.
51 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
52 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
53 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# HARD failure mode: keep sending a single ping (3-second deadline) and loop
# for as long as the client still answers.
58 if [ "$FAILURE_MODE" = HARD ]; then
60 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
61 echo "waiting for node $client to fail"
# SOFT failure mode: unmount $MOUNT on the client, passing -f to zconf_umount.
64 elif [ "$FAILURE_MODE" = SOFT ]; then
65 zconf_umount $client $MOUNT -f
71 if [ "$FAILURE_MODE" = HARD ]; then
# Cap the requested count at the number of failable clients that are still up
# (FAIL_NUM - DOWN_NUM); an empty request takes that maximum.
78 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
79 num=$((FAIL_NUM - DOWN_NUM))
# Guard for an empty or non-positive count (branch body is outside this
# excerpt).
82 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# For each of the $num iterations: append the chosen client to DOWN_CLIENTS
# and shut it down.
88 for i in `seq $num`; do
91 DOWN_CLIENTS="$DOWN_CLIENTS $client"
92 shutdown_client $client
95 echo "down clients: $DOWN_CLIENTS"
97 for client in $DOWN_CLIENTS; do
# Recompute DOWN_NUM as the number of words in DOWN_CLIENTS.
100 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring back every client listed in DOWN_CLIENTS: wait for the host, announce
# the restart, then mount $MOUNT on it again.
# Returns 1 as soon as one zconf_mount fails.
104 reintegrate_clients() {
105 for client in $DOWN_CLIENTS; do
106 wait_for_host $client
107 echo "Restarting $client"
108 zconf_mount $client $MOUNT || return 1
# Declare the MDS with its device, size and journal size.
116 add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
# Declare an MDS failover entry only when mdsfailover_HOST is non-empty.
118 if [ ! -z "$mdsfailover_HOST" ]; then
119 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
# Declare LOV lov1 on mds using the configured stripe size and count, with
# stripe pattern 0.
122 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
123 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
# Declare ost1..ost$NUMOST in lov1. OSTDEV is used as a printf format string
# that receives the OST index to produce each device name.
124 for i in `seq $NUMOST`; do
125 dev=`printf $OSTDEV $i`
126 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
127 --journal-size $OSTJOURNALSIZE
# Declare the client entry for mds / lov1 with path $MOUNT.
131 add_client client mds --lov lov1 --path $MOUNT
# Start every OST (ost1..ost$NUMOST).
138 for i in `seq $NUMOST`; do
140 start ost$i ${REFORMAT} $OSTLCONFARGS
# Start the lctl debug daemon only when DAEMONFILE is set.
142 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
144 start mds $MDSLCONFARGS ${REFORMAT}
# Retry every 5 seconds until "ls -d $LUSTRE" succeeds through do_node on
# $CLIENTS.
145 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
# Mount on $CLIENTS unless the local /proc/mounts already lists $MOUNT.
# NOTE(review): grep runs without -q, so a matching line is printed, and the
# check only inspects the node running this script.
146 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Teardown: unmount the clients, then stop the MDS and each OST; "|| :"
# discards any stop failure.
151 zconf_umount $CLIENTS $MOUNT
153 stop mds ${FORCE} $MDSLCONFARGS || :
154 for i in `seq $NUMOST`; do
# NOTE(review): ${REFORMAT} is passed to stop here but not to "stop mds"
# above -- confirm this is intended.
155 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
# Touch $MOUNT/<client>_$file from each live/failable client, skipping any
# client found in DOWN_CLIENTS; a failed touch returns 1.
163 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# NOTE(review): the grep is an unanchored pattern match, so a client whose
# name is a substring of a down client's name (e.g. c1 vs c10) is skipped too.
164 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
165 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove each client's file from that same client; failures are not checked.
171 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
172 $PDSH $c rm $MOUNT/${c}_$file
# Each client creates, then lists, a directory named after itself in $MOUNT.
177 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
178 echo "$c mkdir $MOUNT/$c"
179 $PDSH $c "mkdir $MOUNT/$c"
180 $PDSH $c "ls -l $MOUNT/$c"
# The per-client directories are all removed from LIVE_CLIENT.
185 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
186 echo "rmdir $MOUNT/$c"
187 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# clients_recover_osts: the only body line visible in this excerpt is a
# commented-out do_node / lctl "recover" invocation.
191 clients_recover_osts() {
193 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Scan ost1..ost$NUMOST for the one whose ostN_HOST value (read through
# ${!ostvar} indirection) equals $node; the message below is printed when the
# lookup fails.
199 for i in `seq $NUMOST`; do
200 ostvar="ost${i}_HOST"
201 if [ "${!ostvar}" == $node ]; then
206 echo "No ost found for node; $node"
# Special-case ONLY=cleanup, a non-empty EVAL, and ONLY=setup.
# The bodies of these branches are outside this excerpt.
213 if [ "$ONLY" == "cleanup" ]; then
218 if [ ! -z "$EVAL" ]; then
225 if [ "$ONLY" == "setup" ]; then
229 # 9 Different Failure Modes Combinations
230 echo "Starting Test 17 at `date`"
# Three checkpoints wait on the background df PID. When wait fails, its exit
# status is printed and the test returns a distinct code (1, 2 or 3) that
# identifies the checkpoint.
235 echo "Waiting for df pid: $DFPID"
236 wait $DFPID || { echo "df returned $?" && return 1; }
240 echo "Waiting for df pid: $DFPID"
241 wait $DFPID || { echo "df returned $?" && return 2; }
245 echo "Waiting for df pid: $DFPID"
246 wait $DFPID || { echo "df returned $?" && return 3; }
249 run_test 0 "Fail all nodes, independently"
251 ############### First Failure Mode ###############
# Test 1 is a placeholder: the visible lines only print that an MDS/MDS
# double failure is not exercised.
253 echo "Don't do a MDS - MDS Failure Case"
254 echo "This makes no sense"
256 run_test 1 "MDS/MDS failure"
257 ###################################################
259 ############### Second Failure Mode ###############
# Test 2 (MDS/OST per its title). The failure-injection steps are outside this
# excerpt; the visible lines announce each phase and reintegrate ost1.
261 echo "Verify Lustre filesystem is up and running"
268 # prepare for MDS failover
279 echo "Reintegrating OST"
290 clients_recover_osts ost1
291 echo "Verify reintegration"
# The test returns 1 when df fails after reintegration.
292 client_df || return 1
295 run_test 2 "Second Failure Mode: MDS/OST `date`"
296 ###################################################
299 ############### Third Failure Mode ###############
# Test 3 (MDS/CLIENT per its title): an MDS failover followed by client
# failures, then client reintegration. The failure-injection steps are
# outside this excerpt.
302 echo "Verify Lustre filesystem is up and running"
# A failed background df is only reported here; it does not abort the test.
306 wait $DFPID || echo df failed: $?
309 echo "Test Lustre stability after MDS failover"
313 echo "Failing 2 CLIENTS"
317 echo "Test Lustre stability after CLIENT failure"
321 echo "Reintegrating CLIENTS"
# Returns 1 when the clients cannot be reintegrated, 3 when the final df
# fails.
322 reintegrate_clients || return 1
324 client_df || return 3
# Title typo fixed ("Thirdb" -> "Third") to match the other test titles.
326 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
327 ###################################################
329 ############### Fourth Failure Mode ###############
# Test 4 (OST/MDS per its title): ost1 is failed first, then the MDS is failed
# over, and ost1 is reintegrated. The failure-injection steps are outside this
# excerpt.
331 echo "Fourth Failure Mode: OST/MDS `date`"
334 echo "Failing OST ost1"
338 echo "Test Lustre stability after OST failure"
346 # prepare for MDS failover
355 echo "Reintegrating OST"
366 clients_recover_osts ost1
367 echo "Test Lustre stability after MDS failover"
# The test returns 1 when the final df fails.
368 client_df || return 1
370 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
371 ###################################################
373 ############### Fifth Failure Mode ###############
# Test 5 (OST/OST per its title): two stability checks are announced, one
# after each OST failure, before both OSTs are reintegrated. The
# failure-injection steps are outside this excerpt.
375 echo "Fifth Failure Mode: OST/OST `date`"
378 echo "Verify Lustre filesystem is up and running"
387 echo "Test Lustre stability after OST failure"
396 echo "Test Lustre stability after OST failure"
400 echo "Reintegrating OSTs"
406 clients_recover_osts ost1
407 clients_recover_osts ost2
# The test returns 2 when the final df fails.
410 client_df || return 2
412 run_test 5 "Fifth Failure Mode: OST/OST `date`"
413 ###################################################
415 ############### Sixth Failure Mode ###############
# Test 6 (OST/CLIENT per its title). Return codes: 1 = initial df failed,
# 2 = initial touch failed, 3 = df failed after reintegration. The
# failure-injection steps are outside this excerpt.
417 echo "Sixth Failure Mode: OST/CLIENT `date`"
420 echo "Verify Lustre filesystem is up and running"
421 client_df || return 1
422 client_touch testfile || return 2
430 echo "Test Lustre stability after OST failure"
434 echo "Failing CLIENTs"
438 echo "Test Lustre stability after CLIENTs failure"
442 echo "Reintegrating OST/CLIENTs"
448 echo "Verifying mount"
449 client_df || return 3
451 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
452 ###################################################
455 ############### Seventh Failure Mode ###############
# Test 7 (CLIENT/MDS per its title): clients are failed first, then the MDS is
# failed over. The failure-injection steps are outside this excerpt.
457 echo "Seventh Failure Mode: CLIENT/MDS `date`"
460 echo "Verify Lustre filesystem is up and running"
462 client_touch testfile || return 1
465 echo "Part 1: Failing CLIENT"
469 echo "Test Lustre stability after CLIENTs failure"
# From the surviving client: list the mount and delete every *_testfile.
471 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
472 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
475 echo "Wait 1 minutes"
479 echo "Verify Lustre filesystem is up and running"
488 echo "Test Lustre stability after MDS failover"
# NOTE(review): the trailing "|| return 1" is effectively unreachable, because
# the echo before it succeeds; a failed wait is therefore only reported.
# Confirm whether a df failure was meant to abort the test here.
489 wait $DFPID || echo "df on down clients fails " || return 1
490 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
491 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
494 echo "Reintegrating CLIENTs"
# The test returns 2 when df fails after the clients are reintegrated.
496 client_df || return 2
499 echo "wait 1 minutes"
502 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
503 ###################################################
506 ############### Eighth Failure Mode ###############
# Test 8 (CLIENT/OST per its title): clients are failed first, then an OST.
# The failure-injection steps are outside this excerpt.
508 echo "Eighth Failure Mode: CLIENT/OST `date`"
511 echo "Verify Lustre filesystem is up and running"
# NOTE(review): unlike the other tests, the result of this client_touch (and
# of the one further down) is not checked -- confirm that is deliberate.
513 client_touch testfile
516 echo "Failing CLIENTs"
520 echo "Test Lustre stability after CLIENTs failure"
# From the surviving client: list the mount and delete every *_testfile.
522 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
523 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
526 echo "Wait 1 minutes"
530 echo "Verify Lustre filesystem is up and running"
532 client_touch testfile
541 echo "Test Lustre stability after OST failure"
543 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
544 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
547 echo "Reintegrating CLIENTs/OST"
# After reintegration: return 1 when df fails, 2 when touching testfile2
# fails.
551 client_df || return 1
552 client_touch testfile2 || return 2
555 echo "Wait 1 minutes"
558 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
559 ###################################################
562 ############### Ninth Failure Mode ###############
# Test 9 (CLIENT/CLIENT per its title): two rounds of client failures, each
# followed by checks from the surviving client, then reintegration. Each
# check returns its own code so the failing step can be identified.
# NOTE(review): code 1 is used twice (initial touch and first "ls -l").
567 echo "Verify Lustre filesystem is up and running"
569 client_touch testfile || return 1
572 echo "Failing CLIENTs"
576 echo "Test Lustre stability after CLIENTs failure"
578 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
579 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
582 echo "Wait 1 minutes"
586 echo "Verify Lustre filesystem is up and running"
587 $PDSH $LIVE_CLIENT df $MOUNT || return 3
588 client_touch testfile || return 4
591 echo "Failing CLIENTs"
595 echo "Test Lustre stability after CLIENTs failure"
597 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
598 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
601 echo "Reintegrating CLIENTs/CLIENTs"
603 client_df || return 7
606 echo "Wait 1 minutes"
609 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
610 ###################################################
613 #Run availability after all failures
# Default is 2 * 60 * 60 = 7200, i.e. 2 hours if DURATION is in seconds
# (unit not shown in this excerpt).
# NOTE(review): an earlier comment and the run_test title below say "6 hours",
# which does not match this default -- confirm which value is intended.
614 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default
615 LOADTEST=${LOADTEST:-metadata-load.py}
# availability.sh is looked up in the current working directory ($PWD); a
# failure of that script makes the test return 1.
616 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
618 run_test 10 "Running Availability for 6 hours..."
620 equals_msg "Done, cleaning up"