2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script and source the shared test
# harness plus the (caller-overridable) cluster configuration file.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Hooks the caller may override to replace or skip setup/teardown.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
# Fail fast if the config did not supply the required host variables.
20 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
23 # Initialize all the ostN_HOST
# NOTE(review): lines here are non-contiguous (gaps in the original
# numbering). The loop presumably sets OST=ost$NUMOST in an elided line
# before the eval, and the matching done/fi are elided too — confirm.
25 if [ "$EXTRA_OSTS" ]; then
26 for host in $EXTRA_OSTS; do
27 NUMOST=$((NUMOST + 1))
29 eval ${OST}_HOST=$host
33 # This can be a regexp, to allow more clients
34 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
39 # fail clients round robin
41 # list of failable clients
# FAIL_LIST: bash array of clients eligible to be failed; FAIL_NUM its size.
42 FAIL_LIST=($FAIL_CLIENTS)
43 FAIL_NUM=${#FAIL_LIST[*]}
46 DOWN_NUM=0 # number of nodes currently down
48 # set next client to fail
# Round-robin selection: take the FAIL_NEXT'th client, then advance the
# cursor modulo the list size so repeated calls cycle through all clients.
50 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
51 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
52 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Fragment of the single-client failure helper (the enclosing function header
# is in elided lines). HARD mode powers the node off and spins until it stops
# answering pings; SOFT mode just force-unmounts Lustre on the client.
57 if [ "$FAILURE_MODE" = HARD ]; then
59 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
60 echo "waiting for node $client to fail"
63 elif [ "$FAILURE_MODE" = SOFT ]; then
64 zconf_umount $client $MOUNT -f
# fail_clients fragment: shut down up to $num not-yet-failed clients.
# Clamp num to the number of clients still up (FAIL_NUM - DOWN_NUM).
70 if [ "$FAILURE_MODE" = HARD ]; then
77 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
78 num=$((FAIL_NUM - DOWN_NUM))
# Nothing to do if no count was requested or no failable clients remain.
81 if [ -z "$num" ] || [ "$num" -le 0 ]; then
87 for i in `seq $num`; do
# Record each victim in DOWN_CLIENTS before shutting it down.
90 DOWN_CLIENTS="$DOWN_CLIENTS $client"
91 shutdown_client $client
94 echo "down clients: $DOWN_CLIENTS"
96 for client in $DOWN_CLIENTS; do
# Recompute DOWN_NUM from the accumulated list instead of trusting a counter.
99 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring every previously failed client back: wait for the host to boot,
# then remount Lustre on it; a failed mount aborts reintegration.
103 reintegrate_clients() {
104 for client in $DOWN_CLIENTS; do
105 wait_for_host $client
106 echo "Restarting $client"
107 zconf_mount $client $MOUNT || return 1
# Configuration-generation fragment: one MDS (with optional failover host),
# a LOV striped across NUMOST OSTs, and a client mount point.
115 add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
117 if [ ! -z "$mdsfailover_HOST" ]; then
118 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
121 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
122 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
123 for i in `seq $NUMOST`; do
# OSTDEV is used as a printf template (e.g. "/tmp/ost%d-...") — expand the
# per-index device name; presumably it contains exactly one %d — confirm.
124 dev=`printf $OSTDEV $i`
125 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
126 --journal-size $OSTJOURNALSIZE
130 add_client client mds --lov lov1 --path $MOUNT
# Setup fragment: start every OST, optionally start the debug daemon, start
# the MDS, then poll until all clients can see the Lustre tree and mount it.
137 for i in `seq $NUMOST`; do
139 start ost$i ${REFORMAT} $OSTLCONFARGS
141 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
143 start mds $MDSLCONFARGS ${REFORMAT}
144 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
# Only mount if $MOUNT is not already in /proc/mounts.
145 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Cleanup fragment: unmount clients, stop MDS then OSTs; "|| :" keeps the
# teardown going even when an individual stop fails.
150 zconf_umount $CLIENTS $MOUNT
152 stop mds ${FORCE} $MDSLCONFARGS || :
153 for i in `seq $NUMOST`; do
154 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
# client_touch fragment: every client creates its own marker file under
# $MOUNT; clients currently listed in DOWN_CLIENTS are skipped.
162 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
163 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
164 $PDSH $c touch $MOUNT/${c}_$file || return 1
# client_rm fragment: each client removes its marker file (best effort —
# no error check, presumably intentional during failover; confirm).
170 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
171 $PDSH $c rm $MOUNT/${c}_$file
# client_mkdirs fragment: each client creates and lists its own directory.
176 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
177 echo "$c mkdir $MOUNT/$c"
178 $PDSH $c "mkdir $MOUNT/$c"
179 $PDSH $c "ls -l $MOUNT/$c"
# client_rmdirs fragment: the live client removes every per-client directory
# (it may be the only node still up at this point).
184 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
185 echo "rmdir $MOUNT/$c"
186 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Map a node name back to its ost index so clients can trigger OSC recovery
# for that OST; the commented line shows the lctl recover invocation used.
190 clients_recover_osts() {
192 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Scan ost1_HOST..ostN_HOST for $node using bash indirect expansion ${!var}.
198 for i in `seq $NUMOST`; do
199 ostvar="ost${i}_HOST"
200 if [ "${!ostvar}" == $node ]; then
# Diagnostic for an unmatched node; fixed "node;" -> "node:" typo.
205 echo "No ost found for node: $node"
# Command-line shortcuts: ONLY=cleanup tears down and exits; EVAL runs an
# arbitrary command in this environment; ONLY=setup stops after setup.
212 if [ "$ONLY" == "cleanup" ]; then
217 if [ ! -z "$EVAL" ]; then
224 if [ "$ONLY" == "setup" ]; then
228 # 9 Different Failure Modes Combinations
229 echo "Starting Test 17 at `date`"
# test_0 fragment: after each facet failure a background df was started and
# its pid stored in DFPID; wait for it and fail the test if df failed.
#
# BUG FIX: the original `wait $DFPID || echo "..." && return N` parses as
# `(wait || echo) && return N` because && and || have equal precedence and
# associate left-to-right — so `return N` ran even when wait SUCCEEDED
# (and always ran, since echo succeeds). Group the failure path in braces.
234 echo "Waiting for df pid: $DFPID"
235 wait $DFPID || { echo "df returned $?"; return 1; }
239 echo "Waiting for df pid: $DFPID"
240 wait $DFPID || { echo "df returned $?"; return 2; }
244 echo "Waiting for df pid: $DFPID"
245 wait $DFPID || { echo "df returned $?"; return 3; }
248 run_test 0 "Fail all nodes, independently"
250 ############### First Failure Mode ###############
# Test 1 is a deliberate no-op: MDS/MDS double failure is meaningless with a
# single MDS.
252 echo "Don't do a MDS - MDS Failure Case"
253 echo "This makes no sense"
255 run_test 1 "MDS/MDS failure"
256 ###################################################
258 ############### Second Failure Mode ###############
# Test 2 fragment: fail the MDS then an OST, recover both, verify the mount.
260 echo "Verify Lustre filesystem is up and running"
267 # prepare for MDS failover
278 echo "Reintegrating OST"
289 clients_recover_osts ost1
290 echo "Verify reintegration"
291 client_df || return 1
294 run_test 2 "Second Failure Mode: MDS/OST `date`"
295 ###################################################
298 ############### Third Failure Mode ###############
# Test 3 fragment: fail the MDS, then two clients; reintegrate and verify.
301 echo "Verify Lustre filesystem is up and running"
# NOTE(review): failure here is only logged, not fatal — presumably df is
# allowed to fail transiently during MDS failover; confirm intent.
305 wait $DFPID || echo df failed: $?
308 echo "Test Lustre stability after MDS failover"
312 echo "Failing 2 CLIENTS"
316 echo "Test Lustre stability after CLIENT failure"
320 echo "Reintegrating CLIENTS"
321 reintegrate_clients || return 1
323 client_df || return 3
# Fixed "Thirdb" -> "Third" typo in the test description.
325 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
326 ###################################################
328 ############### Fourth Failure Mode ###############
# Test 4 fragment: mirror of test 2 with the order reversed — fail an OST
# first, then the MDS; recover and verify the client mount.
330 echo "Fourth Failure Mode: OST/MDS `date`"
333 echo "Failing OST ost1"
337 echo "Test Lustre stability after OST failure"
345 # prepare for MDS failover
354 echo "Reintegrating OST"
365 clients_recover_osts ost1
366 echo "Test Lustre stability after MDS failover"
367 client_df || return 1
369 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
370 ###################################################
372 ############### Fifth Failure Mode ###############
# Test 5 fragment: fail two OSTs in sequence, reintegrate both, verify.
374 echo "Fifth Failure Mode: OST/OST `date`"
377 echo "Verify Lustre filesystem is up and running"
386 echo "Test Lustre stability after OST failure"
395 echo "Test Lustre stability after OST failure"
399 echo "Reintegrating OSTs"
405 clients_recover_osts ost1
406 clients_recover_osts ost2
409 client_df || return 2
411 run_test 5 "Fifth Failure Mode: OST/OST `date`"
412 ###################################################
414 ############### Sixth Failure Mode ###############
# Test 6 fragment: fail an OST, then some clients; reintegrate both sides
# and confirm the filesystem is still mountable and usable.
416 echo "Sixth Failure Mode: OST/CLIENT `date`"
419 echo "Verify Lustre filesystem is up and running"
420 client_df || return 1
421 client_touch testfile || return 2
429 echo "Test Lustre stability after OST failure"
433 echo "Failing CLIENTs"
437 echo "Test Lustre stability after CLIENTs failure"
441 echo "Reintegrating OST/CLIENTs"
447 echo "Verifying mount"
448 client_df || return 3
450 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
451 ###################################################
454 ############### Seventh Failure Mode ###############
# Test 7 fragment: fail clients first, then the MDS; the surviving live
# client must stay able to list and clean up files throughout.
456 echo "Seventh Failure Mode: CLIENT/MDS `date`"
459 echo "Verify Lustre filesystem is up and running"
461 client_touch testfile || return 1
464 echo "Part 1: Failing CLIENT"
468 echo "Test Lustre stability after CLIENTs failure"
470 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
471 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
474 echo "Wait 1 minutes"
478 echo "Verify Lustre filesystem is up and running"
487 echo "Test Lustre stability after MDS failover"
# BUG FIX: the original `wait $DFPID || echo "..." || return 1` could never
# return — when wait failed, the echo succeeded, so the second || never
# fired. Group the failure path so the message is printed AND the test fails.
488 wait $DFPID || { echo "df on down clients fails "; return 1; }
# Test 7 tail: confirm the live client still works after MDS failover.
489 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
490 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
# Bring the failed clients back and verify the mount works everywhere.
493 echo "Reintegrating CLIENTs"
495 client_df || return 2
# Settle time before the next failure mode.
498 echo "wait 1 minutes"
501 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
502 ###################################################
505 ############### Eighth Failure Mode ###############
# Test 8 fragment: fail clients, then an OST; reintegrate and verify by
# creating fresh files afterwards.
507 echo "Eighth Failure Mode: CLIENT/OST `date`"
510 echo "Verify Lustre filesystem is up and running"
512 client_touch testfile
515 echo "Failing CLIENTs"
519 echo "Test Lustre stability after CLIENTs failure"
521 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
522 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
525 echo "Wait 1 minutes"
529 echo "Verify Lustre filesystem is up and running"
531 client_touch testfile
540 echo "Test Lustre stability after OST failure"
542 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
543 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
546 echo "Reintegrating CLIENTs/OST"
550 client_df || return 1
551 client_touch testfile2 || return 2
554 echo "Wait 1 minutes"
557 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
558 ###################################################
561 ############### Ninth Failure Mode ###############
# Test 9 fragment: fail clients twice in a row (CLIENT/CLIENT); the live
# client must remain usable across both rounds. Distinct return codes 1-7
# identify exactly which verification step failed.
566 echo "Verify Lustre filesystem is up and running"
568 client_touch testfile || return 1
571 echo "Failing CLIENTs"
575 echo "Test Lustre stability after CLIENTs failure"
577 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
578 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
581 echo "Wait 1 minutes"
585 echo "Verify Lustre filesystem is up and running"
586 $PDSH $LIVE_CLIENT df $MOUNT || return 3
587 client_touch testfile || return 4
590 echo "Failing CLIENTs"
594 echo "Test Lustre stability after CLIENTs failure"
596 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
597 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
600 echo "Reintegrating CLIENTs/CLIENTs"
602 client_df || return 7
605 echo "Wait 1 minutes"
608 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
609 ###################################################
612 #Run availability after all failures
# Test 10 fragment: run the availability/load script across all clients for
# DURATION seconds (caller-overridable).
# NOTE(review): the old comment said "6 hours" but the value is 2*60*60 =
# 7200 s (2 hours); the run_test description below also still says
# "6 hours" — confirm which duration is intended.
613 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default
614 LOADTEST=${LOADTEST:-metadata-load.py}
615 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
617 run_test 10 "Running Availability for 6 hours..."
619 equals_msg "Done, cleaning up"