2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script (overridable via $LUSTRE).
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Pull in the shared test helpers (run_test, assert_env, start/stop, etc.).
7 . $LUSTRE/tests/test-framework.sh
# Source the cluster config; CONFIG defaults to the local insanity config.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Setup/cleanup hooks are function names, overridable from the environment.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
17 UPCALL=${UPCALL:-DEFAULT}
# assert_env (from test-framework.sh) presumably aborts unless these host
# variables are set by the sourced config — TODO confirm against framework.
21 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
24 # Initialize all the ostN_HOST
# NOTE(review): fragment — the matching `done`/`fi` for this if/for are on
# lines omitted from this chunk.
26 if [ "$EXTRA_OSTS" ]; then
27 for host in $EXTRA_OSTS; do
28 NUMOST=$((NUMOST + 1))
# Dynamically defines ost${N}_HOST for each extra OST host; $OST is
# presumably set to "ost$NUMOST" on an omitted line — TODO confirm.
30 eval ${OST}_HOST=$host
34 # This can be a regexp, to allow more clients
# Comma-separated client list for pdsh-style fan-out, unless caller set it.
35 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
40 # fail clients round robin
42 # list of failable clients
# Bash array built by word-splitting $FAIL_CLIENTS.
43 FAIL_LIST=($FAIL_CLIENTS)
# Number of failable clients.
44 FAIL_NUM=${#FAIL_LIST[*]}
47 DOWN_NUM=0 # number of nodes currently down
49 # set next client to fail
# Pick the current victim, then advance the cursor modulo the list size
# so successive calls rotate through FAIL_LIST.
51 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
52 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
53 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# NOTE(review): fragment — interior lines and closing `fi`/`done` of these
# constructs are omitted from this chunk.
# HARD mode: node is expected to actually go down; poll with ping until it
# stops answering.
58 if [ "$FAILURE_MODE" = HARD ]; then
60 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
61 echo "waiting for node $client to fail"
# SOFT mode: simulate failure with a forced unmount instead of a real crash.
64 elif [ "$FAILURE_MODE" = SOFT ]; then
65 zconf_umount $client $MOUNT -f
71 if [ "$FAILURE_MODE" = HARD ]; then
# Clamp the requested fail count to the clients still up.
78 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
79 num=$((FAIL_NUM - DOWN_NUM))
82 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# Take down $num clients, remembering each in DOWN_CLIENTS for later
# reintegration.
88 for i in `seq $num`; do
91 DOWN_CLIENTS="$DOWN_CLIENTS $client"
92 shutdown_client $client
95 echo "down clients: $DOWN_CLIENTS"
97 for client in $DOWN_CLIENTS; do
# Recompute the down count from the accumulated list.
100 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring every previously-failed client back: wait for the host to respond,
# then remount Lustre on it.  Returns non-zero if any remount fails.
# NOTE(review): fragment — the loop's `done` and the closing `}` are on
# omitted lines (DOWN_CLIENTS is presumably reset there — TODO confirm).
104 reintegrate_clients() {
105 for client in $DOWN_CLIENTS; do
106 wait_for_host $client
107 echo "Restarting $client"
108 zconf_mount $client $MOUNT || return 1
# Build the Lustre XML/config: one MDS (plus optional failover MDS), an LOV
# striped across all OSTs, and NUMOST OSTs.
116 add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
# Only configure MDS failover when the config supplies a failover host.
118 if [ ! -z "$mdsfailover_HOST" ]; then
119 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
# (No comments may be inserted below between continuation lines.)
122 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
123 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
124 for i in `seq $NUMOST`; do
# OSTDEV is used as a printf format; %d-style patterns yield per-OST devices.
125 dev=`printf $OSTDEV $i`
126 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
127 --journal-size $OSTJOURNALSIZE
# Client mounts the lov1-backed filesystem at $MOUNT.
131 add_client client mds --lov lov1 --path $MOUNT
# Start all OSTs first, then the MDS; REFORMAT may request a mkfs.
138 for i in `seq $NUMOST`; do
140 start ost$i ${REFORMAT} $OSTLCONFARGS
# Optionally capture kernel debug output to a daemon file.
142 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
144 start mds $MDSLCONFARGS ${REFORMAT}
# Block until every client can see the Lustre tree over do_node.
145 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
# Mount on the clients unless something is already mounted at $MOUNT.
146 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Teardown: unmount clients, then stop MDS and every OST.  `|| :` makes the
# stops best-effort so cleanup continues past individual failures.
151 zconf_umount $CLIENTS $MOUNT
153 stop mds ${FORCE} $MDSLCONFARGS || :
154 for i in `seq $NUMOST`; do
155 stop ost$i ${FORCE} $OSTLCONFARGS || :
# Per-client sanity helpers (fragments — each loop's `done` is omitted):
# create a uniquely-named file from every live client.
163 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# Skip clients that are currently down.
164 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
165 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove the per-client files.
171 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
172 $PDSH $c rm $MOUNT/${c}_$file
# Each client makes (and lists) its own directory.
177 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
178 echo "$c mkdir $MOUNT/$c"
179 $PDSH $c "mkdir $MOUNT/$c"
180 $PDSH $c "ls -l $MOUNT/$c"
# Directories are removed from the live client only.
185 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
186 echo "rmdir $MOUNT/$c"
187 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Map a node name back to its ost$i facet so clients can trigger OSC recovery.
# NOTE(review): fragment — interior recovery call, `done`, and closing `}`
# are on omitted lines.
191 clients_recover_osts() {
193 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
199 for i in `seq $NUMOST`; do
200 ostvar="ost${i}_HOST"
# Indirect expansion: compare each ost${i}_HOST against the requested node.
# NOTE(review): $node is unquoted here — would break on empty/spacey values.
201 if [ "${!ostvar}" == $node ]; then
206 echo "No ost found for node; $node"
# Driver entry points: ONLY=cleanup / ONLY=setup short-circuit the full run;
# EVAL lets the caller inject a command.  (Bodies on omitted lines.)
213 if [ "$ONLY" == "cleanup" ]; then
218 if [ ! -z "$EVAL" ]; then
225 if [ "$ONLY" == "setup" ]; then
229 # 9 Different Failure Modes Combinations
230 echo "Starting Test 17 at `date`"
# test_0 (fragment): fail MDS, then OST, then a client, independently; after
# each, a backgrounded df ($DFPID) must complete once the facet recovers.
# Inside the braces, $? expands to the failed wait's exit status.
235 echo "Waiting for df pid: $DFPID"
236 wait $DFPID || { echo "df returned $?" && return 1; }
240 echo "Waiting for df pid: $DFPID"
241 wait $DFPID || { echo "df returned $?" && return 2; }
245 echo "Waiting for df pid: $DFPID"
246 wait $DFPID || { echo "df returned $?" && return 3; }
249 run_test 0 "Fail all nodes, independently"
251 ############### First Failure Mode ###############
# test_1 is intentionally a no-op: MDS-then-MDS double failure is not a
# meaningful scenario in this single-MDS setup.
253 echo "Don't do a MDS - MDS Failure Case"
254 echo "This makes no sense"
256 run_test 1 "MDS/MDS failure"
257 ###################################################
259 ############### Second Failure Mode ###############
# test_2 (fragment): fail the MDS, then an OST while the MDS recovers, then
# reintegrate and verify clients can still stat the filesystem.
261 echo "Verify Lustre filesystem is up and running"
268 # prepare for MDS failover
279 echo "Reintegrating OST"
290 clients_recover_osts ost1
291 echo "Verify reintegration"
292 client_df || return 1
295 run_test 2 "Second Failure Mode: MDS/OST `date`"
296 ###################################################
299 ############### Third Failure Mode ###############
# test_3 (fragment): fail the MDS, then two clients during recovery, then
# reintegrate the clients and verify df succeeds everywhere.
302 echo "Verify Lustre filesystem is up and running"
# Non-fatal: df failure is only reported here, not returned.
306 wait $DFPID || echo df failed: $?
309 echo "Test Lustre stability after MDS failover"
313 echo "Failing 2 CLIENTS"
317 echo "Test Lustre stability after CLIENT failure"
321 echo "Reintegrating CLIENTS"
322 reintegrate_clients || return 1
324 client_df || return 3
# Register test_3; banner typo fixed ("Thirdb" -> "Third") to match the
# "Third Failure Mode" section header above.
326 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
327 ###################################################
329 ############### Fourth Failure Mode ###############
# test_4 (fragment): inverse of test_2 — fail an OST first, then the MDS,
# then recover both and verify client df.
331 echo "Fourth Failure Mode: OST/MDS `date`"
334 echo "Failing OST ost1"
338 echo "Test Lustre stability after OST failure"
348 # prepare for MDS failover
357 echo "Reintegrating OST"
369 clients_recover_osts ost1
370 echo "Test Lustre stability after MDS failover"
371 client_df || return 1
373 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
374 ###################################################
376 ############### Fifth Failure Mode ###############
# test_5 (fragment): fail two OSTs in sequence, reintegrate both, verify df.
378 echo "Fifth Failure Mode: OST/OST `date`"
381 echo "Verify Lustre filesystem is up and running"
390 echo "Test Lustre stability after OST failure"
401 echo "Test Lustre stability after OST failure"
407 echo "Reintegrating OSTs"
413 clients_recover_osts ost1
414 clients_recover_osts ost2
419 client_df || return 2
421 run_test 5 "Fifth Failure Mode: OST/OST `date`"
422 ###################################################
424 ############### Sixth Failure Mode ###############
# test_6 (fragment): fail an OST, then clients, reintegrate both, and verify
# the mount still answers df.
426 echo "Sixth Failure Mode: OST/CLIENT `date`"
429 echo "Verify Lustre filesystem is up and running"
430 client_df || return 1
431 client_touch testfile || return 2
439 echo "Test Lustre stability after OST failure"
445 echo "Failing CLIENTs"
449 echo "Test Lustre stability after CLIENTs failure"
455 echo "Reintegrating OST/CLIENTs"
463 echo "Verifying mount"
464 client_df || return 3
466 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
467 ###################################################
470 ############### Seventh Failure Mode ###############
# test_7 (fragment): part 1 fails a client, part 2 fails the MDS while that
# client is down, then reintegrates.
472 echo "Seventh Failure Mode: CLIENT/MDS `date`"
475 echo "Verify Lustre filesystem is up and running"
477 client_touch testfile || return 1
480 echo "Part 1: Failing CLIENT"
484 echo "Test Lustre stability after CLIENTs failure"
486 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
487 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
490 echo "Wait 1 minutes"
494 echo "Verify Lustre filesystem is up and running"
503 echo "Test Lustre stability after MDS failover"
# NOTE(review): the final `|| return 1` can never fire — if wait fails, the
# echo succeeds and short-circuits the chain. Likely intended as best-effort.
504 wait $DFPID || echo "df on down clients fails " || return 1
505 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
506 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
509 echo "Reintegrating CLIENTs"
511 client_df || return 2
514 echo "wait 1 minutes"
517 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
518 ###################################################
521 ############### Eighth Failure Mode ###############
# test_8 (fragment): fail clients first, then an OST while they are down,
# reintegrate both, and verify create + df still work.
523 echo "Eighth Failure Mode: CLIENT/OST `date`"
526 echo "Verify Lustre filesystem is up and running"
528 client_touch testfile
531 echo "Failing CLIENTs"
535 echo "Test Lustre stability after CLIENTs failure"
537 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
538 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
541 echo "Wait 1 minutes"
545 echo "Verify Lustre filesystem is up and running"
547 client_touch testfile
556 echo "Test Lustre stability after OST failure"
# Deliberately disabled: on non-failout configs these would hang forever.
560 #non-failout hangs forever here
561 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
562 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
565 echo "Reintegrating CLIENTs/OST"
570 client_df || return 1
571 client_touch testfile2 || return 2
574 echo "Wait 1 minutes"
577 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
578 ###################################################
581 ############### Ninth Failure Mode ###############
# test_9 (fragment): two successive rounds of client failures, verifying the
# live client can still list/clean $MOUNT after each, then reintegrate.
586 echo "Verify Lustre filesystem is up and running"
588 client_touch testfile || return 1
# Round 1.
591 echo "Failing CLIENTs"
595 echo "Test Lustre stability after CLIENTs failure"
597 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
598 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
601 echo "Wait 1 minutes"
605 echo "Verify Lustre filesystem is up and running"
606 $PDSH $LIVE_CLIENT df $MOUNT || return 3
607 client_touch testfile || return 4
# Round 2.
610 echo "Failing CLIENTs"
614 echo "Test Lustre stability after CLIENTs failure"
616 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
617 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
620 echo "Reintegrating CLIENTs/CLIENTs"
622 client_df || return 7
625 echo "Wait 1 minutes"
628 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
629 ###################################################
632 #Run availability after all failures
# Default load-test duration. NOTE(review): 2 * 60 * 60 is 2 hours, but this
# line's comment previously said "6 hours" and the run_test message below
# still says "6 hours" — confirm which duration is intended.
633 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default
634 LOADTEST=${LOADTEST:-metadata-load.py}
# Drive availability.sh against all clients for $DURATION seconds.
635 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
637 run_test 10 "Running Availability for 6 hours..."
639 equals_msg "Done, cleaning up"
640 # we need to force cleanup for the stale MDS conns until bug 5921 is fixed
# Invoke the cleanup hook with FORCE=--force in its environment.
641 FORCE=--force $CLEANUP