2 # Test multiple failures, AKA Test 17
# Default LUSTRE to the parent of this script's directory unless the caller already set it.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source the shared test framework from the Lustre tree.
7 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; ':=' also stores the default path in CONFIG when it is unset or empty.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-lmv.sh}
# assert_env is defined outside the lines shown here; presumably it checks that each
# named variable is set -- confirm in test-framework.sh.
17 assert_env MDSCOUNT mds1_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
20 # Initialize all the ostN_HOST
# For every host listed in EXTRA_OSTS, increment NUMOST and define a <name>_HOST
# variable through eval, where <name> is the current value of OST.
# NOTE(review): the assignment of OST and the closing done/fi are not among the lines shown here.
22 if [ "$EXTRA_OSTS" ]; then
23 for host in $EXTRA_OSTS; do
24 NUMOST=$((NUMOST + 1))
26 eval ${OST}_HOST=$host
30 # This can be a regexp, to allow more clients
# Unless CLIENTS is already set, build it by passing the live, failable and extra
# client names to comma_list (a helper defined outside the lines shown here).
31 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
36 # fail clients round robin
38 # list of failable clients
# FAIL_LIST is an array built by word-splitting FAIL_CLIENTS; FAIL_NUM is its length.
39 FAIL_LIST=($FAIL_CLIENTS)
40 FAIL_NUM=${#FAIL_LIST[*]}
43 DOWN_NUM=0 # number of nodes currently down
45 # set next client to fail
# Select the client at index FAIL_NEXT, then advance FAIL_NEXT with wrap-around.
# The initial value of FAIL_NEXT is set outside the lines shown here.
47 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
# NOTE(review): when FAIL_CLIENTS is empty FAIL_NUM is 0 and this modulo is a division by zero.
48 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
49 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Take one client down according to FAILURE_MODE; 'client' is set outside the lines shown here.
# HARD: whatever powers the node off is not shown; afterwards poll with a single
# ping (-c 1, -w 3) until the node stops answering.
54 if [ "$FAILURE_MODE" = HARD ]; then
56 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
57 echo "waiting for node $client to fail"
# SOFT: unmount MOUNT on the client, passing -f to zconf_umount.
60 elif [ "$FAILURE_MODE" = SOFT ]; then
61 zconf_umount $client $MOUNT -f
67 if [ "$FAILURE_MODE" = HARD ]; then
# Clamp the requested count: an empty 'num', or one larger than the number of
# clients still up (FAIL_NUM - DOWN_NUM), is replaced by that remaining number.
74 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
75 num=$((FAIL_NUM - DOWN_NUM))
# Guard for an empty or non-positive count; its body is not among the lines shown here.
78 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# One iteration per client to fail; how 'client' is chosen is not shown here.
84 for i in `seq $num`; do
# Record the client in the space-separated DOWN_CLIENTS list, then take it down.
87 DOWN_CLIENTS="$DOWN_CLIENTS $client"
88 shutdown_client $client
91 echo "down clients: $DOWN_CLIENTS"
# Second pass over the clients that are now down (loop body not shown here).
93 for client in $DOWN_CLIENTS; do
# Recompute DOWN_NUM as the number of words in DOWN_CLIENTS.
96 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring every client listed in DOWN_CLIENTS back: wait for the host, then mount
# MOUNT on it again. Returns 1 as soon as one zconf_mount fails.
# The end of this function is not among the lines shown here.
100 reintegrate_clients() {
101 for client in $DOWN_CLIENTS; do
# wait_for_host is defined elsewhere; presumably it blocks until the node is reachable -- confirm.
102 wait_for_host $client
103 echo "Restarting $client"
104 zconf_mount $client $MOUNT || return 1
# Build the configuration: MDS (optionally under an LMV), LOV, OSTs and the client.
# With more than one MDS, add each MDS from mds_list to lmv1 with its own device
# path under TMP (suffixed with the local hostname), then attach lov1 to lmv1.
112 if [ "$MDSCOUNT" -gt 1 ]; then
114 for mds in `mds_list`; do
115 MDSDEV=$TMP/${mds}-`hostname`
116 add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1
119 add_lov_to_lmv lov1 lmv1 --stripe_sz $STRIPE_BYTES \
120 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
# Single-MDS case: MDSDEV is whatever was set outside the lines shown here.
122 add_mds mds1 --dev $MDSDEV --size $MDSSIZE
# Register a failover MDS only when mds1failover_HOST is non-empty.
123 if [ ! -z "$mds1failover_HOST" ]; then
124 add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE
126 add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
127 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
# Add OSTs 1..NUMOST to lov1; OSTDEV is used as a printf format that receives the OST index.
131 for i in `seq $NUMOST`; do
132 dev=`printf $OSTDEV $i`
133 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
134 --journal-size $OSTJOURNALSIZE
# MDS is set outside the lines shown here.
137 add_client client --mds $MDS --lov lov1 --path $MOUNT
# Start the OSTs numbered 1..NUMOST.
142 for i in `seq $NUMOST`; do
144 start ost$i ${REFORMAT} $OSTLCONFARGS
# When DAEMONFILE is set, start the lctl debug daemon with that file and DAEMONSIZE.
146 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
# Start every MDS returned by mds_list.
147 for mds in `mds_list`; do
149 start $mds $MDSLCONFARGS ${REFORMAT}
# Retry every 5 seconds until "ls -d $LUSTRE" succeeds through do_node on CLIENTS.
151 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
# Mount on CLIENTS only when MOUNT is absent from /proc/mounts on the node running this script.
152 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Unmount MOUNT on all CLIENTS.
157 zconf_umount $CLIENTS $MOUNT
# Stop every MDS; '|| :' discards a failing status so the teardown keeps going.
159 for mds in `mds_list`; do
160 stop $mds ${FORCE} $MDSLCONFARGS || :
# Stop every OST the same way.
# NOTE(review): REFORMAT is passed to 'stop' here but not in the MDS loop above -- confirm that is intended.
162 for i in `seq $NUMOST`; do
163 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
# For each live or failable client, create $MOUNT/<client>_$file from that client via PDSH.
# 'file' is set outside the lines shown here.
171 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# Skip clients recorded as down.
# NOTE(review): grep matches unanchored, so a client whose name is a substring of a
# down client's name is skipped as well.
172 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
# Stop with status 1 at the first touch that fails.
173 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove $MOUNT/<client>_$file from each live or failable client via PDSH.
# Unlike the touch loop, the visible lines neither skip down clients nor check the rm status.
179 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
180 $PDSH $c rm $MOUNT/${c}_$file
# On each live or failable client, create the directory $MOUNT/<client> and list it.
# The exit statuses of the remote commands are not checked.
185 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
186 echo "$c mkdir $MOUNT/$c"
187 $PDSH $c "mkdir $MOUNT/$c"
188 $PDSH $c "ls -l $MOUNT/$c"
# Remove the per-client directories $MOUNT/<client>.
# Every rmdir is issued on LIVE_CLIENT, whereas the matching mkdir runs on client $c itself.
193 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
194 echo "rmdir $MOUNT/$c"
195 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Called by the tests with an OST facet name (e.g. ost1) after an OST is brought back.
# NOTE(review): the only body line visible here is the commented-out lctl recover
# command below; any remaining lines of the function are not shown.
199 clients_recover_osts() {
201 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Scan ost1_HOST .. ost<NUMOST>_HOST for the entry equal to $node ('node' is set outside the lines shown).
207 for i in `seq $NUMOST`; do
208 ostvar="ost${i}_HOST"
# ${!ostvar} is bash indirect expansion: the value of the variable whose name is stored in ostvar.
209 if [ "${!ostvar}" == $node ]; then
# Presumably reached when no host variable matched; the control flow leading here is not shown.
214 echo "No ost found for node; $node"
# Driver section selecting what to run from ONLY, NOSETUP and EVAL.
# The bodies of these conditionals are not among the lines shown here.
# '==' inside [ ] is a bash extension; POSIX test spells it '='.
221 if [ "$ONLY" == "cleanup" ]; then
226 if [ -z "$NOSETUP" ]; then
231 if [ ! -z "$EVAL" ]; then
236 if [ "$ONLY" == "setup" ]; then
240 # 9 Different Failure Modes Combinations
# Announce the start of the run, stamped with the current date.
241 echo "Starting Test 17 at `date`"
# Three wait stages on the background job whose pid is in DFPID (how it is launched
# is not shown here); each stage leaves the enclosing function with its own
# status (1, 2, 3) when the awaited job failed.
246 echo "Waiting for df pid: $DFPID"
247 wait $DFPID || return 1
251 echo "Waiting for df pid: $DFPID"
252 wait $DFPID || return 2
256 echo "Waiting for df pid: $DFPID"
257 wait $DFPID || return 3
260 run_test 0 "Fail all nodes, independently"
262 ############### First Failure Mode ###############
264 echo "Don't do a MDS - MDS Failure Case"
265 echo "This makes no sense"
267 run_test 1 "MDS/MDS failure"
268 ###################################################
270 ############### Second Failure Mode ###############
# Second failure mode (MDS then OST). Only the progress messages and the final
# checks are visible here; the failure-injection steps between them are not shown.
272 echo "Verify Lustre filesystem is up and running"
279 # prepare for MDS failover
290 echo "Reintegrating OST"
301 clients_recover_osts ost1
302 echo "Verify reintegration"
# A failing client_df ends the test with status 1.
303 client_df || return 1
306 run_test 2 "Second Failure Mode: MDS/OST `date`"
307 ###################################################
310 ############### Third Failure Mode ###############
# Third failure mode (MDS then clients). The failure-injection steps between the
# messages are not among the lines shown here.
313 echo "Verify Lustre filesystem is up and running"
# A failed background df is only reported here; the test continues.
317 wait $DFPID || echo df failed: $?
320 echo "Test Lustre stability after MDS failover"
324 echo "Failing 2 CLIENTS"
328 echo "Test Lustre stability after CLIENT failure"
332 echo "Reintegrating CLIENTS"
# Status 1: a client could not be remounted; status 3: the final client_df failed.
333 reintegrate_clients || return 1
335 client_df || return 3
# Register test 3 (MDS failure followed by client failure). The description is
# stamped with the date at the moment this line is evaluated.
337 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
338 ###################################################
340 ############### Fourth Failure Mode ###############
# Fourth failure mode (OST then MDS). Only the progress messages and the final
# checks are visible here; the failure-injection steps between them are not shown.
342 echo "Fourth Failure Mode: OST/MDS `date`"
345 echo "Failing OST ost1"
349 echo "Test Lustre stability after OST failure"
357 # prepare for MDS failover
366 echo "Reintegrating OST"
377 clients_recover_osts ost1
378 echo "Test Lustre stability after MDS failover"
# A failing client_df ends the test with status 1.
379 client_df || return 1
381 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
382 ###################################################
384 ############### Fifth Failure Mode ###############
# Fifth failure mode (two OSTs). The failure-injection steps between the messages
# are not among the lines shown here.
386 echo "Fifth Failure Mode: OST/OST `date`"
389 echo "Verify Lustre filesystem is up and running"
398 echo "Test Lustre stability after OST failure"
407 echo "Test Lustre stability after OST failure"
411 echo "Reintegrating OSTs"
# Both OST facets go through clients_recover_osts before the final check.
417 clients_recover_osts ost1
418 clients_recover_osts ost2
# A failing client_df ends the test with status 2.
421 client_df || return 2
423 run_test 5 "Fifth Failure Mode: OST/OST `date`"
424 ###################################################
426 ############### Sixth Failure Mode ###############
428 echo "Sixth Failure Mode: OST/CLIENT `date`"
# Sixth failure mode (OST then clients). The failure-injection steps between the
# messages are not among the lines shown here.
431 echo "Verify Lustre filesystem is up and running"
# Pre-checks: status 1 when client_df fails, status 2 when creating the test files fails.
432 client_df || return 1
433 client_touch testfile || return 2
441 echo "Test Lustre stability after OST failure"
445 echo "Failing CLIENTs"
449 echo "Test Lustre stability after CLIENTs failure"
453 echo "Reintegrating OST/CLIENTs"
459 echo "Verifying mount"
# Status 3 when the final client_df fails.
460 client_df || return 3
462 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
463 ###################################################
466 ############### Seventh Failure Mode ###############
468 echo "Seventh Failure Mode: CLIENT/MDS `date`"
# Seventh failure mode (client then MDS). The failure-injection steps between the
# messages are not among the lines shown here.
471 echo "Verify Lustre filesystem is up and running"
473 client_touch testfile || return 1
476 echo "Part 1: Failing CLIENT"
480 echo "Test Lustre stability after CLIENTs failure"
# From the surviving client: list the mount point and remove every *_testfile (statuses unchecked).
482 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
483 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
486 echo "Wait 1 minutes"
490 echo "Verify Lustre filesystem is up and running"
499 echo "Test Lustre stability after MDS failover"
# NOTE(review): in 'a || b || c' the 'return 1' runs only if echo itself fails,
# so a failed df is merely logged and the test continues.
500 wait $DFPID || echo "df on down clients fails " || return 1
501 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
502 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
505 echo "Reintegrating CLIENTs"
# Status 2 when client_df fails after reintegration.
507 client_df || return 2
510 echo "wait 1 minutes"
513 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
514 ###################################################
517 ############### Eighth Failure Mode ###############
519 echo "Eighth Failure Mode: CLIENT/OST `date`"
# Eighth failure mode (client then OST). The failure-injection steps between the
# messages are not among the lines shown here.
522 echo "Verify Lustre filesystem is up and running"
# NOTE(review): the status of this client_touch is not checked, unlike the one at the end of the test.
524 client_touch testfile
527 echo "Failing CLIENTs"
531 echo "Test Lustre stability after CLIENTs failure"
# From the surviving client: list the mount point and remove every *_testfile (statuses unchecked).
533 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
534 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
537 echo "Wait 1 minutes"
541 echo "Verify Lustre filesystem is up and running"
# Status unchecked here as well.
543 client_touch testfile
552 echo "Test Lustre stability after OST failure"
554 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
555 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
558 echo "Reintegrating CLIENTs/OST"
# Final checks: status 1 when client_df fails, status 2 when creating testfile2 fails.
562 client_df || return 1
563 client_touch testfile2 || return 2
566 echo "Wait 1 minutes"
569 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
570 ###################################################
573 ############### Ninth Failure Mode ###############
# Ninth failure mode (clients then clients). Each checked step leaves the test
# with its own status; the failure-injection steps are not among the lines shown here.
578 echo "Verify Lustre filesystem is up and running"
580 client_touch testfile || return 1
583 echo "Failing CLIENTs"
587 echo "Test Lustre stability after CLIENTs failure"
# NOTE(review): status 1 is also used by the client_touch above, so the two
# failures cannot be told apart by exit status.
589 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
590 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
593 echo "Wait 1 minutes"
597 echo "Verify Lustre filesystem is up and running"
# Here df is run on the surviving client only, not through client_df.
598 $PDSH $LIVE_CLIENT df $MOUNT || return 3
599 client_touch testfile || return 4
602 echo "Failing CLIENTs"
606 echo "Test Lustre stability after CLIENTs failure"
608 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
609 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
612 echo "Reintegrating CLIENTs/CLIENTs"
614 client_df || return 7
617 echo "Wait 1 minutes"
620 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
621 ###################################################
624 #Run availability after all failures
# NOTE(review): the default below evaluates to 7200 seconds (2 hours), while the
# description given to run_test for this test says 6 hours -- confirm which is intended.
625 DURATION=${DURATION:-$((2 * 60 * 60))} # default: 2 * 60 * 60 seconds (2 hours)
# LOADTEST is not referenced in the lines shown here.
626 LOADTEST=${LOADTEST:-metadata-load.py}
# Run availability.sh from the current working directory; its failure leaves the
# enclosing function with status 1.
627 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
629 run_test 10 "Running Availability for 6 hours..."
631 equals_msg "Done, cleaning up"