# Multiple-failure ("insanity") test script for Lustre, historically "Test 17".
# NOTE(review): this listing is a sampled excerpt with embedded original line
# numbers; intervening lines are elided, so comments describe only what is
# visible here.
2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script, then pull in the shared
# test harness and the insanity-lmv cluster configuration.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-lmv.sh}
# Allow callers to override the setup/cleanup entry points by name.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
# Abort early unless the required cluster-topology variables are defined.
20 assert_env MDSCOUNT mds1_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
23 # Initialize all the ostN_HOST
# Register any extra OST hosts beyond the base config, numbering them
# sequentially into ostN_HOST variables (loop/if closers elided here).
25 if [ "$EXTRA_OSTS" ]; then
26 for host in $EXTRA_OSTS; do
27 NUMOST=$((NUMOST + 1))
# eval builds the dynamic variable name; OST is presumably set to
# "ost$NUMOST" on an elided line -- TODO confirm.
29 eval ${OST}_HOST=$host
33 # This can be a regexp, to allow more clients
# Full client list: the always-up LIVE_CLIENT plus failable/extra clients.
34 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
39 # fail clients round robin
41 # list of failable clients
# FAIL_LIST/FAIL_NUM drive a round-robin choice of which client to fail
# next; DOWN_NUM tracks how many clients are currently down.
42 FAIL_LIST=($FAIL_CLIENTS)
43 FAIL_NUM=${#FAIL_LIST[*]}
46 DOWN_NUM=0 # number of nodes currently down
48 # set next client to fail
# Pick the next victim and advance the cursor modulo the list length.
# NOTE(review): FAIL_NEXT's initialization is on an elided line -- confirm
# it starts at 0.
50 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
51 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
52 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Client failure helpers (function headers elided in this excerpt).
# HARD mode: power the node off and wait until it stops answering pings;
# SOFT mode: force-unmount the Lustre client instead.
57 if [ "$FAILURE_MODE" = HARD ]; then
59 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
60 echo "waiting for node $client to fail"
63 elif [ "$FAILURE_MODE" = SOFT ]; then
64 zconf_umount $client $MOUNT -f
# Fail up to $num clients, capped by how many failable clients are still up.
70 if [ "$FAILURE_MODE" = HARD ]; then
77 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
78 num=$((FAIL_NUM - DOWN_NUM))
81 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# Shut down each chosen client and record it in DOWN_CLIENTS.
87 for i in `seq $num`; do
90 DOWN_CLIENTS="$DOWN_CLIENTS $client"
91 shutdown_client $client
94 echo "down clients: $DOWN_CLIENTS"
96 for client in $DOWN_CLIENTS; do
# Recompute the down count from the (space-separated) DOWN_CLIENTS list.
99 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring previously failed clients back: wait for each host to respond,
# then remount Lustre on it.  (Closing brace elided from this excerpt.)
103 reintegrate_clients() {
104 for client in $DOWN_CLIENTS; do
105 wait_for_host $client
106 echo "Restarting $client"
107 zconf_mount $client $MOUNT || return 1
# Configuration generation (enclosing function header elided): with
# MDSCOUNT > 1 create one MDS per entry under an LMV; otherwise a single
# MDS with an optional failover twin.  Then add the OSTs to lov1 and a
# client mount.
115 if [ "$MDSCOUNT" -gt 1 ]; then
117 for mds in `mds_list`; do
118 MDSDEV=$TMP/${mds}-`hostname`
119 add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1_svc
121 add_lov_to_lmv lov1 lmv1_svc --stripe_sz $STRIPE_BYTES \
122 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
125 add_mds mds1 --dev $MDSDEV --size $MDSSIZE
126 if [ ! -z "$mds1failover_HOST" ]; then
127 add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE
129 add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
130 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
# One OST per index; OSTDEV is a printf pattern taking the index.
134 for i in `seq $NUMOST`; do
135 dev=`printf $OSTDEV $i`
136 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
137 --journal-size $OSTJOURNALSIZE
140 add_client client $MDS --lov lov1 --path $MOUNT
# Bring the filesystem up: Kerberos KDC and GSS daemons, all OSTs, all
# MDSes, then mount every client (enclosing function header elided).
146 start_krb5_kdc || exit 1
148 for i in `seq $NUMOST`; do
150 start ost$i ${REFORMAT} $OSTLCONFARGS
152 start_lsvcgssd || exit 2
153 start_lgssd || exit 3
# Optionally stream debug logs to a daemon file.
154 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
155 for mds in `mds_list`; do
157 start $mds $MDSLCONFARGS ${REFORMAT}
# Poll until the clients can see the Lustre tree, then mount if not mounted.
159 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
160 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Tear down in reverse order: unmount clients, stop MDSes, stop OSTs.
# Stop failures are deliberately ignored (|| :) so cleanup always proceeds.
165 zconf_umount $CLIENTS $MOUNT
167 for mds in `mds_list`; do
168 stop $mds ${FORCE} $MDSLCONFARGS || :
172 for i in `seq $NUMOST`; do
173 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
# Per-client filesystem helpers (function headers elided; names below are
# inferred from the operations -- TODO confirm against the full file).
# Touch helper: create $MOUNT/<client>_<file> from every client that is
# not currently recorded as down.
181 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
182 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
183 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove helper: delete the per-client files created above.
189 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
190 $PDSH $c rm $MOUNT/${c}_$file
# mkdir helper: each client creates and lists its own directory.
195 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
196 echo "$c mkdir $MOUNT/$c"
197 $PDSH $c "mkdir $MOUNT/$c"
198 $PDSH $c "ls -l $MOUNT/$c"
# rmdir helper: the live client removes every per-client directory.
203 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
204 echo "rmdir $MOUNT/$c"
205 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Ask clients to run OSC recovery for the given OST facet (body largely
# elided in this excerpt; the commented-out lctl invocation hints at the
# mechanism).
209 clients_recover_osts() {
211 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Reverse lookup: map a node name to its ostN index by scanning the
# dynamically named ostN_HOST variables via indirect expansion.
217 for i in `seq $NUMOST`; do
218 ostvar="ost${i}_HOST"
219 if [ "${!ostvar}" == $node ]; then
# NOTE(review): message punctuation looks off ("node; $node" was probably
# meant to be "node: $node") -- runtime string left unchanged here.
224 echo "No ost found for node; $node"
# Shortcut entry points: ONLY=cleanup / ONLY=setup run just that phase;
# EVAL lets the caller inject an arbitrary command (bodies elided).
231 if [ "$ONLY" == "cleanup" ]; then
236 if [ ! -z "$EVAL" ]; then
243 if [ "$ONLY" == "setup" ]; then
247 # 9 Different Failure Modes Combinations
248 echo "Starting Test 17 at `date`"
# test_0: fail and recover each server type independently; a background df
# (pid in DFPID, started on elided lines) must unblock after each recovery.
# NOTE(review): "$?" inside the braces reports wait's status at echo time,
# and the && means return is skipped if echo fails -- harmless, but worth
# confirming the intent upstream.
253 echo "Waiting for df pid: $DFPID"
254 wait $DFPID || { echo "df returned $?" && return 1; }
258 echo "Waiting for df pid: $DFPID"
259 wait $DFPID || { echo "df returned $?" && return 2; }
263 echo "Waiting for df pid: $DFPID"
264 wait $DFPID || { echo "df returned $?" && return 3; }
267 run_test 0 "Fail all nodes, independently"
269 ############### First Failure Mode ###############
# test_1: an MDS/MDS double failure is intentionally a no-op placeholder.
271 echo "Don't do a MDS - MDS Failure Case"
272 echo "This makes no sense"
274 run_test 1 "MDS/MDS failure"
275 ###################################################
277 ############### Second Failure Mode ###############
# test_2: fail the MDS, then an OST (failure steps elided), reintegrate
# the OST, and verify clients can still stat the filesystem.
279 echo "Verify Lustre filesystem is up and running"
286 # prepare for MDS failover
297 echo "Reintegrating OST"
308 clients_recover_osts ost1
309 echo "Verify reintegration"
310 client_df || return 1
313 run_test 2 "Second Failure Mode: MDS/OST `date`"
314 ###################################################
317 ############### Third Failure Mode ###############
# test_3: MDS failover followed by failing two clients, then reintegrate
# the clients and verify the mount (failure steps elided).
320 echo "Verify Lustre filesystem is up and running"
# df failure is reported but tolerated here -- only recovery is fatal.
324 wait $DFPID || echo df failed: $?
327 echo "Test Lustre stability after MDS failover"
331 echo "Failing 2 CLIENTS"
335 echo "Test Lustre stability after CLIENT failure"
339 echo "Reintegrating CLIENTS"
340 reintegrate_clients || return 1
342 client_df || return 3
# Fix typo in the test description: "Thirdb" -> "Third" (matches the
# "Third Failure Mode" banner and the ordinal naming of sibling tests).
344 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
345 ###################################################
347 ############### Fourth Failure Mode ###############
349 echo "Fourth Failure Mode: OST/MDS `date`"
352 echo "Failing OST ost1"
356 echo "Test Lustre stability after OST failure"
364 # prepare for MDS failover
373 echo "Reintegrating OST"
384 clients_recover_osts ost1
385 echo "Test Lustre stability after MDS failover"
386 client_df || return 1
388 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
389 ###################################################
391 ############### Fifth Failure Mode ###############
393 echo "Fifth Failure Mode: OST/OST `date`"
396 echo "Verify Lustre filesystem is up and running"
405 echo "Test Lustre stability after OST failure"
414 echo "Test Lustre stability after OST failure"
418 echo "Reintegrating OSTs"
424 clients_recover_osts ost1
425 clients_recover_osts ost2
428 client_df || return 2
430 run_test 5 "Fifth Failure Mode: OST/OST `date`"
431 ###################################################
433 ############### Sixth Failure Mode ###############
# test_6: fail an OST, then some clients (steps elided), bring everything
# back, and verify the mount.
435 echo "Sixth Failure Mode: OST/CLIENT `date`"
438 echo "Verify Lustre filesystem is up and running"
439 client_df || return 1
440 client_touch testfile || return 2
448 echo "Test Lustre stability after OST failure"
452 echo "Failing CLIENTs"
456 echo "Test Lustre stability after CLIENTs failure"
460 echo "Reintegrating OST/CLIENTs"
466 echo "Verifying mount"
467 client_df || return 3
469 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
470 ###################################################
473 ############### Seventh Failure Mode ###############
# test_7: fail a client, verify stability from the live client, then fail
# over the MDS while clients are down, verify again, and reintegrate.
475 echo "Seventh Failure Mode: CLIENT/MDS `date`"
478 echo "Verify Lustre filesystem is up and running"
480 client_touch testfile || return 1
483 echo "Part 1: Failing CLIENT"
487 echo "Test Lustre stability after CLIENTs failure"
489 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
490 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
493 echo "Wait 1 minutes"
497 echo "Verify Lustre filesystem is up and running"
506 echo "Test Lustre stability after MDS failover"
# NOTE(review): the trailing "|| return 1" below is unreachable -- echo
# succeeds, so the third branch never runs.  Probably intended as
# { echo ...; return 1; }.  Confirm intent before changing behavior.
507 wait $DFPID || echo "df on down clients fails " || return 1
508 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
509 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
512 echo "Reintegrating CLIENTs"
514 client_df || return 2
517 echo "wait 1 minutes"
520 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
521 ###################################################
524 ############### Eighth Failure Mode ###############
# test_8: fail clients first, then an OST (steps elided), recover both,
# and verify with a fresh touch.
526 echo "Eighth Failure Mode: CLIENT/OST `date`"
529 echo "Verify Lustre filesystem is up and running"
531 client_touch testfile
534 echo "Failing CLIENTs"
538 echo "Test Lustre stability after CLIENTs failure"
540 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
541 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
544 echo "Wait 1 minutes"
548 echo "Verify Lustre filesystem is up and running"
550 client_touch testfile
559 echo "Test Lustre stability after OST failure"
561 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
562 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
565 echo "Reintegrating CLIENTs/OST"
# A new file name (testfile2) proves create works post-recovery, not just stat.
569 client_df || return 1
570 client_touch testfile2 || return 2
573 echo "Wait 1 minutes"
576 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
577 ###################################################
580 ############### Ninth Failure Mode ###############
# test_9: fail one batch of clients, verify, fail a second batch, verify
# again, then reintegrate all of them (failure steps elided).
585 echo "Verify Lustre filesystem is up and running"
587 client_touch testfile || return 1
590 echo "Failing CLIENTs"
594 echo "Test Lustre stability after CLIENTs failure"
596 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
597 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
600 echo "Wait 1 minutes"
604 echo "Verify Lustre filesystem is up and running"
605 $PDSH $LIVE_CLIENT df $MOUNT || return 3
606 client_touch testfile || return 4
609 echo "Failing CLIENTs"
613 echo "Test Lustre stability after CLIENTs failure"
615 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
616 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
619 echo "Reintegrating CLIENTs/CLIENTs"
621 client_df || return 7
624 echo "Wait 1 minutes"
627 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
628 ###################################################
631 #Run availability after all failures
# test_10: run the availability/load driver across all clients for
# DURATION seconds using LOADTEST as the workload.
# NOTE(review): the default below is 2*60*60 = 2 hours, but the run_test
# banner advertises "6 hours" -- the two should be reconciled upstream.
632 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default (banner says 6)
633 LOADTEST=${LOADTEST:-metadata-load.py}
634 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
636 run_test 10 "Running Availability for 6 hours..."
638 equals_msg "Done, cleaning up"