2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree: unless LUSTRE is already set (and non-empty), use
# the parent of the directory that contains this script.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source the shared test framework from the Lustre tree.
7 . $LUSTRE/tests/test-framework.sh
# Source the test configuration. The ':=' form also assigns the default path
# to CONFIG when it is unset or empty, so $CONFIG can be referenced later.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-lmv.sh}
# SETUP and CLEANUP default to "setup" and "cleanup" and may be overridden
# from the environment.
# NOTE(review): presumably these name the commands run before and after the
# tests; their use is not visible in this excerpt -- confirm.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
# NOTE(review): assert_env is defined outside this excerpt; presumably it
# fails unless every variable named here is set -- confirm in the framework.
20 assert_env MDSCOUNT mds1_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
23 # Initialize all the ostN_HOST
25 if [ "$EXTRA_OSTS" ]; then
26 for host in $EXTRA_OSTS; do
27 NUMOST=$((NUMOST + 1))
29 eval ${OST}_HOST=$host
33 # This can be a regexp, to allow more clients
34 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
39 # fail clients round robin
41 # list of failable clients
42 FAIL_LIST=($FAIL_CLIENTS)
43 FAIL_NUM=${#FAIL_LIST[*]}
46 DOWN_NUM=0 # number of nodes currently down
48 # set next client to fail
50 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
51 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
52 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
57 if [ "$FAILURE_MODE" = HARD ]; then
59 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
60 echo "waiting for node $client to fail"
63 elif [ "$FAILURE_MODE" = SOFT ]; then
64 zconf_umount $client $MOUNT -f
70 if [ "$FAILURE_MODE" = HARD ]; then
77 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
78 num=$((FAIL_NUM - DOWN_NUM))
81 if [ -z "$num" ] || [ "$num" -le 0 ]; then
87 for i in `seq $num`; do
90 DOWN_CLIENTS="$DOWN_CLIENTS $client"
91 shutdown_client $client
94 echo "down clients: $DOWN_CLIENTS"
96 for client in $DOWN_CLIENTS; do
99 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# reintegrate_clients: bring back every client listed in DOWN_CLIENTS.
# Globals:  DOWN_CLIENTS (read, whitespace-separated list), MOUNT (read)
# Returns:  1 as soon as zconf_mount fails for any client.
# NOTE(review): wait_for_host and zconf_mount are defined outside this
# excerpt; the descriptions below are inferred from their names -- confirm.
103 reintegrate_clients() {
# Unquoted on purpose: DOWN_CLIENTS is split on whitespace into client names.
104 for client in $DOWN_CLIENTS; do
# Presumably blocks until the host is reachable again.
105 wait_for_host $client
106 echo "Restarting $client"
# Remount the filesystem on the client; stop at the first failure.
107 zconf_mount $client $MOUNT || return 1
115 if [ "$MDSCOUNT" -gt 1 ]; then
117 for mds in `mds_list`; do
118 MDSDEV=$TMP/${mds}-`hostname`
119 add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1_svc
121 add_lov_to_lmv lov1 lmv1_svc --stripe_sz $STRIPE_BYTES \
122 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
125 add_mds mds1 --dev $MDSDEV --size $MDSSIZE
126 if [ ! -z "$mds1failover_HOST" ]; then
127 add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE
129 add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
130 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
134 for i in `seq $NUMOST`; do
135 dev=`printf $OSTDEV $i`
136 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
137 --journal-size $OSTJOURNALSIZE
140 add_client client $MDS --lov lov1 --path $MOUNT
147 for i in `seq $NUMOST`; do
149 start ost$i ${REFORMAT} $OSTLCONFARGS
151 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
152 for mds in `mds_list`; do
154 start $mds $MDSLCONFARGS ${REFORMAT}
156 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
157 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
162 zconf_umount $CLIENTS $MOUNT
164 for mds in `mds_list`; do
165 stop $mds ${FORCE} $MDSLCONFARGS || :
167 for i in `seq $NUMOST`; do
168 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
176 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
177 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
178 $PDSH $c touch $MOUNT/${c}_$file || return 1
184 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
185 $PDSH $c rm $MOUNT/${c}_$file
190 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
191 echo "$c mkdir $MOUNT/$c"
192 $PDSH $c "mkdir $MOUNT/$c"
193 $PDSH $c "ls -l $MOUNT/$c"
198 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
199 echo "rmdir $MOUNT/$c"
200 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
204 clients_recover_osts() {
206 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
212 for i in `seq $NUMOST`; do
213 ostvar="ost${i}_HOST"
214 if [ "${!ostvar}" == $node ]; then
219 echo "No ost found for node; $node"
226 if [ "$ONLY" == "cleanup" ]; then
231 if [ ! -z "$EVAL" ]; then
238 if [ "$ONLY" == "setup" ]; then
242 # Nine different failure-mode combinations
243 echo "Starting Test 17 at `date`"
248 echo "Waiting for df pid: $DFPID"
249 wait $DFPID || { echo "df returned $?" && return 1; }
253 echo "Waiting for df pid: $DFPID"
254 wait $DFPID || { echo "df returned $?" && return 2; }
258 echo "Waiting for df pid: $DFPID"
259 wait $DFPID || { echo "df returned $?" && return 3; }
262 run_test 0 "Fail all nodes, independently"
264 ############### First Failure Mode ###############
266 echo "Don't do a MDS - MDS Failure Case"
267 echo "This makes no sense"
269 run_test 1 "MDS/MDS failure"
270 ###################################################
272 ############### Second Failure Mode ###############
274 echo "Verify Lustre filesystem is up and running"
281 # prepare for MDS failover
292 echo "Reintegrating OST"
303 clients_recover_osts ost1
304 echo "Verify reintegration"
305 client_df || return 1
308 run_test 2 "Second Failure Mode: MDS/OST `date`"
309 ###################################################
312 ############### Third Failure Mode ###############
315 echo "Verify Lustre filesystem is up and running"
319 wait $DFPID || echo df failed: $?
322 echo "Test Lustre stability after MDS failover"
326 echo "Failing 2 CLIENTS"
330 echo "Test Lustre stability after CLIENT failure"
334 echo "Reintegrating CLIENTS"
335 reintegrate_clients || return 1
337 client_df || return 3
# Hand test 3 and its description to run_test (defined outside this excerpt).
# The `date` substitution is expanded when this line executes, so the
# description carries the time at which the test was submitted.
339 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
340 ###################################################
342 ############### Fourth Failure Mode ###############
344 echo "Fourth Failure Mode: OST/MDS `date`"
347 echo "Failing OST ost1"
351 echo "Test Lustre stability after OST failure"
359 # prepare for MDS failover
368 echo "Reintegrating OST"
379 clients_recover_osts ost1
380 echo "Test Lustre stability after MDS failover"
381 client_df || return 1
383 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
384 ###################################################
386 ############### Fifth Failure Mode ###############
388 echo "Fifth Failure Mode: OST/OST `date`"
391 echo "Verify Lustre filesystem is up and running"
400 echo "Test Lustre stability after OST failure"
409 echo "Test Lustre stability after OST failure"
413 echo "Reintegrating OSTs"
419 clients_recover_osts ost1
420 clients_recover_osts ost2
423 client_df || return 2
425 run_test 5 "Fifth Failure Mode: OST/OST `date`"
426 ###################################################
428 ############### Sixth Failure Mode ###############
430 echo "Sixth Failure Mode: OST/CLIENT `date`"
433 echo "Verify Lustre filesystem is up and running"
434 client_df || return 1
435 client_touch testfile || return 2
443 echo "Test Lustre stability after OST failure"
447 echo "Failing CLIENTs"
451 echo "Test Lustre stability after CLIENTs failure"
455 echo "Reintegrating OST/CLIENTs"
461 echo "Verifying mount"
462 client_df || return 3
464 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
465 ###################################################
468 ############### Seventh Failure Mode ###############
470 echo "Seventh Failure Mode: CLIENT/MDS `date`"
473 echo "Verify Lustre filesystem is up and running"
475 client_touch testfile || return 1
478 echo "Part 1: Failing CLIENT"
482 echo "Test Lustre stability after CLIENTs failure"
484 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
485 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
488 echo "Wait 1 minutes"
492 echo "Verify Lustre filesystem is up and running"
501 echo "Test Lustre stability after MDS failover"
502 wait $DFPID || echo "df on down clients fails " || return 1
503 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
504 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
507 echo "Reintegrating CLIENTs"
509 client_df || return 2
512 echo "wait 1 minutes"
515 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
516 ###################################################
519 ############### Eighth Failure Mode ###############
521 echo "Eighth Failure Mode: CLIENT/OST `date`"
524 echo "Verify Lustre filesystem is up and running"
526 client_touch testfile
529 echo "Failing CLIENTs"
533 echo "Test Lustre stability after CLIENTs failure"
535 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
536 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
539 echo "Wait 1 minutes"
543 echo "Verify Lustre filesystem is up and running"
545 client_touch testfile
554 echo "Test Lustre stability after OST failure"
556 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
557 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
560 echo "Reintegrating CLIENTs/OST"
564 client_df || return 1
565 client_touch testfile2 || return 2
568 echo "Wait 1 minutes"
571 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
572 ###################################################
575 ############### Ninth Failure Mode ###############
580 echo "Verify Lustre filesystem is up and running"
582 client_touch testfile || return 1
585 echo "Failing CLIENTs"
589 echo "Test Lustre stability after CLIENTs failure"
591 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
592 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
595 echo "Wait 1 minutes"
599 echo "Verify Lustre filesystem is up and running"
600 $PDSH $LIVE_CLIENT df $MOUNT || return 3
601 client_touch testfile || return 4
604 echo "Failing CLIENTs"
608 echo "Test Lustre stability after CLIENTs failure"
610 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
611 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
614 echo "Reintegrating CLIENTs/CLIENTs"
616 client_df || return 7
619 echo "Wait 1 minutes"
622 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
623 ###################################################
626 #Run availability after all failures
627 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default (NOTE: test description says 6 hours)
628 LOADTEST=${LOADTEST:-metadata-load.py}
629 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
631 run_test 10 "Running Availability for 6 hours..."
633 equals_msg "Done, cleaning up"