2 # Test multiple failures, AKA Test 17
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
20 assert_env mds_HOST MDS_MKFS_OPTS MDSDEV
21 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
22 assert_env LIVE_CLIENT FSNAME
25 # This can be a regexp, to allow more clients
26 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
31 # fail clients round robin
33 # list of failable clients
34 FAIL_LIST=($FAIL_CLIENTS) # unquoted expansion: each whitespace-separated word of FAIL_CLIENTS becomes one array element
35 FAIL_NUM=${#FAIL_LIST[*]} # number of entries in FAIL_LIST
38 DOWN_NUM=0 # number of nodes currently down
40 # set next client to fail
42 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]} # pick the entry at the current round-robin index
43 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM )) # advance the index, wrapping to 0 after the last entry; errors with division by zero when FAIL_NUM is 0
44 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
49 if [ "$FAILURE_MODE" = HARD ]; then
51 while ping -w 3 -c 1 $client > /dev/null 2>&1; do # keep looping while the node still answers a single ping (3 s deadline)
52 echo "waiting for node $client to fail"
55 elif [ "$FAILURE_MODE" = SOFT ]; then
56 zconf_umount $client $MOUNT -f # SOFT mode: unmount on the client; -f is presumably a forced unmount -- TODO confirm against zconf_umount
62 if [ "$FAILURE_MODE" = HARD ]; then
69 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then # no count given, or count exceeds FAIL_NUM - DOWN_NUM
70 num=$((FAIL_NUM - DOWN_NUM)) # clamp to the failable clients that are not already down
73 if [ -z "$num" ] || [ "$num" -le 0 ]; then # nothing left to fail
79 for i in `seq $num`; do
82 DOWN_CLIENTS="$DOWN_CLIENTS $client" # space-separated list, grows by one host per iteration
83 shutdown_client $client
86 echo "down clients: $DOWN_CLIENTS"
88 for client in $DOWN_CLIENTS; do
91 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w` # recount the down clients as the number of words in the list
95 reintegrate_clients() { # walks every host listed in DOWN_CLIENTS and mounts $MOUNT on it again
96 for client in $DOWN_CLIENTS; do
98 echo "Restarting $client"
99 zconf_mount $client $MOUNT || return 1 # stop at the first client whose mount fails
106 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
115 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done # retry every 5 s until the ls issued through do_node succeeds
116 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT # mount only when $MOUNT is not listed in /proc/mounts; note grep reads the local file while the mount targets $CLIENTS
120 zconf_umount $CLIENTS $MOUNT
129 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
130 if echo " $(echo $DOWN_CLIENTS) " | grep -qF -- " $c "; then continue; fi # skip down hosts; literal whole-entry match so node1 is not skipped when only node10 is down
131 $PDSH $c touch $MOUNT/${c}_$file || return 1
137 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
138 $PDSH $c rm $MOUNT/${c}_$file
143 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
144 echo "$c mkdir $MOUNT/$c"
145 $PDSH $c "mkdir $MOUNT/$c"
146 $PDSH $c "ls -l $MOUNT/$c"
151 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
152 echo "rmdir $MOUNT/$c"
153 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
157 clients_recover_osts() {
159 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
162 if [ "$ONLY" == "cleanup" ]; then
167 if [ ! -z "$EVAL" ]; then
174 if [ "$ONLY" == "setup" ]; then
178 # 9 Different Failure Modes Combinations
179 echo "Starting Test 17 at `date`"
183 echo "Waiting for df pid: $DFPID"
184 wait $DFPID || { echo "df returned $?" && return 1; }
186 facet_failover ost1 || return 4
187 echo "Waiting for df pid: $DFPID"
188 wait $DFPID || { echo "df returned $?" && return 2; }
190 if [ $OSTCOUNT -gt 1 ]; then
191 facet_failover ost2 || return 5
192 echo "Waiting for df pid: $DFPID"
193 wait $DFPID || { echo "df returned $?" && return 3; }
197 run_test 0 "Fail all nodes, independently"
199 ############### First Failure Mode ###############
201 echo "Don't do a MDS - MDS Failure Case"
202 echo "This makes no sense"
204 run_test 1 "MDS/MDS failure"
205 ###################################################
207 ############### Second Failure Mode ###############
209 echo "Verify Lustre filesystem is up and running"
215 # prepare for MDS failover
225 echo "Reintegrating OST"
228 start_ost 1 || return 2
231 start mds $MDSDEV $MDS_MOUNT_OPTS || return $?
235 clients_recover_osts ost1
236 echo "Verify reintegration"
237 client_df || return 1
240 run_test 2 "Second Failure Mode: MDS/OST `date`"
241 ###################################################
244 ############### Third Failure Mode ###############
247 echo "Verify Lustre filesystem is up and running"
251 wait $DFPID || echo df failed: $?
254 echo "Test Lustre stability after MDS failover"
258 echo "Failing 2 CLIENTS"
262 echo "Test Lustre stability after CLIENT failure"
266 echo "Reintegrating CLIENTS"
267 reintegrate_clients || return 1
269 client_df || return 3
271 run_test 3 "Third Failure Mode: MDS/CLIENT `date`" # description for test 3; the date is expanded when this line is evaluated
272 ###################################################
274 ############### Fourth Failure Mode ###############
276 echo "Fourth Failure Mode: OST/MDS `date`"
282 echo "Test Lustre stability after OST failure"
291 # prepare for MDS failover
300 echo "Reintegrating OST"
306 start mds $MDSDEV $MDS_MOUNT_OPTS
311 clients_recover_osts ost1
312 echo "Test Lustre stability after MDS failover"
313 client_df || return 1
315 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
316 ###################################################
318 ############### Fifth Failure Mode ###############
320 [ "$OSTCOUNT" -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0 # this OST/OST test recovers both ost1 and ost2, so it needs at least two OSTs
322 echo "Fifth Failure Mode: OST/OST `date`"
325 echo "Verify Lustre filesystem is up and running"
333 echo "Test Lustre stability after OST failure"
343 echo "Test Lustre stability after OST failure"
349 echo "Reintegrating OSTs"
355 clients_recover_osts ost1
356 clients_recover_osts ost2
361 client_df || return 2
363 run_test 5 "Fifth Failure Mode: OST/OST `date`"
364 ###################################################
366 ############### Sixth Failure Mode ###############
368 echo "Sixth Failure Mode: OST/CLIENT `date`"
371 echo "Verify Lustre filesystem is up and running"
372 client_df || return 1
373 client_touch testfile || return 2
380 echo "Test Lustre stability after OST failure"
386 echo "Failing CLIENTs"
390 echo "Test Lustre stability after CLIENTs failure"
396 echo "Reintegrating OST/CLIENTs"
404 echo "Verifying mount"
405 client_df || return 3
407 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
408 ###################################################
411 ############### Seventh Failure Mode ###############
413 echo "Seventh Failure Mode: CLIENT/MDS `date`"
416 echo "Verify Lustre filesystem is up and running"
418 client_touch testfile || return 1
421 echo "Part 1: Failing CLIENT"
425 echo "Test Lustre stability after CLIENTs failure"
427 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
428 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
431 echo "Wait 1 minutes"
435 echo "Verify Lustre filesystem is up and running"
443 echo "Test Lustre stability after MDS failover"
444 wait $DFPID || echo "df on down clients fails " || return 1 # NOTE(review): echo normally succeeds, so the final return 1 is effectively never reached and a failed wait is only logged -- confirm intent
445 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
446 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
449 echo "Reintegrating CLIENTs"
451 client_df || return 2
454 echo "wait 1 minutes"
457 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
458 ###################################################
461 ############### Eighth Failure Mode ###############
463 echo "Eighth Failure Mode: CLIENT/OST `date`"
466 echo "Verify Lustre filesystem is up and running"
468 client_touch testfile
471 echo "Failing CLIENTs"
475 echo "Test Lustre stability after CLIENTs failure"
477 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
478 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
481 echo "Wait 1 minutes"
485 echo "Verify Lustre filesystem is up and running"
487 client_touch testfile
495 echo "Test Lustre stability after OST failure"
499 #non-failout hangs forever here
500 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
501 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
504 echo "Reintegrating CLIENTs/OST"
509 client_df || return 1
510 client_touch testfile2 || return 2
513 echo "Wait 1 minutes"
516 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
517 ###################################################
520 ############### Ninth Failure Mode ###############
525 echo "Verify Lustre filesystem is up and running"
527 client_touch testfile || return 1
530 echo "Failing CLIENTs"
534 echo "Test Lustre stability after CLIENTs failure"
536 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
537 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
540 echo "Wait 1 minutes"
544 echo "Verify Lustre filesystem is up and running"
545 $PDSH $LIVE_CLIENT df $MOUNT || return 3
546 client_touch testfile || return 4
549 echo "Failing CLIENTs"
553 echo "Test Lustre stability after CLIENTs failure"
555 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
556 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
559 echo "Reintegrating CLIENTs/CLIENTs"
561 client_df || return 7
564 echo "Wait 1 minutes"
567 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
568 ###################################################
571 # Run availability after all failures
572 DURATION=${DURATION:-$((2 * 60 * 60))} # default is 2 * 60 * 60 = 7200, i.e. 2 hours if DURATION is in seconds; NOTE(review): the run_test 10 description says 6 hours -- confirm which is intended
573 LOADTEST=${LOADTEST:-metadata-load.py} # not referenced on the lines visible here; presumably read by availability.sh -- TODO confirm
574 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
576 run_test 10 "Running Availability for 6 hours..."
578 equals_msg `basename $0`: test complete, cleaning up
580 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true