2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script (default: parent directory)
# and pull in the shared test framework.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
# Source the cluster configuration; CONFIG may be overridden by the caller.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Tests always skipped: 10 (the long availability run) plus any
# site-specific exclusions passed in via $INSANITY_EXCEPT.
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
# With SLOW=no, clear the slow-test exclusion list.
16 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
19 CLEANUP=${CLEANUP:-""}
# Required environment: server hosts, mkfs options, devices, and clients.
23 assert_env mds_HOST MDS_MKFS_OPTS MDSDEV
24 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
25 assert_env LIVE_CLIENT FSNAME
28 # This can be a regexp, to allow more clients
29 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
34 # fail clients round robin
36 # list of failable clients
# Split the whitespace-separated FAIL_CLIENTS into an array so clients can
# be failed in round-robin order.
37 FAIL_LIST=($FAIL_CLIENTS)
38 FAIL_NUM=${#FAIL_LIST[*]}
41 DOWN_NUM=0 # number of nodes currently down
43 # set next client to fail
# Pick the current victim and advance the cursor, wrapping at FAIL_NUM.
# NOTE(review): the enclosing function definition is elided in this excerpt.
45 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
46 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
47 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Take a client node down.  HARD mode: wait (loop body elided) until the
# node stops answering pings; SOFT mode: just force-unmount Lustre on it.
52 if [ "$FAILURE_MODE" = HARD ]; then
54 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
55 echo "waiting for node $client to fail"
58 elif [ "$FAILURE_MODE" = SOFT ]; then
# -f forces the umount even if the filesystem is busy.
59 zconf_umount $client $MOUNT -f
# Fail up to $num clients; defaults to (and is capped at) the number of
# FAIL_CLIENTS that are still up.
65 if [ "$FAILURE_MODE" = HARD ]; then
# Clamp the request to the number of clients not already down.
72 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
73 num=$((FAIL_NUM - DOWN_NUM))
# Nothing (left) to fail: bail out early.
76 if [ -z "$num" ] || [ "$num" -le 0 ]; then
82 for i in `seq $num`; do
# Record each victim so reintegrate_clients can bring it back later.
85 DOWN_CLIENTS="$DOWN_CLIENTS $client"
86 shutdown_client $client
89 echo "down clients: $DOWN_CLIENTS"
91 for client in $DOWN_CLIENTS; do
# Recompute the down count from the authoritative DOWN_CLIENTS list.
94 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring every failed client back: wait for each node in DOWN_CLIENTS to
# respond again, then remount Lustre on it.  Returns 1 on a remount failure.
98 reintegrate_clients() {
99 for client in $DOWN_CLIENTS; do
100 wait_for_host $client
101 echo "Restarting $client"
102 zconf_mount $client $MOUNT || return 1
# Start OST number $1 on its device with the standard OST mount options.
109 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
# Have every client create its own marker file ${c}_$file in $MOUNT;
# clients currently in DOWN_CLIENTS are skipped.  Returns 1 on first failure.
116 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
117 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
118 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove each client's marker file; rm failures are deliberately ignored.
124 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
125 $PDSH $c rm $MOUNT/${c}_$file
# Each client creates (and lists) its own per-client directory in $MOUNT.
130 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
131 echo "$c mkdir $MOUNT/$c"
132 $PDSH $c "mkdir $MOUNT/$c"
133 $PDSH $c "ls -l $MOUNT/$c"
# Remove the per-client directories.  All rmdirs are issued from
# $LIVE_CLIENT, the one node guaranteed to be up.
138 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
139 echo "rmdir $MOUNT/$c"
140 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Trigger client-side recovery against the given OST facet.  The explicit
# per-OSC lctl "recover" command is disabled; kept for reference.
144 clients_recover_osts() {
146 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Top level: start every run from a clean, freshly set-up filesystem.
149 cleanup_and_setup_lustre
151 # 9 Different Failure Modes Combinations
152 echo "Starting Test 17 at `date`"
# A df was launched in the background (elided) as $DFPID; waiting for it
# proves clients recovered from the preceding failover.  Note: "$?" in the
# message expands to wait's exit status, i.e. the df result.
156 echo "Waiting for df pid: $DFPID"
157 wait $DFPID || { echo "df returned $?" && return 1; }
# Fail over OST 1 and verify clients recover.
159 facet_failover ost1 || return 4
160 echo "Waiting for df pid: $DFPID"
161 wait $DFPID || { echo "df returned $?" && return 2; }
# With more than one OST configured, repeat for OST 2.
163 if [ $OSTCOUNT -gt 1 ]; then
164 facet_failover ost2 || return 5
165 echo "Waiting for df pid: $DFPID"
166 wait $DFPID || { echo "df returned $?" && return 3; }
170 run_test 0 "Fail all nodes, independently"
172 ############### First Failure Mode ###############
# MDS/MDS double failure is intentionally a no-op: this configuration has a
# single MDS, so the combination cannot occur.
174 echo "Don't do a MDS - MDS Failure Case"
175 echo "This makes no sense"
177 run_test 1 "MDS/MDS failure"
178 ###################################################
180 ############### Second Failure Mode ###############
# Failure mode 2: fail the MDS while an OST is down (failure steps elided),
# then bring both back and verify the filesystem.
182 echo "Verify Lustre filesystem is up and running"
188 # prepare for MDS failover
198 echo "Reintegrating OST"
201 start_ost 1 || return 2
# Restart the MDS; propagate its exact exit status on failure.
204 start mds $MDSDEV $MDS_MOUNT_OPTS || return $?
# Let clients reconnect to the recovered OST, then verify the mount.
208 clients_recover_osts ost1
209 echo "Verify reintegration"
210 client_df || return 1
213 run_test 2 "Second Failure Mode: MDS/OST `date`"
214 ###################################################
217 ############### Third Failure Mode ###############
# Failure mode 3: fail the MDS, then fail clients while it recovers, then
# remount everything and verify the filesystem.
220 echo "Verify Lustre filesystem is up and running"
# Reap the background df; a failure here is only reported, not fatal.
224 wait $DFPID || echo df failed: $?
227 echo "Test Lustre stability after MDS failover"
231 echo "Failing 2 CLIENTS"
235 echo "Test Lustre stability after CLIENT failure"
# Remount the failed clients and confirm the mount is usable again.
239 echo "Reintegrating CLIENTS"
240 reintegrate_clients || return 1
242 client_df || return 3
244 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
245 ###################################################
247 ############### Fourth Failure Mode ###############
249 echo "Fourth Failure Mode: OST/MDS `date`"
# The OST is failed first (elided), then the MDS while the OST recovers.
255 echo "Test Lustre stability after OST failure"
264 # prepare for MDS failover
273 echo "Reintegrating OST"
# NOTE(review): unlike test 2, this mds restart is not error-checked.
279 start mds $MDSDEV $MDS_MOUNT_OPTS
284 clients_recover_osts ost1
285 echo "Test Lustre stability after MDS failover"
286 client_df || return 1
288 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
289 ###################################################
291 ############### Fifth Failure Mode ###############
# Failure mode 5 fails over BOTH ost1 and ost2, so it needs at least two
# OSTs.  (The previous "-lt 1" guard could never skip a runnable config and
# did not protect the ost2 failover below.)
293 [ $OSTCOUNT -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0
295 echo "Fifth Failure Mode: OST/OST `date`"
298 echo "Verify Lustre filesystem is up and running"
# First OST failure (elided); confirm the mount survives.
306 echo "Test Lustre stability after OST failure"
# Second OST failure (elided); confirm again.
316 echo "Test Lustre stability after OST failure"
322 echo "Reintegrating OSTs"
# Let clients reconnect to both recovered OSTs, then verify the mount.
328 clients_recover_osts ost1
329 clients_recover_osts ost2
334 client_df || return 2
336 run_test 5 "Fifth Failure Mode: OST/OST `date`"
337 ###################################################
339 ############### Sixth Failure Mode ###############
341 echo "Sixth Failure Mode: OST/CLIENT `date`"
# Baseline: the mount answers df and a test file can be created.
344 echo "Verify Lustre filesystem is up and running"
345 client_df || return 1
346 client_touch testfile || return 2
# OST failure (elided), then stability check.
353 echo "Test Lustre stability after OST failure"
359 echo "Failing CLIENTs"
363 echo "Test Lustre stability after CLIENTs failure"
# Bring the OST and the failed clients back, then re-verify the mount.
369 echo "Reintegrating OST/CLIENTs"
377 echo "Verifying mount"
378 client_df || return 3
380 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
381 ###################################################
384 ############### Seventh Failure Mode ###############
386 echo "Seventh Failure Mode: CLIENT/MDS `date`"
389 echo "Verify Lustre filesystem is up and running"
391 client_touch testfile || return 1
# Part 1: fail a client, then confirm the surviving client still works.
394 echo "Part 1: Failing CLIENT"
398 echo "Test Lustre stability after CLIENTs failure"
400 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
401 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
404 echo "Wait 1 minutes"
408 echo "Verify Lustre filesystem is up and running"
# Part 2 (elided): fail the MDS while the client is still down.
416 echo "Test Lustre stability after MDS failover"
# df on the downed clients is expected to fail, so the failure is only
# echoed.  NOTE(review): the trailing "|| return 1" is unreachable — echo
# always succeeds, so the second || can never fire.
417 wait $DFPID || echo "df on down clients fails " || return 1
418 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
419 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
422 echo "Reintegrating CLIENTs"
424 client_df || return 2
# Allow a settle period before the next failure mode.
427 echo "wait 1 minutes"
430 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
431 ###################################################
434 ############### Eighth Failure Mode ###############
436 echo "Eighth Failure Mode: CLIENT/OST `date`"
439 echo "Verify Lustre filesystem is up and running"
441 client_touch testfile
# Fail clients first; confirm the live client is unaffected.
444 echo "Failing CLIENTs"
448 echo "Test Lustre stability after CLIENTs failure"
450 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
451 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
454 echo "Wait 1 minutes"
458 echo "Verify Lustre filesystem is up and running"
460 client_touch testfile
# Then fail an OST (elided) while the clients are down.
468 echo "Test Lustre stability after OST failure"
# On a non-failout OST these ops would block until recovery completes, so
# they are left disabled.
472 #non-failout hangs forever here
473 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
474 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
477 echo "Reintegrating CLIENTs/OST"
# End-to-end check after reintegration: df plus a fresh file create.
482 client_df || return 1
483 client_touch testfile2 || return 2
486 echo "Wait 1 minutes"
489 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
490 ###################################################
493 ############### Ninth Failure Mode ###############
498 echo "Verify Lustre filesystem is up and running"
500 client_touch testfile || return 1
# First batch of client failures; the live client must keep working.
503 echo "Failing CLIENTs"
507 echo "Test Lustre stability after CLIENTs failure"
509 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
510 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
513 echo "Wait 1 minutes"
517 echo "Verify Lustre filesystem is up and running"
518 $PDSH $LIVE_CLIENT df $MOUNT || return 3
519 client_touch testfile || return 4
# Second batch of client failures while the first batch is still down.
522 echo "Failing CLIENTs"
526 echo "Test Lustre stability after CLIENTs failure"
528 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
529 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
# Remount all failed clients and verify the mount.
532 echo "Reintegrating CLIENTs/CLIENTs"
534 client_df || return 7
537 echo "Wait 1 minutes"
540 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
541 ###################################################
544 #Run availability after all failures
# Default run length is 2 hours; override with DURATION=<seconds>.  (The
# old comment and test description said "6 hours", contradicting the
# $((2 * 60 * 60)) default; the text now reflects the actual value.)
545 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default
546 LOADTEST=${LOADTEST:-metadata-load.py}
547 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
549 run_test 10 "Running Availability for $((${DURATION:-7200} / 3600)) hours..."
# Final summary and teardown: report completion, tear down the filesystem,
# and dump the suite log if one was produced.
551 equals_msg `basename $0`: test complete, cleaning up
552 check_and_cleanup_lustre
# "|| true" keeps the script's exit status 0 when no log file exists.
553 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true