2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree (default: the parent of this script's directory)
# and source the shared test framework from it.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
# Source the configuration file. ${CONFIG:=...} also assigns the default
# path ($LUSTRE/tests/cfg/$NAME.sh) to CONFIG when it is unset or empty.
11 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Exception list: entry 10 plus whatever INSANITY_EXCEPT supplies.
# NOTE(review): presumably read by the test framework to skip tests -- confirm.
14 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
# With FAILURE_MODE=HARD, and only if mixed_ost_devs succeeds, print a notice
# and append tests 0 2 4 5 6 8 to ALWAYS_EXCEPT. The steps are chained with
# &&, so a failing step stops the rest of the chain.
16 if [ "$FAILURE_MODE" = "HARD" ]; then
17 mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \
18 echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " && \
19 echo "Except the tests: $CONFIG_EXCEPTIONS" && \
20 ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
# When SLOW is exactly "no", clear EXCEPT_SLOW.
24 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
# CLEANUP defaults to the empty string when unset or empty.
27 CLEANUP=${CLEANUP:-""}
# Client roles, each overridable from the environment:
#   SINGLECLIENT - defaults to $HOSTNAME
#   LIVE_CLIENT  - defaults to $SINGLECLIENT
#   FAIL_CLIENTS - defaults to $RCLIENTS
31 SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME}
32 LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT}
33 FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS}
# Check the variables this suite depends on.
# NOTE(review): assert_env is defined outside this file; presumably it aborts
# when one of the named variables is unset -- confirm.
35 assert_env mds_HOST MDS_MKFS_OPTS
36 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
37 assert_env LIVE_CLIENT FSNAME
# If either require_dsh_* check fails, leave the script with exit status 0.
39 require_dsh_mds || exit 0
40 require_dsh_ost || exit 0
42 # FAIL_CLIENTS list should not contain the LIVE_CLIENT
# insanity_exclude_client NAME [WORD...]
# Print, space-separated, every WORD that is not exactly equal to NAME.
# Names are compared as literal strings, so dots in host names are not
# treated as regex wildcards, names that merely contain NAME are kept, and
# repeated occurrences of NAME are all removed.
insanity_exclude_client() {
	local excluded=$1 kept="" word
	shift
	for word in "$@"; do
		[ "$word" = "$excluded" ] || kept="$kept $word"
	done
	printf '%s\n' "$kept"
}
# $FAIL_CLIENTS is left unquoted on purpose: it is a whitespace-separated list
# and must be split into one argument per client.
FAIL_CLIENTS=$(insanity_exclude_client "$LIVE_CLIENT" $FAIL_CLIENTS)
46 TESTDIR=$DIR/d0.$(basename $0 .sh)
49 # fail clients round robin
51 # list of failable clients
52 FAIL_LIST=($FAIL_CLIENTS)
53 FAIL_NUM=${#FAIL_LIST[*]}
56 DOWN_NUM=0 # number of nodes currently down
58 # set next client to fail
60 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
61 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
62 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
68 log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM"
69 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
70 num=$((FAIL_NUM - DOWN_NUM))
73 if [ -z "$num" ] || [ "$num" -le 0 ]; then
74 log "No clients failed!"
80 for i in `seq $num`; do
83 DOWN_CLIENTS="$DOWN_CLIENTS $client"
84 shutdown_client $client
87 echo "down clients: $DOWN_CLIENTS"
89 for client in $DOWN_CLIENTS; do
92 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
96 reintegrate_clients() {
97 for client in $DOWN_CLIENTS; do
99 echo "Restarting $client"
100 zconf_mount $client $MOUNT || return 1
107 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
114 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# Skip clients that are currently down. Whole, space-delimited names are
# compared, so "client1" is not skipped merely because "client10" is down and
# dots in host names are not interpreted as regex wildcards. The unquoted
# $DOWN_CLIENTS inside echo collapses any run of whitespace to single spaces.
case " $(echo $DOWN_CLIENTS) " in
*" $c "*) continue ;;
esac
116 $PDSH $c touch $TESTDIR/${c}_$file || return 1
122 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
123 $PDSH $c rm $TESTDIR/${c}_$file
128 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
129 echo "$c mkdir $TESTDIR/$c"
130 $PDSH $c "mkdir $TESTDIR/$c && ls -l $TESTDIR/$c"
135 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
136 echo "rmdir $TESTDIR/$c"
137 $PDSH $LIVE_CLIENT "rmdir $TESTDIR/$c"
141 clients_recover_osts() {
143 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
146 check_and_setup_lustre
151 # 9 Different Failure Modes Combinations
152 echo "Starting Test 17 at `date`"
157 for i in $(seq $OSTCOUNT) ; do
162 run_test 0 "Fail all nodes, independently"
164 ############### First Failure Mode ###############
166 echo "Don't do a MDS - MDS Failure Case"
167 echo "This makes no sense"
169 run_test 1 "MDS/MDS failure"
170 ###################################################
172 ############### Second Failure Mode ###############
174 echo "Verify Lustre filesystem is up and running"
175 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
179 shutdown_facet $SINGLEMDS
180 reboot_facet $SINGLEMDS
182 # prepare for MDS failover
183 change_active $SINGLEMDS
184 reboot_facet $SINGLEMDS
192 echo "Reintegrating OST"
195 start_ost 1 || return 2
198 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
202 clients_recover_osts ost1
203 echo "Verify reintegration"
204 clients_up || return 1
207 run_test 2 "Second Failure Mode: MDS/OST `date`"
208 ###################################################
211 ############### Third Failure Mode ###############
214 echo "Verify Lustre filesystem is up and running"
215 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
221 echo "Test Lustre stability after MDS failover"
225 echo "Failing 2 CLIENTS"
229 echo "Test Lustre stability after CLIENT failure"
233 echo "Reintegrating CLIENTS"
234 reintegrate_clients || return 1
236 clients_up || return 3
237 sleep 2 # give it a little time for fully recovered before next test
# Run test 3: the MDS/CLIENT failure combination.
run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
240 ###################################################
242 ############### Fourth Failure Mode ###############
244 echo "Fourth Failure Mode: OST/MDS `date`"
250 echo "Test Lustre stability after OST failure"
256 shutdown_facet $SINGLEMDS
257 reboot_facet $SINGLEMDS
259 # prepare for MDS failover
260 change_active $SINGLEMDS
261 reboot_facet $SINGLEMDS
268 echo "Reintegrating OST"
274 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
279 clients_recover_osts ost1
280 echo "Test Lustre stability after MDS failover"
281 clients_up || return 1
283 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
284 ###################################################
286 ############### Fifth Failure Mode ###############
288 [ $OSTCOUNT -lt 2 ] && skip_env "$OSTCOUNT < 2, not enough OSTs" && return 0
290 echo "Fifth Failure Mode: OST/OST `date`"
293 echo "Verify Lustre filesystem is up and running"
294 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
303 echo "Test Lustre stability after OST failure"
313 echo "Test Lustre stability after OST failure"
319 echo "Reintegrating OSTs"
325 clients_recover_osts ost1
326 clients_recover_osts ost2
331 clients_up || return 2
333 run_test 5 "Fifth Failure Mode: OST/OST `date`"
334 ###################################################
336 ############### Sixth Failure Mode ###############
338 echo "Sixth Failure Mode: OST/CLIENT `date`"
341 echo "Verify Lustre filesystem is up and running"
342 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
345 client_touch testfile || return 2
352 echo "Test Lustre stability after OST failure"
359 echo "Failing CLIENTs"
363 echo "Test Lustre stability after CLIENTs failure"
370 echo "Reintegrating OST/CLIENTs"
373 reintegrate_clients || return 1
376 wait_remote_prog "stat -f" $((TIMEOUT * 3 + 20))
380 echo "Verifying mount"
381 [ -z "$(mounted_lustre_filesystems)" ] && return 3
384 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
385 ###################################################
388 ############### Seventh Failure Mode ###############
390 echo "Seventh Failure Mode: CLIENT/MDS `date`"
393 echo "Verify Lustre filesystem is up and running"
394 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
397 client_touch testfile || return 1
400 echo "Part 1: Failing CLIENT"
404 echo "Test Lustre stability after CLIENTs failure"
406 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
407 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
410 echo "Wait 1 minutes"
414 echo "Verify Lustre filesystem is up and running"
415 [ -z "$(mounted_lustre_filesystems)" ] && return 2
423 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
424 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
427 echo "Reintegrating CLIENTs"
428 reintegrate_clients || return 2
432 echo "wait 1 minutes"
435 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
436 ###################################################
439 ############### Eighth Failure Mode ###############
441 echo "Eighth Failure Mode: CLIENT/OST `date`"
444 echo "Verify Lustre filesystem is up and running"
445 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
448 client_touch testfile
451 echo "Failing CLIENTs"
455 echo "Test Lustre stability after CLIENTs failure"
457 $PDSH $LIVE_CLIENT "ls -l $TESTDIR"
458 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
461 echo "Wait 1 minutes"
465 echo "Verify Lustre filesystem is up and running"
466 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
469 client_touch testfile
477 echo "Test Lustre stability after OST failure"
481 #non-failout hangs forever here
482 #$PDSH $LIVE_CLIENT "ls -l $TESTDIR"
483 #$PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile"
486 echo "Reintegrating CLIENTs/OST"
487 reintegrate_clients || return 3
491 clients_up || return 1
492 client_touch testfile2 || return 2
495 echo "Wait 1 minutes"
498 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
499 ###################################################
502 ############### Ninth Failure Mode ###############
507 echo "Verify Lustre filesystem is up and running"
508 [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
511 client_touch testfile || return 1
514 echo "Failing CLIENTs"
518 echo "Test Lustre stability after CLIENTs failure"
520 $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 1
521 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 2
524 echo "Wait 1 minutes"
528 echo "Verify Lustre filesystem is up and running"
529 client_up $LIVE_CLIENT || return 3
530 client_touch testfile || return 4
533 echo "Failing CLIENTs"
537 echo "Test Lustre stability after CLIENTs failure"
539 $PDSH $LIVE_CLIENT "ls -l $TESTDIR" || return 5
540 $PDSH $LIVE_CLIENT "rm -f $TESTDIR/*_testfile" || return 6
543 echo "Reintegrating CLIENTs/CLIENTs"
544 reintegrate_clients || return 7
548 echo "Wait 1 minutes"
551 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
552 ###################################################
555 # Run the availability script after all failure modes have been exercised.
# NOTE(review): the default below evaluates to 2 * 60 * 60 = 7200, while the
# run_test description for this test advertises 6 hours -- confirm which one
# is intended.
556 DURATION=${DURATION:-$((2 * 60 * 60))} # default 7200 (2 hours, assuming the unit is seconds)
# LOADTEST is not referenced again in the visible lines.
# NOTE(review): presumably consumed by availability.sh -- confirm.
557 LOADTEST=${LOADTEST:-metadata-load.py}
# A non-zero status from availability.sh makes the enclosing function return 1.
558 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
560 run_test 10 "Running Availability for 6 hours..."
562 equals_msg `basename $0`: test complete, cleaning up
563 check_and_cleanup_lustre
# insanity_report_suite_log LOGFILE
# If LOGFILE is a regular file, print it to stdout and return non-zero when it
# contains the string FAIL. Return 0 when the file does not exist or contains
# no FAIL. The path is quoted throughout so names with whitespace work.
insanity_report_suite_log() {
	local logfile=$1
	[ -f "$logfile" ] || return 0
	cat -- "$logfile"
	! grep -q FAIL -- "$logfile"
}
# Exit with status 1 when the suite log records a failure; otherwise the
# script finishes with status 0.
insanity_report_suite_log "$TESTSUITELOG" || exit 1