2 # Test multiple failures, AKA Test 17
6 LUSTRE=${LUSTRE:-`dirname $0`/..} # default: parent of the directory containing this script
7 . $LUSTRE/tests/test-framework.sh # source the test framework script from the Lustre tree
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh} # assign CONFIG its default if unset or empty, then source it
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" # test 10 plus whatever INSANITY_EXCEPT lists; presumably read by the framework to skip tests - confirm
16 CLEANUP=${CLEANUP:-""} # stays empty unless the caller provides a value
20 assert_env mds_HOST MDS_MKFS_OPTS # assert_env is defined outside this file; presumably aborts when a listed variable is unset - confirm
21 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
22 assert_env LIVE_CLIENT FSNAME
25 # This can be a regexp, to allow more clients
26 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
31 # fail clients round robin
33 # list of failable clients
34 FAIL_LIST=($FAIL_CLIENTS)
35 FAIL_NUM=${#FAIL_LIST[*]}
38 DOWN_NUM=0 # number of nodes currently down
40 # set next client to fail
42 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
43 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
44 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
49 if [ "$FAILURE_MODE" = HARD ]; then
51 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
52 echo "waiting for node $client to fail"
55 elif [ "$FAILURE_MODE" = SOFT ]; then
56 zconf_umount $client $MOUNT -f
62 if [ "$FAILURE_MODE" = HARD ]; then
69 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
70 num=$((FAIL_NUM - DOWN_NUM))
73 if [ -z "$num" ] || [ "$num" -le 0 ]; then
79 for i in `seq $num`; do
82 DOWN_CLIENTS="$DOWN_CLIENTS $client"
83 shutdown_client $client
86 echo "down clients: $DOWN_CLIENTS"
88 for client in $DOWN_CLIENTS; do
91 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
95 reintegrate_clients() {
96 for client in $DOWN_CLIENTS; do
98 echo "Restarting $client"
99 zconf_mount $client $MOUNT || return 1
106 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
113 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
114 if echo $DOWN_CLIENTS | tr ' ' '\n' | grep -qxF -- "$c"; then continue; fi # skip clients that are down; whole-name fixed-string match so "node1" is not skipped merely because "node10" is down
115 $PDSH $c touch $MOUNT/${c}_$file || return 1
121 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
122 $PDSH $c rm $MOUNT/${c}_$file
127 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
128 echo "$c mkdir $MOUNT/$c"
129 $PDSH $c "mkdir $MOUNT/$c"
130 $PDSH $c "ls -l $MOUNT/$c"
135 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
136 echo "rmdir $MOUNT/$c"
137 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
141 clients_recover_osts() {
143 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
146 cleanup_and_setup_lustre
148 # 9 Different Failure Modes Combinations
149 echo "Starting Test 17 at `date`"
152 facet_failover $SINGLEMDS
153 echo "Waiting for df pid: $DFPID"
154 wait $DFPID || { echo "df returned $?" && return 1; }
156 facet_failover ost1 || return 4
157 echo "Waiting for df pid: $DFPID"
158 wait $DFPID || { echo "df returned $?" && return 2; }
160 facet_failover ost2 || return 5
161 echo "Waiting for df pid: $DFPID"
162 wait $DFPID || { echo "df returned $?" && return 3; }
165 run_test 0 "Fail all nodes, independently"
167 ############### First Failure Mode ###############
169 echo "Don't do a MDS - MDS Failure Case"
170 echo "This makes no sense"
172 run_test 1 "MDS/MDS failure"
173 ###################################################
175 ############### Second Failure Mode ###############
177 echo "Verify Lustre filesystem is up and running"
180 shutdown_facet $SINGLEMDS
181 reboot_facet $SINGLEMDS
183 # prepare for MDS failover
184 change_active $SINGLEMDS
185 reboot_facet $SINGLEMDS
193 echo "Reintegrating OST"
196 start_ost 1 || return 2
199 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
203 clients_recover_osts ost1
204 echo "Verify reintegration"
205 client_df || return 1
208 run_test 2 "Second Failure Mode: MDS/OST `date`"
209 ###################################################
212 ############### Third Failure Mode ###############
215 echo "Verify Lustre filesystem is up and running"
218 facet_failover $SINGLEMDS
219 wait $DFPID || echo df failed: $?
222 echo "Test Lustre stability after MDS failover"
226 echo "Failing 2 CLIENTS"
230 echo "Test Lustre stability after CLIENT failure"
234 echo "Reintegrating CLIENTS"
235 reintegrate_clients || return 1
237 client_df || return 3
238 sleep 2 # give it a little time for fully recovered before next test
240 run_test 3 "Third Failure Mode: MDS/CLIENT `date`" # register test 3; the date substitution is expanded once, when this line runs
241 ###################################################
243 ############### Fourth Failure Mode ###############
245 echo "Fourth Failure Mode: OST/MDS `date`"
251 echo "Test Lustre stability after OST failure"
257 shutdown_facet $SINGLEMDS
258 reboot_facet $SINGLEMDS
260 # prepare for MDS failover
261 change_active $SINGLEMDS
262 reboot_facet $SINGLEMDS
269 echo "Reintegrating OST"
275 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
280 clients_recover_osts ost1
281 echo "Test Lustre stability after MDS failover"
282 client_df || return 1
284 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
285 ###################################################
287 ############### Fifth Failure Mode ###############
289 echo "Fifth Failure Mode: OST/OST `date`"
292 echo "Verify Lustre filesystem is up and running"
300 echo "Test Lustre stability after OST failure"
310 echo "Test Lustre stability after OST failure"
316 echo "Reintegrating OSTs"
322 clients_recover_osts ost1
323 clients_recover_osts ost2
328 client_df || return 2
330 run_test 5 "Fifth Failure Mode: OST/OST `date`"
331 ###################################################
333 ############### Sixth Failure Mode ###############
335 echo "Sixth Failure Mode: OST/CLIENT `date`"
338 echo "Verify Lustre filesystem is up and running"
339 client_df || return 1
340 client_touch testfile || return 2
347 echo "Test Lustre stability after OST failure"
353 echo "Failing CLIENTs"
357 echo "Test Lustre stability after CLIENTs failure"
363 echo "Reintegrating OST/CLIENTs"
371 echo "Verifying mount"
372 client_df || return 3
374 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
375 ###################################################
378 ############### Seventh Failure Mode ###############
380 echo "Seventh Failure Mode: CLIENT/MDS `date`"
383 echo "Verify Lustre filesystem is up and running"
385 client_touch testfile || return 1
388 echo "Part 1: Failing CLIENT"
392 echo "Test Lustre stability after CLIENTs failure"
394 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
395 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
398 echo "Wait 1 minutes"
402 echo "Verify Lustre filesystem is up and running"
407 facet_failover $SINGLEMDS
410 echo "Test Lustre stability after MDS failover"
411 wait $DFPID || echo "df on down clients fails " || return 1 # NOTE(review): echo normally exits 0, so this return 1 is effectively unreachable and a df failure is only logged - confirm that is intended
412 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
413 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
416 echo "Reintegrating CLIENTs"
418 client_df || return 2
421 echo "wait 1 minutes"
424 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
425 ###################################################
428 ############### Eighth Failure Mode ###############
430 echo "Eighth Failure Mode: CLIENT/OST `date`"
433 echo "Verify Lustre filesystem is up and running"
435 client_touch testfile
438 echo "Failing CLIENTs"
442 echo "Test Lustre stability after CLIENTs failure"
444 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
445 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
448 echo "Wait 1 minutes"
452 echo "Verify Lustre filesystem is up and running"
454 client_touch testfile
462 echo "Test Lustre stability after OST failure"
466 #non-failout hangs forever here
467 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
468 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
471 echo "Reintegrating CLIENTs/OST"
476 client_df || return 1
477 client_touch testfile2 || return 2
480 echo "Wait 1 minutes"
483 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
484 ###################################################
487 ############### Ninth Failure Mode ###############
492 echo "Verify Lustre filesystem is up and running"
494 client_touch testfile || return 1
497 echo "Failing CLIENTs"
501 echo "Test Lustre stability after CLIENTs failure"
503 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
504 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
507 echo "Wait 1 minutes"
511 echo "Verify Lustre filesystem is up and running"
512 $PDSH $LIVE_CLIENT df $MOUNT || return 3
513 client_touch testfile || return 4
516 echo "Failing CLIENTs"
520 echo "Test Lustre stability after CLIENTs failure"
522 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
523 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
526 echo "Reintegrating CLIENTs/CLIENTs"
528 client_df || return 7
531 echo "Wait 1 minutes"
534 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
535 ###################################################
538 #Run availability after all failures
539 DURATION=${DURATION:-$((2 * 60 * 60))} # default is 7200 s = 2 hours; NOTE(review): the run_test 10 title says 6 hours - confirm which is intended
540 LOADTEST=${LOADTEST:-metadata-load.py}
541 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
543 run_test 10 "Running Availability for 6 hours..."
545 equals_msg `basename $0`: test complete, cleaning up
546 check_and_cleanup_lustre
547 [ -f "$TESTSUITELOG" ] && cat "$TESTSUITELOG" || true # print the suite log if it exists; the trailing true keeps this line's status 0 either way