# Header and environment setup for the multiple-failure test script.
# NOTE(review): each line of this listing begins with a line number taken from
# the original file, and the gaps in that numbering show that some lines are
# not visible here.
2 # Test multiple failures, AKA Test 17
# Default LUSTRE to the parent of the directory that holds this script.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source the shared test framework from under LUSTRE.
7 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; the := form also stores the default path in CONFIG.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Test 10 plus anything listed in INSANITY_EXCEPT goes into ALWAYS_EXCEPT
# (presumably the framework skips these; the consumer is not visible here).
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
# NOTE(review): when SLOW is no, this appends only a single space to EXCEPT,
# so no additional test numbers are excluded.
16 [ "$SLOW" = "no" ] && EXCEPT="$EXCEPT "
# CLEANUP defaults to the empty string when unset or empty.
19 CLEANUP=${CLEANUP:-""}
# assert_env is defined outside this excerpt; presumably it aborts unless the
# named variables are set - TODO confirm.
23 assert_env mds_HOST MDS_MKFS_OPTS
24 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
25 assert_env LIVE_CLIENT FSNAME
28 # This can be a regexp, to allow more clients
# Default: output of comma_list (defined elsewhere) for the live, failable and extra clients.
29 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
# Round-robin selection of the next client to fail. The function header that
# presumably wraps the last three statements is not visible in this excerpt.
34 # fail clients round robin
36 # list of failable clients
# Word-split FAIL_CLIENTS into an array and record how many entries it has.
37 FAIL_LIST=($FAIL_CLIENTS)
38 FAIL_NUM=${#FAIL_LIST[*]}
41 DOWN_NUM=0 # number of nodes currently down
43 # set next client to fail
# Pick the entry at index FAIL_NEXT, then advance the index with wrap-around.
# The initial value of FAIL_NEXT is not visible in this excerpt.
# NOTE(review): the modulo divides by zero when FAIL_CLIENTS is empty.
45 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
46 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
47 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Fragment that takes one client down according to FAILURE_MODE. The enclosing
# function header, the assignment of client, and the closing done/fi lines are
# not visible in this excerpt.
# HARD mode: poll until the node stops answering ping.
52 if [ "$FAILURE_MODE" = HARD ]; then
# One echo request (-c 1) with a 3 second deadline (-w 3); all output is discarded.
54 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
55 echo "waiting for node $client to fail"
# SOFT mode: call zconf_umount with -f on the client mount point
# (presumably a forced unmount; the helper is defined elsewhere).
58 elif [ "$FAILURE_MODE" = SOFT ]; then
59 zconf_umount $client $MOUNT -f
# A second HARD-mode check; the statements it guards are not visible here.
65 if [ "$FAILURE_MODE" = HARD ]; then
# Fragment that fails up to num clients. The enclosing function header, the
# assignment of num and client, and the loop and branch terminators are not
# visible in this excerpt.
# Clamp num to the number of failable clients that are still up when it is
# empty or larger than that number.
72 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
73 num=$((FAIL_NUM - DOWN_NUM))
# Nothing left to fail; the statements guarded by this check are not visible.
76 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# Repeat num times: record the client as down and shut it down.
82 for i in `seq $num`; do
85 DOWN_CLIENTS="$DOWN_CLIENTS $client"
86 shutdown_client $client
89 echo "down clients: $DOWN_CLIENTS"
# Per-client follow-up loop; its body is not visible in this excerpt.
91 for client in $DOWN_CLIENTS; do
# Recompute the down count as the number of words in DOWN_CLIENTS.
94 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# reintegrate_clients: bring every client listed in DOWN_CLIENTS back.
# For each one it waits for the host, logs the restart and remounts MOUNT.
# Returns 1 as soon as a mount fails. The end of the loop and of the function
# is not visible in this excerpt.
98 reintegrate_clients() {
99 for client in $DOWN_CLIENTS; do
# wait_for_host is defined elsewhere; presumably it blocks until the node is reachable.
100 wait_for_host $client
101 echo "Restarting $client"
102 zconf_mount $client $MOUNT || return 1
# Start the OST facet numbered by the first argument, using the device name
# reported by ostdevname and the options in OST_MOUNT_OPTS. The enclosing
# function header is not visible in this excerpt.
109 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
# Create one file per client, named after the client plus the value of file,
# under MOUNT. Presumably the body of client_touch; the function header, the
# assignment of file and the loop terminator are not visible in this excerpt.
116 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# Skip clients that are currently down.
# NOTE(review): grep -q with an unquoted, unanchored pattern also matches when
# the client name is only a substring of a down client name.
117 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
# Run touch on the client through PDSH; return 1 if it fails.
118 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove the per-client file from every live and failable client. Unlike the
# touch loop, no check against DOWN_CLIENTS and no status check is visible.
# The function header and the loop terminator are not visible in this excerpt.
124 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
125 $PDSH $c rm $MOUNT/${c}_$file
# Have every live and failable client create, then list, a directory named
# after itself under MOUNT. Exit statuses are not checked. The function header
# and the loop terminator are not visible in this excerpt.
130 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
131 echo "$c mkdir $MOUNT/$c"
132 $PDSH $c "mkdir $MOUNT/$c"
133 $PDSH $c "ls -l $MOUNT/$c"
# Remove the per-client directories. The rmdir is always executed on
# LIVE_CLIENT rather than on the client that owns the directory. The function
# header and the loop terminator are not visible in this excerpt.
138 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
139 echo "rmdir $MOUNT/$c"
140 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# clients_recover_osts: called elsewhere in this file with an OST facet name.
# The only body line visible in this excerpt is the disabled command below;
# whatever else the function does cannot be seen here.
144 clients_recover_osts() {
# Disabled: would have run an lctl recover command on all CLIENTS via do_node.
146 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Top-level start of the run: call the framework helper cleanup_and_setup_lustre
# (defined elsewhere; presumably it resets and mounts the filesystem) and log
# the start time.
149 cleanup_and_setup_lustre
151 # 9 Different Failure Modes Combinations
152 echo "Starting Test 17 at `date`"
# Test 0: fail over the MDS and two OSTs one after another, waiting after each
# for the process whose pid is in DFPID (presumably a background df started on
# lines not visible in this excerpt).
# The MDS failover status is not checked, unlike the two OST failovers.
155 facet_failover $SINGLEMDS
156 echo "Waiting for df pid: $DFPID"
# On a non-zero wait status, report it and return 1.
157 wait $DFPID || { echo "df returned $?" && return 1; }
# ost1: return 4 if the failover fails, 2 if the waited process fails.
159 facet_failover ost1 || return 4
160 echo "Waiting for df pid: $DFPID"
161 wait $DFPID || { echo "df returned $?" && return 2; }
# ost2: return 5 if the failover fails, 3 if the waited process fails.
163 facet_failover ost2 || return 5
164 echo "Waiting for df pid: $DFPID"
165 wait $DFPID || { echo "df returned $?" && return 3; }
# Register the test with the framework under number 0.
168 run_test 0 "Fail all nodes, independently"
170 ############### First Failure Mode ###############
# Test 1: placeholder for the MDS/MDS case. The visible body only prints two
# messages explaining that the case is not exercised.
172 echo "Don't do a MDS - MDS Failure Case"
173 echo "This makes no sense"
175 run_test 1 "MDS/MDS failure"
176 ###################################################
178 ############### Second Failure Mode ###############
# Test 2: MDS failure combined with an OST failure. Only part of the body is
# visible in this excerpt (the OST shutdown, for example, is not shown).
180 echo "Verify Lustre filesystem is up and running"
# Take the MDS down and reboot it.
183 shutdown_facet $SINGLEMDS
184 reboot_facet $SINGLEMDS
186 # prepare for MDS failover
# Switch the active MDS node (helper defined elsewhere) and reboot the facet again.
187 change_active $SINGLEMDS
188 reboot_facet $SINGLEMDS
196 echo "Reintegrating OST"
# Restart OST 1; return 2 if that fails.
199 start_ost 1 || return 2
# Restart the MDS on the device reported by mdsdevname; propagate its status on failure.
202 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
206 clients_recover_osts ost1
207 echo "Verify reintegration"
208 client_df || return 1
# The date substitution in the description is expanded when this line runs.
211 run_test 2 "Second Failure Mode: MDS/OST `date`"
212 ###################################################
215 ############### Third Failure Mode ###############
# Test 3: MDS failover followed by client failures. Only part of the body is
# visible in this excerpt (the call that fails the clients is not shown).
218 echo "Verify Lustre filesystem is up and running"
221 facet_failover $SINGLEMDS
# A failed wait is only logged here; it does not abort the test.
222 wait $DFPID || echo df failed: $?
225 echo "Test Lustre stability after MDS failover"
229 echo "Failing 2 CLIENTS"
233 echo "Test Lustre stability after CLIENT failure"
237 echo "Reintegrating CLIENTS"
# Bring the down clients back; return 1 if that fails, 3 if the df check fails.
238 reintegrate_clients || return 1
240 client_df || return 3
241 sleep 2 # give it a little time to fully recover before the next test
# NOTE(review): Thirdb in the description below looks like a typo for Third.
243 run_test 3 "Thirdb Failure Mode: MDS/CLIENT `date`"
244 ###################################################
246 ############### Fourth Failure Mode ###############
# Test 4: OST failure followed by an MDS failure. Only part of the body is
# visible in this excerpt (the OST shutdown and restart are not shown).
248 echo "Fourth Failure Mode: OST/MDS `date`"
254 echo "Test Lustre stability after OST failure"
# Take the MDS down and reboot it.
260 shutdown_facet $SINGLEMDS
261 reboot_facet $SINGLEMDS
263 # prepare for MDS failover
264 change_active $SINGLEMDS
265 reboot_facet $SINGLEMDS
272 echo "Reintegrating OST"
# NOTE(review): the status of this MDS start is not checked here, whereas the
# equivalent start in the second failure mode is followed by a return on failure.
278 start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
283 clients_recover_osts ost1
284 echo "Test Lustre stability after MDS failover"
285 client_df || return 1
287 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
288 ###################################################
290 ############### Fifth Failure Mode ###############
# Test 5: two OST failures in a row. Only the log messages, the recovery calls
# and the final df check are visible in this excerpt; the statements that fail
# and restart the OSTs are not shown.
292 echo "Fifth Failure Mode: OST/OST `date`"
295 echo "Verify Lustre filesystem is up and running"
303 echo "Test Lustre stability after OST failure"
313 echo "Test Lustre stability after OST failure"
319 echo "Reintegrating OSTs"
325 clients_recover_osts ost1
326 clients_recover_osts ost2
# Return 2 if the df check after reintegration fails.
331 client_df || return 2
333 run_test 5 "Fifth Failure Mode: OST/OST `date`"
334 ###################################################
336 ############### Sixth Failure Mode ###############
# Test 6: OST failure followed by client failures. The statements that fail
# and reintegrate the OST and the clients are not visible in this excerpt.
338 echo "Sixth Failure Mode: OST/CLIENT `date`"
341 echo "Verify Lustre filesystem is up and running"
# Pre-checks: return 1 if df fails, 2 if creating testfile on the clients fails.
342 client_df || return 1
343 client_touch testfile || return 2
350 echo "Test Lustre stability after OST failure"
356 echo "Failing CLIENTs"
360 echo "Test Lustre stability after CLIENTs failure"
366 echo "Reintegrating OST/CLIENTs"
374 echo "Verifying mount"
# Return 3 if the df check after reintegration fails.
375 client_df || return 3
377 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
378 ###################################################
381 ############### Seventh Failure Mode ###############
# Test 7: client failures followed by an MDS failover. The statements that
# fail and reintegrate the clients, and the waits announced by the messages,
# are not visible in this excerpt.
383 echo "Seventh Failure Mode: CLIENT/MDS `date`"
386 echo "Verify Lustre filesystem is up and running"
388 client_touch testfile || return 1
391 echo "Part 1: Failing CLIENT"
395 echo "Test Lustre stability after CLIENTs failure"
# List the mount and remove the per-client test files from the live client;
# the statuses of these two commands are not checked.
397 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
398 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
401 echo "Wait 1 minutes"
405 echo "Verify Lustre filesystem is up and running"
410 facet_failover $SINGLEMDS
413 echo "Test Lustre stability after MDS failover"
# NOTE(review): the return 1 below only runs if the echo itself fails, so a
# failed wait is effectively just logged.
414 wait $DFPID || echo "df on down clients fails " || return 1
415 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
416 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
419 echo "Reintegrating CLIENTs"
421 client_df || return 2
424 echo "wait 1 minutes"
427 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
428 ###################################################
431 ############### Eighth Failure Mode ###############
# Test 8: client failures followed by an OST failure. The statements that fail
# and reintegrate the clients and the OST are not visible in this excerpt.
433 echo "Eighth Failure Mode: CLIENT/OST `date`"
436 echo "Verify Lustre filesystem is up and running"
# NOTE(review): the status of this client_touch, and of the one further down,
# is not checked, unlike the calls near the end of this test.
438 client_touch testfile
441 echo "Failing CLIENTs"
445 echo "Test Lustre stability after CLIENTs failure"
447 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
448 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
451 echo "Wait 1 minutes"
455 echo "Verify Lustre filesystem is up and running"
457 client_touch testfile
465 echo "Test Lustre stability after OST failure"
# The listing and cleanup below are disabled because, per the original note,
# they hang in non-failout mode.
469 #non-failout hangs forever here
470 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
471 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
474 echo "Reintegrating CLIENTs/OST"
# Post-checks: return 1 if df fails, 2 if creating testfile2 fails.
479 client_df || return 1
480 client_touch testfile2 || return 2
483 echo "Wait 1 minutes"
486 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
487 ###################################################
490 ############### Ninth Failure Mode ###############
# Test 9: two rounds of client failures. The statements that fail and
# reintegrate the clients are not visible in this excerpt. Each checked step
# has its own return code, except that the first two checks both return 1.
495 echo "Verify Lustre filesystem is up and running"
497 client_touch testfile || return 1
500 echo "Failing CLIENTs"
504 echo "Test Lustre stability after CLIENTs failure"
# Round 1: list the mount and remove the test files from the live client.
506 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
507 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
510 echo "Wait 1 minutes"
514 echo "Verify Lustre filesystem is up and running"
# Run df directly on the live client here rather than through client_df.
515 $PDSH $LIVE_CLIENT df $MOUNT || return 3
516 client_touch testfile || return 4
519 echo "Failing CLIENTs"
523 echo "Test Lustre stability after CLIENTs failure"
# Round 2: same listing and cleanup as round 1, with return codes 5 and 6.
525 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
526 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
529 echo "Reintegrating CLIENTs/CLIENTs"
531 client_df || return 7
534 echo "Wait 1 minutes"
537 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
538 ###################################################
# Test 10: run the availability script after the failure-mode tests. The
# enclosing function header is not visible in this excerpt.
541 #Run availability after all failures
# The default evaluates to 2 * 60 * 60 = 7200 (presumably seconds, i.e. 2 hours).
542 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default (this comment previously said 6 hours)
# LOADTEST is only assigned here; no use of it is visible in this excerpt.
543 LOADTEST=${LOADTEST:-metadata-load.py}
# Run availability.sh from the current working directory with the config path,
# the duration and the client list; return 1 if it fails.
544 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
# NOTE(review): the description says 6 hours, which does not match the default above.
546 run_test 10 "Running Availability for 6 hours..."
# End of the script: announce completion through equals_msg, call the framework
# helper check_and_cleanup_lustre (both defined elsewhere), then print the
# suite log if that file exists.
548 equals_msg `basename $0`: test complete, cleaning up
549 check_and_cleanup_lustre
# The trailing true keeps this line from yielding a non-zero status when the
# log file is absent or cat fails.
550 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true