2 # Test multiple failures, AKA Test 17
# Root of the Lustre tree; defaults to the parent of this script's directory
# unless LUSTRE is already set in the environment.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source test-framework.sh from the Lustre tests directory.
7 . $LUSTRE/tests/test-framework.sh
# Source the test configuration. ":=" assigns the insanity-local.sh default
# back to CONFIG when CONFIG is unset or empty.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# assert_env is defined outside this view; presumably it fails unless every
# listed variable is set -- TODO confirm against test-framework.sh.
17 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
19 # This can be a regexp, to allow more clients
# Default client set: LIVE_CLIENT plus FAIL_CLIENTS, passed through the
# comma_list helper (defined elsewhere). An existing CLIENTS value wins.
20 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS`"}
# The same hosts as a space-separated word list.
22 CLIENTLIST="$LIVE_CLIENT $FAIL_CLIENTS"
27 # fail clients round robin
29 # list of failable clients
# Word-split FAIL_CLIENTS into an array so clients can be addressed by index.
30 FAIL_LIST=($FAIL_CLIENTS)
# Total number of failable clients.
31 FAIL_NUM=${#FAIL_LIST[*]}
33 DOWN_NUM=0 # number of nodes currently down
35 # return next client to fail
# Take the client at the current round-robin position.
37 ret=${FAIL_LIST[$FAIL_NEXT]}
# Advance the position, wrapping to 0 after the last failable client.
# NOTE(review): this divides by zero when FAIL_CLIENTS is empty (FAIL_NUM=0).
38 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
44 if [ "$FAILURE_MODE" = HARD ]; then
46 elif [ "$FAILURE_MODE" = SOFT ]; then
# SOFT failure mode: force an lconf cleanup of the client's configuration on
# the client node. The option is spelled "--cleanup"; the previous "--clenaup"
# was a typo.
47 $PDSH $client $LCONF --cleanup --force --nomod $XMLCONFIG
53 if [ "$FAILURE_MODE" = HARD ]; then
# Clamp the requested count: with no count given, or one larger than the
# number of failable clients still up, use every remaining client.
# The tests are separate and quoted on purpose: the old single "[ ... -o ... ]"
# with an unquoted $num made "[" fail with a syntax error when num was empty,
# so the default was never applied.
60 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
61 num=$((FAIL_NUM - DOWN_NUM))
# Guard against an empty or non-positive count after clamping.
64 if [ -z "$num" ] || [ "$num" -le 0 ]; then
68 for i in `seq $num`; do
70 DOWN_CLIENTS="$DOWN_CLIENTS $client"
72 shutdown_client $client
75 for client in $DOWN_CLIENTS; do
78 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
79 $PDSH $LIVE_CLIENT "cd $MOUNT && rmdir $CLIENTLIST"
82 reintegrate_clients() {
83 for client in $DOWN_CLIENTS; do
85 $PDSH $client "$LCONF --node client --select mds_svc=`facet_active mds` $CLIENTOPTS $XMLCONFIG"
93 add_mds mds --dev $MDSDEV --size $MDSSIZE
95 if [ ! -z "$mdsfailover_HOST" ]; then
96 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
99 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
100 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
101 add_ost ost1 --lov lov1 --dev $OSTDEV --size $OSTSIZE
102 add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
103 add_client client mds --lov lov1 --path $MOUNT
108 start ost1 ${REFORMAT} $OSTLCONFARGS
110 start ost2 ${REFORMAT} $OSTLCONFARGS
111 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
113 start mds $MDSLCONFARGS ${REFORMAT}
114 while ! $PDSH $HOST "ls -ld $LUSTRE"; do sleep 5; done
115 do_node $CLIENTS lconf --node client_facet \
116 --select mds_service=$ACTIVEMDS $XMLCONFIG
120 # make sure we are using the primary MDS, so the config log will
121 # be able to clean up properly.
122 activemds=`facet_active mds`
123 # if [ $activemds != "mds" ]; then
126 for node in $CLIENTS; do
127 do_node $node lconf ${FORCE} --select mds_svc=${activemds}_facet --cleanup --node client_facet $XMLCONFIG || true
130 stop mds ${FORCE} $MDSLCONFARGS
132 stop ost2 ${FORCE} --dump cleanup.log
138 $PDSH $CLIENTS "mkdir $MOUNT/\`hostname\`; ls $MOUNT/\`hostname\` > /dev/null"
141 clients_recover_osts() {
143 $PDSH $CLIENTS "$LCTL "'--device %OSC_`hostname`_OST_'"${facet}_svc_MNT_client recover"
146 if [ "$ONLY" == "cleanup" ]; then
154 if [ "$ONLY" == "setup" ]; then
158 # 9 Different Failure Modes Combinations
159 echo "Starting Test 17 at `date`"
164 wait $DFPID || return 1
168 wait $DFPID || return 2
172 wait $DFPID || return 3
175 run_test 0 "Fail all nodes, independently"
177 ############### First Failure Mode ###############
179 echo "Don't do a MDS - MDS Failure Case"
180 echo "This makes no sense"
181 # FIXME every test makes sense
183 run_test 1 "MDS/MDS failure"
184 ###################################################
186 ############### Second Failure Mode ###############
188 echo "Verify Lustre filesystem is up and running"
195 # prepare for MDS failover
206 echo "Reintegrating OST"
217 clients_recover_osts ost1
218 echo "Verify reintegration"
222 run_test 2 "Second Failure Mode: MDS/OST `date`"
223 ###################################################
226 ############### Third Failure Mode ###############
229 echo "Verify Lustre filesystem is up and running"
233 wait $DFPID || echo df failed: $?
236 echo "Test Lustre stability after MDS failover"
240 echo "Failing 2 CLIENTS"
244 echo "Test Lustre stability after CLIENT failure"
248 echo "Reintegrating CLIENTS"
253 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
254 ###################################################
256 ############### Fourth Failure Mode ###############
258 echo "Fourth Failure Mode: OST/MDS `date`"
261 echo "Failing OST ost1"
265 echo "Test Lustre stability after OST failure"
273 # prepare for MDS failover
282 echo "Reintegrating OST"
293 clients_recover_osts ost1
294 echo "Test Lustre stability after MDS failover"
297 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
298 ###################################################
300 ############### Fifth Failure Mode ###############
302 echo "Fifth Failure Mode: OST/OST `date`"
305 echo "Verify Lustre filesystem is up and running"
314 echo "Test Lustre stability after OST failure"
323 echo "Test Lustre stability after OST failure"
327 echo "Reintegrating OSTs"
333 clients_recover_osts ost1
334 clients_recover_osts ost2
337 run_test 5 "Fifth Failure Mode: OST/OST `date`"
338 ###################################################
340 ############### Sixth Failure Mode ###############
342 echo "Sixth Failure Mode: OST/CLIENT `date`"
345 echo "Verify Lustre filesystem is up and running"
347 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
355 echo "Test Lustre stability after OST failure"
359 echo "Failing CLIENTs"
363 echo "Test Lustre stability after CLIENTs failure"
367 echo "Reintegrating OST/CLIENTs"
372 echo "Verifying mount"
375 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
376 ###################################################
379 ############### Seventh Failure Mode ###############
381 echo "Seventh Failure Mode: CLIENT/MDS `date`"
384 echo "Verify Lustre filesystem is up and running"
386 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
389 echo "Part 1: Failing CLIENT"
393 echo "Test Lustre stability after CLIENTs failure"
395 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
396 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
399 echo "Wait 1 minutes"
403 echo "Verify Lustre filesystem is up and running"
405 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
412 echo "Test Lustre stability after MDS failover"
414 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
415 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
418 echo "Reintegrating CLIENTs"
423 echo "wait 1 minutes"
426 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
427 ###################################################
430 ############### Eighth Failure Mode ###############
432 echo "Eighth Failure Mode: CLIENT/OST `date`"
435 echo "Verify Lustre filesystem is up and running"
437 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
440 echo "Failing CLIENTs"
444 echo "Test Lustre stability after CLIENTs failure"
446 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
447 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
450 echo "Wait 1 minutes"
454 echo "Verify Lustre filesystem is up and running"
456 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
464 echo "Test Lustre stability after OST failure"
466 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
467 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
470 echo "Reintegrating CLIENTs/OST"
474 $PDSH $CLIENTS "/bin/touch $MOUNT/CLIENT_OST_2\`hostname\`_testfile"
477 echo "Wait 1 minutes"
480 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
481 ###################################################
484 ############### Ninth Failure Mode ###############
489 echo "Verify Lustre filesystem is up and running"
491 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
494 echo "Failing CLIENTs"
498 echo "Test Lustre stability after CLIENTs failure"
500 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
501 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
504 echo "Wait 1 minutes"
508 echo "Verify Lustre filesystem is up and running"
510 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
513 echo "Failing CLIENTs"
517 echo "Test Lustre stability after CLIENTs failure"
519 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
520 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
523 echo "Reintegrating CLIENTs/CLIENTs"
528 echo "Wait 1 minutes"
531 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
532 ###################################################
535 # Run availability.sh after all failure modes
536 ./availability.sh 21600
538 run_test 10 "Running Availability for 6 hours..."
540 equals_msg "Done, cleaning up"