2 # Test multiple failures, AKA Test 17
# Default LUSTRE to the parent of this script's directory unless the caller already set it.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source the shared test framework from the LUSTRE tree.
7 . $LUSTRE/tests/test-framework.sh
# Source the test configuration; ${CONFIG:=...} also assigns the default path to CONFIG when it is unset or empty.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# NOTE(review): assert_env is defined outside this section; presumably it verifies that these variables are set — confirm.
17 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
19 # This can be a regexp, to allow more clients
# Unless the caller set CLIENTS, build it with comma_list from the live client plus the failable clients.
20 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS`"}
# Space-separated form of the same two inputs.
22 CLIENTLIST="$LIVE_CLIENT $FAIL_CLIENTS"
27 # fail clients round robin
29 # list of failable clients
# Word-split FAIL_CLIENTS into a bash array so a client can be picked by index.
30 FAIL_LIST=($FAIL_CLIENTS)
# Number of entries in FAIL_LIST.
31 FAIL_NUM=${#FAIL_LIST[*]}
33 DOWN_NUM=0 # number of nodes currently down
35 # return next client to fail
# Pick the FAIL_LIST entry at the current round-robin index.
37 ret=${FAIL_LIST[$FAIL_NEXT]}
# Advance the index, wrapping back to 0 after the last entry.
# NOTE(review): the modulo is an arithmetic error when FAIL_NUM is 0 — confirm callers never get here with an empty FAIL_LIST.
38 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
44 if [ "$FAILURE_MODE" = HARD ]; then
46 elif [ "$FAILURE_MODE" = SOFT ]; then
# SOFT failure mode: run an lconf cleanup for the client through $PDSH.
# The option is spelled --cleanup, as in the client cleanup loop's lconf call.
47 $PDSH $client $LCONF --cleanup --force --nomod $XMLCONFIG
53 if [ "$FAILURE_MODE" = HARD ]; then
# Cap the request: with no count given, or a count larger than the number of
# failable clients still up, use every failable client that is not already down.
60 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
61 num=$((FAIL_NUM - DOWN_NUM))
# Guard for the case where there is nothing left to fail.
64 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# Take down $num clients one at a time, recording each one in DOWN_CLIENTS.
68 for i in `seq $num`; do
70 DOWN_CLIENTS="$DOWN_CLIENTS $client"
72 shutdown_client $client
# Second pass over every client recorded as down.
75 for client in $DOWN_CLIENTS; do
# Recompute the down count from the list itself (word count of DOWN_CLIENTS).
78 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# From the live client, rmdir the entries under $MOUNT named by the words of CLIENTLIST.
79 $PDSH $LIVE_CLIENT "cd $MOUNT && rmdir $CLIENTLIST"
# For each client recorded in DOWN_CLIENTS, run lconf on that client via $PDSH.
82 reintegrate_clients() {
83 for client in $DOWN_CLIENTS; do
# facet_active is evaluated locally (backquotes inside double quotes), so the
# remote command receives the name of the MDS that is active right now.
85 $PDSH $client "$LCONF --node client --select mds_svc=`facet_active mds` $CLIENTOPTS $XMLCONFIG"
# NOTE(review): the add_* helpers are defined outside this section; the notes below only describe the arguments passed.
# MDS entry named "mds", using device $MDSDEV and size $MDSSIZE.
93 add_mds mds --dev $MDSDEV --size $MDSSIZE
# Declare a failover MDS only when mdsfailover_HOST is non-empty.
95 if [ ! -z "$mdsfailover_HOST" ]; then
96 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
# One LOV (lov1) on mds with the configured stripe size and count, stripe pattern 0.
99 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
100 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
# Two OSTs in lov1; the second uses the first's device name with a "-2" suffix.
101 add_ost ost1 --lov lov1 --dev $OSTDEV --size $OSTSIZE
102 add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE
# Client entry tied to mds and lov1, with path $MOUNT.
103 add_client client mds --lov lov1 --path $MOUNT
# Start both OSTs first, passing ${REFORMAT} and the OST lconf arguments.
108 start ost1 ${REFORMAT} $OSTLCONFARGS
110 start ost2 ${REFORMAT} $OSTLCONFARGS
# Start the lctl debug daemon only when DAEMONFILE is non-empty.
111 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
113 start mds $MDSLCONFARGS ${REFORMAT}
# Poll every 5 seconds until $CHECKSTAT on $HOST reports $LUSTRE as a directory.
114 while ! do_node $HOST "$CHECKSTAT -t dir $LUSTRE"; do sleep 5; done
# Run lconf for the client_facet node on the clients.
# NOTE(review): this call selects with "mds_service=" while the other lconf calls in this script use "mds_svc=" — confirm both spellings are accepted.
115 do_node $CLIENTS lconf --node client_facet \
116 --select mds_service=$ACTIVEMDS $XMLCONFIG
120 # make sure we are using the primary MDS, so the config log will
121 # be able to clean up properly.
# Record which MDS facet_active currently reports.
122 activemds=`facet_active mds`
123 # if [ $activemds != "mds" ]; then
# Client-side lconf cleanup; "|| true" keeps a failed client cleanup from aborting the teardown.
# NOTE(review): CLIENTS defaults to the output of comma_list; if that is comma-separated, this loop runs once with the whole list as $node — confirm that is intended.
126 for node in $CLIENTS; do
127 do_node $node lconf ${FORCE} --select mds_svc=${activemds}_facet --cleanup --node client_facet $XMLCONFIG || true
# Stop the servers after the clients.
130 stop mds ${FORCE} $MDSLCONFARGS
132 stop ost2 ${FORCE} --dump cleanup.log
# On every client, create a directory under $MOUNT named after that client's own hostname, then list it.
# The escaped backquotes defer `hostname` to the remote shell; $MOUNT is expanded locally.
138 $PDSH $CLIENTS "mkdir $MOUNT/\`hostname\`; ls $MOUNT/\`hostname\` > /dev/null"
# Run "lctl --device %OSC_<hostname>_<facet>_svc_MNT_client_facet recover" on every client.
# NOTE(review): $facet is assigned on a line outside this section; callers pass an OST name such as ost1.
141 clients_recover_osts() {
# Quoting: $LCTL and ${facet} sit in double quotes and expand locally; the
# `hostname` part is single-quoted so it reaches the remote shell unexpanded.
143 $PDSH $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
146 if [ "$ONLY" == "cleanup" ]; then
154 if [ "$ONLY" == "setup" ]; then
158 # 9 Different Failure Mode Combinations
159 echo "Starting Test 17 at `date`"
164 wait $DFPID || return 1
168 wait $DFPID || return 2
172 wait $DFPID || return 3
175 run_test 0 "Fail all nodes, independently"
177 ############### First Failure Mode ###############
179 echo "Don't do a MDS - MDS Failure Case"
180 echo "This makes no sense"
181 # FIXME every test makes sense
183 run_test 1 "MDS/MDS failure"
184 ###################################################
186 ############### Second Failure Mode ###############
188 echo "Verify Lustre filesystem is up and running"
195 # prepare for MDS failover
206 echo "Reintegrating OST"
217 clients_recover_osts ost1
218 echo "Verify reintegration"
219 client_df || return 1
222 run_test 2 "Second Failure Mode: MDS/OST `date`"
223 ###################################################
226 ############### Third Failure Mode ###############
229 echo "Verify Lustre filesystem is up and running"
233 wait $DFPID || echo df failed: $?
236 echo "Test Lustre stability after MDS failover"
240 echo "Failing 2 CLIENTS"
244 echo "Test Lustre stability after CLIENT failure"
248 echo "Reintegrating CLIENTS"
251 client_df || return 1
# Label for test 3, named like the other failure-mode tests; `date` is expanded when this line runs.
253 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
254 ###################################################
256 ############### Fourth Failure Mode ###############
258 echo "Fourth Failure Mode: OST/MDS `date`"
261 echo "Failing OST ost1"
265 echo "Test Lustre stability after OST failure"
273 # prepare for MDS failover
282 echo "Reintegrating OST"
293 clients_recover_osts ost1
294 echo "Test Lustre stability after MDS failover"
295 client_df || return 1
297 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
298 ###################################################
300 ############### Fifth Failure Mode ###############
302 echo "Fifth Failure Mode: OST/OST `date`"
305 echo "Verify Lustre filesystem is up and running"
314 echo "Test Lustre stability after OST failure"
323 echo "Test Lustre stability after OST failure"
327 echo "Reintegrating OSTs"
333 clients_recover_osts ost1
334 clients_recover_osts ost2
336 client_df || return 1
338 run_test 5 "Fifth Failure Mode: OST/OST `date`"
339 ###################################################
341 ############### Sixth Failure Mode ###############
343 echo "Sixth Failure Mode: OST/CLIENT `date`"
346 echo "Verify Lustre filesystem is up and running"
347 client_df || return 1
348 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile" || return 2
356 echo "Test Lustre stability after OST failure"
360 echo "Failing CLIENTs"
364 echo "Test Lustre stability after CLIENTs failure"
368 echo "Reintegrating OST/CLIENTs"
374 echo "Verifying mount"
375 client_df || return 3
377 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
378 ###################################################
381 ############### Seventh Failure Mode ###############
383 echo "Seventh Failure Mode: CLIENT/MDS `date`"
386 echo "Verify Lustre filesystem is up and running"
388 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
391 echo "Part 1: Failing CLIENT"
395 echo "Test Lustre stability after CLIENTs failure"
397 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
398 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
401 echo "Wait 1 minutes"
405 echo "Verify Lustre filesystem is up and running"
407 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
414 echo "Test Lustre stability after MDS failover"
416 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
417 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
420 echo "Reintegrating CLIENTs"
422 client_df || return 1
425 echo "wait 1 minutes"
428 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
429 ###################################################
432 ############### Eighth Failure Mode ###############
434 echo "Eighth Failure Mode: CLIENT/OST `date`"
437 echo "Verify Lustre filesystem is up and running"
439 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
442 echo "Failing CLIENTs"
446 echo "Test Lustre stability after CLIENTs failure"
448 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
449 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
452 echo "Wait 1 minutes"
456 echo "Verify Lustre filesystem is up and running"
458 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
466 echo "Test Lustre stability after OST failure"
468 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
469 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
472 echo "Reintegrating CLIENTs/OST"
475 client_df || return 1
476 $PDSH $CLIENTS "/bin/touch $MOUNT/CLIENT_OST_2\`hostname\`_testfile" || return 2
479 echo "Wait 1 minutes"
482 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
483 ###################################################
486 ############### Ninth Failure Mode ###############
491 echo "Verify Lustre filesystem is up and running"
493 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile"
496 echo "Failing CLIENTs"
500 echo "Test Lustre stability after CLIENTs failure"
502 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
503 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
506 echo "Wait 1 minutes"
510 echo "Verify Lustre filesystem is up and running"
511 client_df || return 3
512 $PDSH $CLIENTS "/bin/touch $MOUNT/\`hostname\`_testfile" || return 4
515 echo "Failing CLIENTs"
519 echo "Test Lustre stability after CLIENTs failure"
521 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
522 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
525 echo "Reintegrating CLIENTs/CLIENTs"
527 client_df || return 7
530 echo "Wait 1 minutes"
533 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
534 ###################################################
537 #Run availability after all failures
# 21600 / 3600 = 6, matching the "6 hours" in the test description; presumably the argument is in seconds — confirm against availability.sh.
538 ./availability.sh 21600
540 run_test 10 "Running Availability for 6 hours..."
542 equals_msg "Done, cleaning up"