lustre/tests/insanity.sh

   1 #!/bin/sh
   2 # Test multiple failures, AKA Test 17
   3
   4 set -e
   5
   6 LUSTRE=${LUSTRE:-`dirname $0`/..}
   7 . $LUSTRE/tests/test-framework.sh
   8
   9 init_test_env $@
  10
  11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
  12
  13 ALWAYS_EXCEPT="10"
  14
  15 SETUP=${SETUP:-"setup"}
  16 CLEANUP=${CLEANUP:-"cleanup"}
  17
  18 build_test_filter
  19
  20 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
  21
  22 ####
  23 # Initialize all the ostN_HOST
  24 NUMOST=2
  25 if [ "$EXTRA_OSTS" ]; then
  26     for host in $EXTRA_OSTS; do
  27         NUMOST=$((NUMOST + 1))
  28         OST=ost$NUMOST
  29         eval ${OST}_HOST=$host
  30     done
  31 fi
  32
  33 # This can be a regexp, to allow more clients
  34 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
  35
  36 DIR=${DIR:-$MOUNT}
  37
  38 #####
  39 # fail clients round robin
  40
  41 # list of failable clients
  42 FAIL_LIST=($FAIL_CLIENTS)
  43 FAIL_NUM=${#FAIL_LIST[*]}
  44 FAIL_NEXT=0
  45 typeset -i  FAIL_NEXT
  46 DOWN_NUM=0   # number of nodes currently down
  47
  48 # set next client to fail
  49 set_fail_client() {
  50     FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
  51     FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
  52     echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
  53 }
  54
  55 shutdown_client() {
  56     client=$1
  57     if [ "$FAILURE_MODE" = HARD ]; then
  58        $POWER_DOWN $client
  59        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
  60            echo "waiting for node $client to fail"
  61            sleep 1
  62        done
  63     elif [ "$FAILURE_MODE" = SOFT ]; then
  64        zconf_umount $client $MOUNT -f
  65     fi
  66 }
  67
  68 reboot_node() {
  69     NODE=$1
  70     if [ "$FAILURE_MODE" = HARD ]; then
  71        $POWER_UP $NODE
  72     fi
  73 }
  74
  75 fail_clients() {
  76     num=$1
  77     if [ -z "$num"  ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
  78         num=$((FAIL_NUM - DOWN_NUM))
  79     fi
  80
  81     if [ -z "$num" ] || [ "$num" -le 0 ]; then
  82         return
  83     fi
  84
  85     client_mkdirs
  86
  87     for i in `seq $num`; do
  88        set_fail_client
  89        client=$FAIL_CLIENT
  90        DOWN_CLIENTS="$DOWN_CLIENTS $client"
  91        shutdown_client $client
  92     done
  93
  94     echo "down clients: $DOWN_CLIENTS"
  95
  96     for client in $DOWN_CLIENTS; do
  97         reboot_node $client
  98     done
  99     DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
 100     client_rmdirs
 101 }
 102
 103 reintegrate_clients() {
 104     for client in $DOWN_CLIENTS; do
 105         wait_for_host $client
 106         echo "Restarting $client"
 107         zconf_mount $client $MOUNT || return 1
 108     done
 109     DOWN_CLIENTS=""
 110     DOWN_NUM=0
 111 }
 112
 113 gen_config() {
 114     rm -f $XMLCONFIG
 115     add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
 116
 117     if [ ! -z "$mdsfailover_HOST" ]; then
 118          add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
 119     fi
 120
 121     add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
 122         --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
 123     for i in `seq $NUMOST`; do
 124         dev=`printf $OSTDEV $i`
 125         add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
 126             --journal-size $OSTJOURNALSIZE
 127     done
 128
 129
 130     add_client client mds --lov lov1 --path $MOUNT
 131 }
 132
 133 setup() {
 134     gen_config
 135
 136     rm -rf logs/*
 137     for i in `seq $NUMOST`; do
 138         wait_for ost$i
 139         start ost$i ${REFORMAT} $OSTLCONFARGS
 140     done
 141     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
 142     wait_for mds
 143     start mds $MDSLCONFARGS ${REFORMAT}
 144     while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
 145     grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
 146
 147 }
 148
 149 cleanup() {
 150     zconf_umount $CLIENTS $MOUNT
 151
 152     stop mds ${FORCE} $MDSLCONFARGS || :
 153     for i in `seq $NUMOST`; do
 154         stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS  || :
 155     done
 156 }
 157
 158 trap exit INT
 159
 160 client_touch() {
 161     file=$1
 162     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 163         if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
 164         $PDSH $c touch $MOUNT/${c}_$file || return 1
 165     done
 166 }
 167
 168 client_rm() {
 169     file=$1
 170     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 171         $PDSH $c rm $MOUNT/${c}_$file
 172     done
 173 }
 174
 175 client_mkdirs() {
 176     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 177         echo "$c mkdir $MOUNT/$c"
 178         $PDSH $c "mkdir $MOUNT/$c"
 179         $PDSH $c "ls -l $MOUNT/$c"
 180     done
 181 }
 182
 183 client_rmdirs() {
 184     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 185         echo "rmdir $MOUNT/$c"
 186         $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
 187     done
 188 }
 189
 190 clients_recover_osts() {
 191     facet=$1
 192 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
 193 }
 194
 195 node_to_ost() {
 196     node=$1
 197     retvar=$2
 198     for i in `seq $NUMOST`; do
 199         ostvar="ost${i}_HOST"
 200         if [ "${!ostvar}" == $node ]; then
 201             eval $retvar=ost${i}
 202             return 0
 203         fi
 204     done
 205     echo "No ost found for node; $node"
 206     return 1
 207
 208 }
 209
 210
 211
 212 if [ "$ONLY" == "cleanup" ]; then
 213     $CLEANUP
 214     exit
 215 fi
 216
 217 if [ ! -z "$EVAL" ]; then
 218     eval "$EVAL"
 219     exit $?
 220 fi
 221
 222 $SETUP
 223
 224 if [ "$ONLY" == "setup" ]; then
 225     exit 0
 226 fi
 227
 228 # 9 Different Failure Modes Combinations
 229 echo "Starting Test 17 at `date`"
 230
 231 test_0() {
 232     echo "Failover MDS"
 233     facet_failover mds
 234     echo "Waiting for df pid: $DFPID"
 235     wait $DFPID || echo "df returned $?" && return 1
 236
 237     echo "Failing OST1"
 238     facet_failover ost1
 239     echo "Waiting for df pid: $DFPID"
 240     wait $DFPID || echo "df returned $?" && return 2
 241
 242     echo "Failing OST2"
 243     facet_failover ost2
 244     echo "Waiting for df pid: $DFPID"
 245     wait $DFPID || echo "df returned $?" && return 3
 246     return 0
 247 }
 248 run_test 0 "Fail all nodes, independently"
 249
 250 ############### First Failure Mode ###############
 251 test_1() {
 252 echo "Don't do a MDS - MDS Failure Case"
 253 echo "This makes no sense"
 254 }
 255 run_test 1 "MDS/MDS failure"
 256 ###################################################
 257
 258 ############### Second Failure Mode ###############
 259 test_2() {
 260     echo "Verify Lustre filesystem is up and running"
 261     client_df
 262
 263     echo "Failing MDS"
 264     shutdown_facet mds
 265     reboot_facet mds
 266
 267     # prepare for MDS failover
 268     change_active mds
 269     reboot_facet mds
 270
 271     client_df &
 272     DFPID=$!
 273     sleep 5
 274
 275     echo "Failing OST"
 276     shutdown_facet ost1
 277
 278     echo "Reintegrating OST"
 279     reboot_facet ost1
 280     wait_for ost1
 281     start ost1
 282
 283     echo "Failover MDS"
 284     wait_for mds
 285     start mds
 286
 287     #Check FS
 288     wait $DFPID
 289     clients_recover_osts ost1
 290     echo "Verify reintegration"
 291     client_df || return 1
 292
 293 }
 294 run_test 2 "Second Failure Mode: MDS/OST `date`"
 295 ###################################################
 296
 297
 298 ############### Third Failure Mode ###############
 299 test_3() {
 300     #Create files
 301     echo "Verify Lustre filesystem is up and running"
 302
 303     #MDS Portion
 304     facet_failover mds
 305     wait $DFPID || echo df failed: $?
 306     #Check FS
 307
 308     echo "Test Lustre stability after MDS failover"
 309     client_df
 310
 311     #CLIENT Portion
 312     echo "Failing 2 CLIENTS"
 313     fail_clients 2
 314
 315     #Check FS
 316     echo "Test Lustre stability after CLIENT failure"
 317     client_df
 318
 319     #Reintegration
 320     echo "Reintegrating CLIENTS"
 321     reintegrate_clients || return 1
 322
 323     client_df || return 3
 324 }
 325 run_test 3  "Thirdb Failure Mode: MDS/CLIENT `date`"
 326 ###################################################
 327
 328 ############### Fourth Failure Mode ###############
 329 test_4() {
 330     echo "Fourth Failure Mode: OST/MDS `date`"
 331
 332     #OST Portion
 333     echo "Failing OST ost1"
 334     shutdown_facet ost1
 335
 336     #Check FS
 337     echo "Test Lustre stability after OST failure"
 338     client_df
 339
 340     #MDS Portion
 341     echo "Failing MDS"
 342     shutdown_facet mds
 343     reboot_facet mds
 344
 345     # prepare for MDS failover
 346     change_active mds
 347     reboot_facet mds
 348
 349     client_df &
 350     DFPID=$!
 351     sleep 5
 352
 353     #Reintegration
 354     echo "Reintegrating OST"
 355     reboot_facet ost1
 356     wait_for ost1
 357     start ost1
 358
 359     echo "Failover MDS"
 360     wait_for mds
 361     start mds
 362     #Check FS
 363
 364     wait $DFPID
 365     clients_recover_osts ost1
 366     echo "Test Lustre stability after MDS failover"
 367     client_df || return 1
 368 }
 369 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
 370 ###################################################
 371
 372 ############### Fifth Failure Mode ###############
 373 test_5() {
 374     echo "Fifth Failure Mode: OST/OST `date`"
 375
 376     #Create files
 377     echo "Verify Lustre filesystem is up and running"
 378     client_df
 379
 380     #OST Portion
 381     echo "Failing OST"
 382     shutdown_facet ost1
 383     reboot_facet ost1
 384
 385     #Check FS
 386     echo "Test Lustre stability after OST failure"
 387     client_df
 388
 389     #OST Portion
 390     echo "Failing OST"
 391     shutdown_facet ost2
 392     reboot_facet ost2
 393
 394     #Check FS
 395     echo "Test Lustre stability after OST failure"
 396     client_df
 397
 398     #Reintegration
 399     echo "Reintegrating OSTs"
 400     wait_for ost1
 401     start ost1
 402     wait_for ost2
 403     start ost2
 404
 405     clients_recover_osts ost1
 406     clients_recover_osts ost2
 407     sleep $TIMEOUT
 408
 409     client_df || return 2
 410 }
 411 run_test 5 "Fifth Failure Mode: OST/OST `date`"
 412 ###################################################
 413
 414 ############### Sixth Failure Mode ###############
 415 test_6() {
 416     echo "Sixth Failure Mode: OST/CLIENT `date`"
 417
 418     #Create files
 419     echo "Verify Lustre filesystem is up and running"
 420     client_df || return 1
 421     client_touch testfile || return 2
 422
 423     #OST Portion
 424     echo "Failing OST"
 425     shutdown_facet ost1
 426     reboot_facet ost1
 427
 428     #Check FS
 429     echo "Test Lustre stability after OST failure"
 430     client_df
 431
 432     #CLIENT Portion
 433     echo "Failing CLIENTs"
 434     fail_clients
 435
 436     #Check FS
 437     echo "Test Lustre stability after CLIENTs failure"
 438     client_df
 439
 440     #Reintegration
 441     echo "Reintegrating OST/CLIENTs"
 442     wait_for ost1
 443     start ost1
 444     reintegrate_clients
 445     sleep 5
 446
 447     echo "Verifying mount"
 448     client_df || return 3
 449 }
 450 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
 451 ###################################################
 452
 453
 454 ############### Seventh Failure Mode ###############
 455 test_7() {
 456     echo "Seventh Failure Mode: CLIENT/MDS `date`"
 457
 458     #Create files
 459     echo "Verify Lustre filesystem is up and running"
 460     client_df
 461     client_touch testfile  || return 1
 462
 463     #CLIENT Portion
 464     echo "Part 1: Failing CLIENT"
 465     fail_clients 2
 466
 467     #Check FS
 468     echo "Test Lustre stability after CLIENTs failure"
 469     client_df
 470     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 471     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 472
 473     #Sleep
 474     echo "Wait 1 minutes"
 475     sleep 60
 476
 477     #Create files
 478     echo "Verify Lustre filesystem is up and running"
 479     client_df
 480     client_rm testfile
 481
 482     #MDS Portion
 483     echo "Failing MDS"
 484     facet_failover mds
 485
 486     #Check FS
 487     echo "Test Lustre stability after MDS failover"
 488     wait $DFPID || echo "df on down clients fails " || return 1
 489     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 490     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 491
 492     #Reintegration
 493     echo "Reintegrating CLIENTs"
 494     reintegrate_clients
 495     client_df || return 2
 496
 497     #Sleep
 498     echo "wait 1 minutes"
 499     sleep 60
 500 }
 501 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
 502 ###################################################
 503
 504
 505 ############### Eighth Failure Mode ###############
 506 test_8() {
 507     echo "Eighth Failure Mode: CLIENT/OST `date`"
 508
 509     #Create files
 510     echo "Verify Lustre filesystem is up and running"
 511     client_df
 512     client_touch testfile
 513
 514     #CLIENT Portion
 515     echo "Failing CLIENTs"
 516     fail_clients 2
 517
 518     #Check FS
 519     echo "Test Lustre stability after CLIENTs failure"
 520     client_df
 521     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 522     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 523
 524     #Sleep
 525     echo "Wait 1 minutes"
 526     sleep 60
 527
 528     #Create files
 529     echo "Verify Lustre filesystem is up and running"
 530     client_df
 531     client_touch testfile
 532
 533
 534     #OST Portion
 535     echo "Failing OST"
 536     shutdown_facet ost1
 537     reboot_facet ost1
 538
 539     #Check FS
 540     echo "Test Lustre stability after OST failure"
 541     client_df
 542     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 543     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 544
 545     #Reintegration
 546     echo "Reintegrating CLIENTs/OST"
 547     reintegrate_clients
 548     wait_for ost1
 549     start ost1
 550     client_df || return 1
 551     client_touch testfile2 || return 2
 552
 553     #Sleep
 554     echo "Wait 1 minutes"
 555     sleep 60
 556 }
 557 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
 558 ###################################################
 559
 560
 561 ############### Ninth Failure Mode ###############
 562 test_9() {
 563     echo
 564
 565     #Create files
 566     echo "Verify Lustre filesystem is up and running"
 567     client_df
 568     client_touch testfile || return 1
 569
 570     #CLIENT Portion
 571     echo "Failing CLIENTs"
 572     fail_clients 2
 573
 574     #Check FS
 575     echo "Test Lustre stability after CLIENTs failure"
 576     client_df
 577     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
 578     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
 579
 580     #Sleep
 581     echo "Wait 1 minutes"
 582     sleep 60
 583
 584     #Create files
 585     echo "Verify Lustre filesystem is up and running"
 586     $PDSH $LIVE_CLIENT df $MOUNT || return 3
 587     client_touch testfile || return 4
 588
 589     #CLIENT Portion
 590     echo "Failing CLIENTs"
 591     fail_clients 2
 592
 593     #Check FS
 594     echo "Test Lustre stability after CLIENTs failure"
 595     client_df
 596     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
 597     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
 598
 599     #Reintegration
 600     echo "Reintegrating  CLIENTs/CLIENTs"
 601     reintegrate_clients
 602     client_df || return 7
 603
 604     #Sleep
 605     echo "Wait 1 minutes"
 606     sleep 60
 607 }
 608 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
 609 ###################################################
 610
 611 test_10() {
 612     #Run availability after all failures
 613     DURATION=${DURATION:-$((2 * 60 * 60))} # 6 hours default
 614     LOADTEST=${LOADTEST:-metadata-load.py}
 615     $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
 616 }
 617 run_test 10 "Running Availability for 6 hours..."
 618
 619 equals_msg "Done, cleaning up"
 620 $CLEANUP