lustre/tests/insanity.sh

   1 #!/bin/sh
   2 # Test multiple failures, AKA Test 17
   3
   4 set -e
   5
   6 LUSTRE=${LUSTRE:-`dirname $0`/..}
   7 . $LUSTRE/tests/test-framework.sh
   8
   9 init_test_env $@
  10
  11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-lmv.sh}
  12
  13 ALWAYS_EXCEPT="10"
  14
  15 SETUP=${SETUP:-"setup"}
  16 CLEANUP=${CLEANUP:-"cleanup"}
  17
  18 build_test_filter
  19
  20 assert_env MDSCOUNT mds1_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
  21
  22 ####
  23 # Initialize all the ostN_HOST
  24 NUMOST=2
  25 if [ "$EXTRA_OSTS" ]; then
  26     for host in $EXTRA_OSTS; do
  27         NUMOST=$((NUMOST + 1))
  28         OST=ost$NUMOST
  29         eval ${OST}_HOST=$host
  30     done
  31 fi
  32
  33 # This can be a regexp, to allow more clients
  34 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
  35
  36 DIR=${DIR:-$MOUNT}
  37
  38 #####
  39 # fail clients round robin
  40
  41 # list of failable clients
  42 FAIL_LIST=($FAIL_CLIENTS)
  43 FAIL_NUM=${#FAIL_LIST[*]}
  44 FAIL_NEXT=0
  45 typeset -i  FAIL_NEXT
  46 DOWN_NUM=0   # number of nodes currently down
  47
  48 # set next client to fail
  49 set_fail_client() {
  50     FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
  51     FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
  52     echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
  53 }
  54
  55 shutdown_client() {
  56     client=$1
  57     if [ "$FAILURE_MODE" = HARD ]; then
  58        $POWER_DOWN $client
  59        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
  60            echo "waiting for node $client to fail"
  61            sleep 1
  62        done
  63     elif [ "$FAILURE_MODE" = SOFT ]; then
  64        zconf_umount $client $MOUNT -f
  65     fi
  66 }
  67
  68 reboot_node() {
  69     NODE=$1
  70     if [ "$FAILURE_MODE" = HARD ]; then
  71        $POWER_UP $NODE
  72     fi
  73 }
  74
  75 fail_clients() {
  76     num=$1
  77     if [ -z "$num"  ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
  78         num=$((FAIL_NUM - DOWN_NUM))
  79     fi
  80
  81     if [ -z "$num" ] || [ "$num" -le 0 ]; then
  82         return
  83     fi
  84
  85     client_mkdirs
  86
  87     for i in `seq $num`; do
  88        set_fail_client
  89        client=$FAIL_CLIENT
  90        DOWN_CLIENTS="$DOWN_CLIENTS $client"
  91        shutdown_client $client
  92     done
  93
  94     echo "down clients: $DOWN_CLIENTS"
  95
  96     for client in $DOWN_CLIENTS; do
  97         reboot_node $client
  98     done
  99     DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
 100     client_rmdirs
 101 }
 102
 103 reintegrate_clients() {
 104     for client in $DOWN_CLIENTS; do
 105         wait_for_host $client
 106         echo "Restarting $client"
 107         zconf_mount $client $MOUNT || return 1
 108     done
 109     DOWN_CLIENTS=""
 110     DOWN_NUM=0
 111 }
 112
 113 gen_config() {
 114     rm -f $XMLCONFIG
 115     if [ "$MDSCOUNT" -gt 1 ]; then
 116         add_lmv lmv1_svc
 117         for mds in `mds_list`; do
 118             MDSDEV=$TMP/${mds}-`hostname`
 119             add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1_svc
 120         done
 121         add_lov_to_lmv lov1 lmv1_svc --stripe_sz $STRIPE_BYTES \
 122             --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
 123         MDS=lmv1
 124     else
 125         add_mds mds1 --dev $MDSDEV --size $MDSSIZE
 126         if [ ! -z "$mds1failover_HOST" ]; then
 127              add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE
 128         fi
 129         add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
 130             --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
 131         MDS=mds1
 132     fi
 133
 134     for i in `seq $NUMOST`; do
 135         dev=`printf $OSTDEV $i`
 136         add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
 137             --journal-size $OSTJOURNALSIZE
 138     done
 139
 140     add_client client $MDS --lov lov1 --path $MOUNT
 141 }
 142
 143 setup() {
 144     gen_config
 145
 146     rm -rf logs/*
 147     for i in `seq $NUMOST`; do
 148         wait_for ost$i
 149         start ost$i ${REFORMAT} $OSTLCONFARGS
 150     done
 151     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
 152     for mds in `mds_list`; do
 153         wait_for $mds
 154         start $mds $MDSLCONFARGS ${REFORMAT}
 155     done
 156     while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
 157     grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
 158
 159 }
 160
 161 cleanup() {
 162     zconf_umount $CLIENTS $MOUNT
 163
 164     for mds in `mds_list`; do
 165         stop $mds ${FORCE} $MDSLCONFARGS || :
 166     done
 167     for i in `seq $NUMOST`; do
 168         stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS  || :
 169     done
 170 }
 171
 172 trap exit INT
 173
 174 client_touch() {
 175     file=$1
 176     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 177         if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
 178         $PDSH $c touch $MOUNT/${c}_$file || return 1
 179     done
 180 }
 181
 182 client_rm() {
 183     file=$1
 184     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 185         $PDSH $c rm $MOUNT/${c}_$file
 186     done
 187 }
 188
 189 client_mkdirs() {
 190     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 191         echo "$c mkdir $MOUNT/$c"
 192         $PDSH $c "mkdir $MOUNT/$c"
 193         $PDSH $c "ls -l $MOUNT/$c"
 194     done
 195 }
 196
 197 client_rmdirs() {
 198     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 199         echo "rmdir $MOUNT/$c"
 200         $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
 201     done
 202 }
 203
 204 clients_recover_osts() {
 205     facet=$1
 206 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
 207 }
 208
 209 node_to_ost() {
 210     node=$1
 211     retvar=$2
 212     for i in `seq $NUMOST`; do
 213         ostvar="ost${i}_HOST"
 214         if [ "${!ostvar}" == $node ]; then
 215             eval $retvar=ost${i}
 216             return 0
 217         fi
 218     done
 219     echo "No ost found for node; $node"
 220     return 1
 221
 222 }
 223
 224
 225
 226 if [ "$ONLY" == "cleanup" ]; then
 227     $CLEANUP
 228     exit
 229 fi
 230
 231 if [ ! -z "$EVAL" ]; then
 232     eval "$EVAL"
 233     exit $?
 234 fi
 235
 236 $SETUP
 237
 238 if [ "$ONLY" == "setup" ]; then
 239     exit 0
 240 fi
 241
 242 # 9 Different Failure Modes Combinations
 243 echo "Starting Test 17 at `date`"
 244
 245 test_0() {
 246     echo "Failover MDS"
 247     facet_failover mds1
 248     echo "Waiting for df pid: $DFPID"
 249     wait $DFPID || { echo "df returned $?" && return 1; }
 250
 251     echo "Failing OST1"
 252     facet_failover ost1
 253     echo "Waiting for df pid: $DFPID"
 254     wait $DFPID || { echo "df returned $?" && return 2; }
 255
 256     echo "Failing OST2"
 257     facet_failover ost2
 258     echo "Waiting for df pid: $DFPID"
 259     wait $DFPID || { echo "df returned $?" && return 3; }
 260     return 0
 261 }
 262 run_test 0 "Fail all nodes, independently"
 263
 264 ############### First Failure Mode ###############
 265 test_1() {
 266 echo "Don't do a MDS - MDS Failure Case"
 267 echo "This makes no sense"
 268 }
 269 run_test 1 "MDS/MDS failure"
 270 ###################################################
 271
 272 ############### Second Failure Mode ###############
 273 test_2() {
 274     echo "Verify Lustre filesystem is up and running"
 275     client_df
 276
 277     echo "Failing MDS"
 278     shutdown_facet mds1
 279     reboot_facet mds1
 280
 281     # prepare for MDS failover
 282     change_active mds1
 283     reboot_facet mds1
 284
 285     client_df &
 286     DFPID=$!
 287     sleep 5
 288
 289     echo "Failing OST"
 290     shutdown_facet ost1
 291
 292     echo "Reintegrating OST"
 293     reboot_facet ost1
 294     wait_for ost1
 295     start ost1
 296
 297     echo "Failover MDS"
 298     wait_for mds1
 299     start mds1
 300
 301     #Check FS
 302     wait $DFPID
 303     clients_recover_osts ost1
 304     echo "Verify reintegration"
 305     client_df || return 1
 306
 307 }
 308 run_test 2 "Second Failure Mode: MDS/OST `date`"
 309 ###################################################
 310
 311
 312 ############### Third Failure Mode ###############
 313 test_3() {
 314     #Create files
 315     echo "Verify Lustre filesystem is up and running"
 316
 317     #MDS Portion
 318     facet_failover mds1
 319     wait $DFPID || echo df failed: $?
 320     #Check FS
 321
 322     echo "Test Lustre stability after MDS failover"
 323     client_df
 324
 325     #CLIENT Portion
 326     echo "Failing 2 CLIENTS"
 327     fail_clients 2
 328
 329     #Check FS
 330     echo "Test Lustre stability after CLIENT failure"
 331     client_df
 332
 333     #Reintegration
 334     echo "Reintegrating CLIENTS"
 335     reintegrate_clients || return 1
 336
 337     client_df || return 3
 338 }
 339 run_test 3  "Thirdb Failure Mode: MDS/CLIENT `date`"
 340 ###################################################
 341
 342 ############### Fourth Failure Mode ###############
 343 test_4() {
 344     echo "Fourth Failure Mode: OST/MDS `date`"
 345
 346     #OST Portion
 347     echo "Failing OST ost1"
 348     shutdown_facet ost1
 349
 350     #Check FS
 351     echo "Test Lustre stability after OST failure"
 352     client_df
 353
 354     #MDS Portion
 355     echo "Failing MDS"
 356     shutdown_facet mds1
 357     reboot_facet mds1
 358
 359     # prepare for MDS failover
 360     change_active mds1
 361     reboot_facet mds1
 362
 363     client_df &
 364     DFPID=$!
 365     sleep 5
 366
 367     #Reintegration
 368     echo "Reintegrating OST"
 369     reboot_facet ost1
 370     wait_for ost1
 371     start ost1
 372
 373     echo "Failover MDS"
 374     wait_for mds1
 375     start mds1
 376     #Check FS
 377
 378     wait $DFPID
 379     clients_recover_osts ost1
 380     echo "Test Lustre stability after MDS failover"
 381     client_df || return 1
 382 }
 383 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
 384 ###################################################
 385
 386 ############### Fifth Failure Mode ###############
 387 test_5() {
 388     echo "Fifth Failure Mode: OST/OST `date`"
 389
 390     #Create files
 391     echo "Verify Lustre filesystem is up and running"
 392     client_df
 393
 394     #OST Portion
 395     echo "Failing OST"
 396     shutdown_facet ost1
 397     reboot_facet ost1
 398
 399     #Check FS
 400     echo "Test Lustre stability after OST failure"
 401     client_df
 402
 403     #OST Portion
 404     echo "Failing OST"
 405     shutdown_facet ost2
 406     reboot_facet ost2
 407
 408     #Check FS
 409     echo "Test Lustre stability after OST failure"
 410     client_df
 411
 412     #Reintegration
 413     echo "Reintegrating OSTs"
 414     wait_for ost1
 415     start ost1
 416     wait_for ost2
 417     start ost2
 418
 419     clients_recover_osts ost1
 420     clients_recover_osts ost2
 421     sleep $TIMEOUT
 422
 423     client_df || return 2
 424 }
 425 run_test 5 "Fifth Failure Mode: OST/OST `date`"
 426 ###################################################
 427
 428 ############### Sixth Failure Mode ###############
 429 test_6() {
 430     echo "Sixth Failure Mode: OST/CLIENT `date`"
 431
 432     #Create files
 433     echo "Verify Lustre filesystem is up and running"
 434     client_df || return 1
 435     client_touch testfile || return 2
 436
 437     #OST Portion
 438     echo "Failing OST"
 439     shutdown_facet ost1
 440     reboot_facet ost1
 441
 442     #Check FS
 443     echo "Test Lustre stability after OST failure"
 444     client_df
 445
 446     #CLIENT Portion
 447     echo "Failing CLIENTs"
 448     fail_clients
 449
 450     #Check FS
 451     echo "Test Lustre stability after CLIENTs failure"
 452     client_df
 453
 454     #Reintegration
 455     echo "Reintegrating OST/CLIENTs"
 456     wait_for ost1
 457     start ost1
 458     reintegrate_clients
 459     sleep 5
 460
 461     echo "Verifying mount"
 462     client_df || return 3
 463 }
 464 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
 465 ###################################################
 466
 467
 468 ############### Seventh Failure Mode ###############
 469 test_7() {
 470     echo "Seventh Failure Mode: CLIENT/MDS `date`"
 471
 472     #Create files
 473     echo "Verify Lustre filesystem is up and running"
 474     client_df
 475     client_touch testfile  || return 1
 476
 477     #CLIENT Portion
 478     echo "Part 1: Failing CLIENT"
 479     fail_clients 2
 480
 481     #Check FS
 482     echo "Test Lustre stability after CLIENTs failure"
 483     client_df
 484     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 485     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 486
 487     #Sleep
 488     echo "Wait 1 minutes"
 489     sleep 60
 490
 491     #Create files
 492     echo "Verify Lustre filesystem is up and running"
 493     client_df
 494     client_rm testfile
 495
 496     #MDS Portion
 497     echo "Failing MDS"
 498     facet_failover mds1
 499
 500     #Check FS
 501     echo "Test Lustre stability after MDS failover"
 502     wait $DFPID || echo "df on down clients fails " || return 1
 503     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 504     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 505
 506     #Reintegration
 507     echo "Reintegrating CLIENTs"
 508     reintegrate_clients
 509     client_df || return 2
 510
 511     #Sleep
 512     echo "wait 1 minutes"
 513     sleep 60
 514 }
 515 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
 516 ###################################################
 517
 518
 519 ############### Eighth Failure Mode ###############
 520 test_8() {
 521     echo "Eighth Failure Mode: CLIENT/OST `date`"
 522
 523     #Create files
 524     echo "Verify Lustre filesystem is up and running"
 525     client_df
 526     client_touch testfile
 527
 528     #CLIENT Portion
 529     echo "Failing CLIENTs"
 530     fail_clients 2
 531
 532     #Check FS
 533     echo "Test Lustre stability after CLIENTs failure"
 534     client_df
 535     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 536     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 537
 538     #Sleep
 539     echo "Wait 1 minutes"
 540     sleep 60
 541
 542     #Create files
 543     echo "Verify Lustre filesystem is up and running"
 544     client_df
 545     client_touch testfile
 546
 547
 548     #OST Portion
 549     echo "Failing OST"
 550     shutdown_facet ost1
 551     reboot_facet ost1
 552
 553     #Check FS
 554     echo "Test Lustre stability after OST failure"
 555     client_df
 556     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 557     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 558
 559     #Reintegration
 560     echo "Reintegrating CLIENTs/OST"
 561     reintegrate_clients
 562     wait_for ost1
 563     start ost1
 564     client_df || return 1
 565     client_touch testfile2 || return 2
 566
 567     #Sleep
 568     echo "Wait 1 minutes"
 569     sleep 60
 570 }
 571 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
 572 ###################################################
 573
 574
 575 ############### Ninth Failure Mode ###############
 576 test_9() {
 577     echo
 578
 579     #Create files
 580     echo "Verify Lustre filesystem is up and running"
 581     client_df
 582     client_touch testfile || return 1
 583
 584     #CLIENT Portion
 585     echo "Failing CLIENTs"
 586     fail_clients 2
 587
 588     #Check FS
 589     echo "Test Lustre stability after CLIENTs failure"
 590     client_df
 591     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
 592     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
 593
 594     #Sleep
 595     echo "Wait 1 minutes"
 596     sleep 60
 597
 598     #Create files
 599     echo "Verify Lustre filesystem is up and running"
 600     $PDSH $LIVE_CLIENT df $MOUNT || return 3
 601     client_touch testfile || return 4
 602
 603     #CLIENT Portion
 604     echo "Failing CLIENTs"
 605     fail_clients 2
 606
 607     #Check FS
 608     echo "Test Lustre stability after CLIENTs failure"
 609     client_df
 610     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
 611     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
 612
 613     #Reintegration
 614     echo "Reintegrating  CLIENTs/CLIENTs"
 615     reintegrate_clients
 616     client_df || return 7
 617
 618     #Sleep
 619     echo "Wait 1 minutes"
 620     sleep 60
 621 }
 622 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
 623 ###################################################
 624
 625 test_10() {
 626     #Run availability after all failures
 627     DURATION=${DURATION:-$((2 * 60 * 60))} # 6 hours default
 628     LOADTEST=${LOADTEST:-metadata-load.py}
 629     $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
 630 }
 631 run_test 10 "Running Availability for 6 hours..."
 632
 633 equals_msg "Done, cleaning up"
 634 $CLEANUP