2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script (overridable via $LUSTRE).
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Pull in the shared test helpers (run_test, assert_env, start/stop, etc.).
7 . $LUSTRE/tests/test-framework.sh
# Source the cluster config; CONFIG defaults to the local insanity config.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Setup/cleanup hooks are function names, overridable from the environment.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
17 UPCALL=${UPCALL:-DEFAULT}
# assert_env (from test-framework.sh) presumably aborts unless these host
# variables are set by the sourced config — TODO confirm against framework.
21 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
24 # Initialize all the ostN_HOST
# NOTE(review): fragment — the matching `done`/`fi` for this if/for are on
# lines omitted from this chunk.
26 if [ "$EXTRA_OSTS" ]; then
27 for host in $EXTRA_OSTS; do
28 NUMOST=$((NUMOST + 1))
# Dynamically defines ost${N}_HOST for each extra OST host; $OST is
# presumably set to "ost$NUMOST" on an omitted line — TODO confirm.
30 eval ${OST}_HOST=$host
34 # This can be a regexp, to allow more clients
# Comma-separated client list for pdsh-style fan-out, unless caller set it.
35 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
40 # fail clients round robin
42 # list of failable clients
# Bash array built by word-splitting $FAIL_CLIENTS.
43 FAIL_LIST=($FAIL_CLIENTS)
# Number of failable clients.
44 FAIL_NUM=${#FAIL_LIST[*]}
47 DOWN_NUM=0 # number of nodes currently down
49 # set next client to fail
# Pick the current victim, then advance the cursor modulo the list size
# so successive calls rotate through FAIL_LIST.
51 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
52 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
53 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# NOTE(review): fragment — interior lines and closing `fi`/`done` of these
# constructs are omitted from this chunk.
# HARD mode: node is expected to actually go down; poll with ping until it
# stops answering.
58 if [ "$FAILURE_MODE" = HARD ]; then
60 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
61 echo "waiting for node $client to fail"
# SOFT mode: simulate failure with a forced unmount instead of a real crash.
64 elif [ "$FAILURE_MODE" = SOFT ]; then
65 zconf_umount $client $MOUNT -f
71 if [ "$FAILURE_MODE" = HARD ]; then
# Clamp the requested fail count to the clients still up.
78 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
79 num=$((FAIL_NUM - DOWN_NUM))
82 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# Take down $num clients, remembering each in DOWN_CLIENTS for later
# reintegration.
88 for i in `seq $num`; do
91 DOWN_CLIENTS="$DOWN_CLIENTS $client"
92 shutdown_client $client
95 echo "down clients: $DOWN_CLIENTS"
97 for client in $DOWN_CLIENTS; do
# Recompute the down count from the accumulated list.
100 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring every previously-failed client back: wait for the host to respond,
# then remount Lustre on it.  Returns non-zero if any remount fails.
# NOTE(review): fragment — the loop's `done` and the closing `}` are on
# omitted lines (DOWN_CLIENTS is presumably reset there — TODO confirm).
104 reintegrate_clients() {
105 for client in $DOWN_CLIENTS; do
106 wait_for_host $client
107 echo "Restarting $client"
108 zconf_mount $client $MOUNT || return 1
# Build the Lustre XML/config: one MDS (plus optional failover MDS), an LOV
# striped across all OSTs, and NUMOST OSTs.
116 add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
# Only configure MDS failover when the config supplies a failover host.
118 if [ ! -z "$mdsfailover_HOST" ]; then
119 add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
# (No comments may be inserted below between continuation lines.)
122 add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
123 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
124 for i in `seq $NUMOST`; do
# OSTDEV is used as a printf format; %d-style patterns yield per-OST devices.
125 dev=`printf $OSTDEV $i`
126 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
127 --journal-size $OSTJOURNALSIZE
# Client mounts the lov1-backed filesystem at $MOUNT.
131 add_client client mds --lov lov1 --path $MOUNT
# Start all OSTs first, then the MDS; REFORMAT may request a mkfs.
138 for i in `seq $NUMOST`; do
140 start ost$i ${REFORMAT} $OSTLCONFARGS
# Optionally capture kernel debug output to a daemon file.
142 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
144 start mds $MDSLCONFARGS ${REFORMAT}
# Block until every client can see the Lustre tree over do_node.
145 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
# Mount on the clients unless something is already mounted at $MOUNT.
146 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Teardown: unmount clients, then stop MDS and every OST.  `|| :` makes the
# stops best-effort so cleanup continues past individual failures.
151 zconf_umount $CLIENTS $MOUNT
153 stop mds ${FORCE} $MDSLCONFARGS || :
154 for i in `seq $NUMOST`; do
155 stop ost$i ${FORCE} $OSTLCONFARGS || :
# Per-client sanity helpers (fragments — each loop's `done` is omitted):
# create a uniquely-named file from every live client.
163 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
# Skip clients that are currently down.
164 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
165 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove the per-client files.
171 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
172 $PDSH $c rm $MOUNT/${c}_$file
# Each client makes (and lists) its own directory.
177 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
178 echo "$c mkdir $MOUNT/$c"
179 $PDSH $c "mkdir $MOUNT/$c"
180 $PDSH $c "ls -l $MOUNT/$c"
# Directories are removed from the live client only.
185 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
186 echo "rmdir $MOUNT/$c"
187 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Map a node name back to its ost$i facet so clients can trigger OSC recovery.
# NOTE(review): fragment — interior recovery call, `done`, and closing `}`
# are on omitted lines.
191 clients_recover_osts() {
193 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
199 for i in `seq $NUMOST`; do
200 ostvar="ost${i}_HOST"
# Indirect expansion: compare each ost${i}_HOST against the requested node.
# NOTE(review): $node is unquoted here — would break on empty/spacey values.
201 if [ "${!ostvar}" == $node ]; then
206 echo "No ost found for node; $node"
# Driver entry points: ONLY=cleanup / ONLY=setup short-circuit the full run;
# EVAL lets the caller inject a command.  (Bodies on omitted lines.)
213 if [ "$ONLY" == "cleanup" ]; then
218 if [ ! -z "$EVAL" ]; then
225 if [ "$ONLY" == "setup" ]; then
229 # 9 Different Failure Modes Combinations
230 echo "Starting Test 17 at `date`"
# test_0 (fragment): fail MDS, then OST, then a client, independently; after
# each, a backgrounded df ($DFPID) must complete once the facet recovers.
# Inside the braces, $? expands to the failed wait's exit status.
235 echo "Waiting for df pid: $DFPID"
236 wait $DFPID || { echo "df returned $?" && return 1; }
240 echo "Waiting for df pid: $DFPID"
241 wait $DFPID || { echo "df returned $?" && return 2; }
245 echo "Waiting for df pid: $DFPID"
246 wait $DFPID || { echo "df returned $?" && return 3; }
249 run_test 0 "Fail all nodes, independently"
251 ############### First Failure Mode ###############
# test_1 is intentionally a no-op: MDS-then-MDS double failure is not a
# meaningful scenario in this single-MDS setup.
253 echo "Don't do a MDS - MDS Failure Case"
254 echo "This makes no sense"
256 run_test 1 "MDS/MDS failure"
257 ###################################################
259 ############### Second Failure Mode ###############
# test_2 (fragment): fail the MDS, then an OST while the MDS recovers, then
# reintegrate and verify clients can still stat the filesystem.
261 echo "Verify Lustre filesystem is up and running"
268 # prepare for MDS failover
279 echo "Reintegrating OST"
290 clients_recover_osts ost1
291 echo "Verify reintegration"
292 client_df || return 1
295 run_test 2 "Second Failure Mode: MDS/OST `date`"
296 ###################################################
299 ############### Third Failure Mode ###############
# test_3 (fragment): fail the MDS, then two clients during recovery, then
# reintegrate the clients and verify df succeeds everywhere.
302 echo "Verify Lustre filesystem is up and running"
# Non-fatal: df failure is only reported here, not returned.
306 wait $DFPID || echo df failed: $?
309 echo "Test Lustre stability after MDS failover"
313 echo "Failing 2 CLIENTS"
317 echo "Test Lustre stability after CLIENT failure"
321 echo "Reintegrating CLIENTS"
322 reintegrate_clients || return 1
324 client_df || return 3
# Register test_3; banner typo fixed ("Thirdb" -> "Third") to match the
# "Third Failure Mode" section header above.
326 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
327 ###################################################
329 ############### Fourth Failure Mode ###############
# test_4 (fragment): inverse of test_2 — fail an OST first, then the MDS,
# then recover both and verify client df.
331 echo "Fourth Failure Mode: OST/MDS `date`"
334 echo "Failing OST ost1"
338 echo "Test Lustre stability after OST failure"
348 # prepare for MDS failover
357 echo "Reintegrating OST"
369 clients_recover_osts ost1
370 echo "Test Lustre stability after MDS failover"
371 client_df || return 1
373 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
374 ###################################################
376 ############### Fifth Failure Mode ###############
# test_5 (fragment): fail two OSTs in sequence, reintegrate both, verify df.
378 echo "Fifth Failure Mode: OST/OST `date`"
381 echo "Verify Lustre filesystem is up and running"
390 echo "Test Lustre stability after OST failure"
401 echo "Test Lustre stability after OST failure"
407 echo "Reintegrating OSTs"
413 clients_recover_osts ost1
414 clients_recover_osts ost2
419 client_df || return 2
421 run_test 5 "Fifth Failure Mode: OST/OST `date`"
422 ###################################################
424 ############### Sixth Failure Mode ###############
# test_6 (fragment): fail an OST, then clients, reintegrate both, and verify
# the mount still answers df.
426 echo "Sixth Failure Mode: OST/CLIENT `date`"
429 echo "Verify Lustre filesystem is up and running"
430 client_df || return 1
431 client_touch testfile || return 2
439 echo "Test Lustre stability after OST failure"
445 echo "Failing CLIENTs"
449 echo "Test Lustre stability after CLIENTs failure"
455 echo "Reintegrating OST/CLIENTs"
463 echo "Verifying mount"
464 client_df || return 3
466 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
467 ###################################################
470 ############### Seventh Failure Mode ###############
# test_7 (fragment): part 1 fails a client, part 2 fails the MDS while that
# client is down, then reintegrates.
472 echo "Seventh Failure Mode: CLIENT/MDS `date`"
475 echo "Verify Lustre filesystem is up and running"
477 client_touch testfile || return 1
480 echo "Part 1: Failing CLIENT"
484 echo "Test Lustre stability after CLIENTs failure"
486 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
487 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
490 echo "Wait 1 minutes"
494 echo "Verify Lustre filesystem is up and running"
503 echo "Test Lustre stability after MDS failover"
# NOTE(review): the final `|| return 1` can never fire — if wait fails, the
# echo succeeds and short-circuits the chain. Likely intended as best-effort.
504 wait $DFPID || echo "df on down clients fails " || return 1
505 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
506 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
509 echo "Reintegrating CLIENTs"
511 client_df || return 2
514 echo "wait 1 minutes"
517 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
518 ###################################################
521 ############### Eighth Failure Mode ###############
# test_8 (fragment): fail clients first, then an OST while they are down,
# reintegrate both, and verify create + df still work.
523 echo "Eighth Failure Mode: CLIENT/OST `date`"
526 echo "Verify Lustre filesystem is up and running"
528 client_touch testfile
531 echo "Failing CLIENTs"
535 echo "Test Lustre stability after CLIENTs failure"
537 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
538 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
541 echo "Wait 1 minutes"
545 echo "Verify Lustre filesystem is up and running"
547 client_touch testfile
556 echo "Test Lustre stability after OST failure"
# Deliberately disabled: on non-failout configs these would hang forever.
560 #non-failout hangs forever here
561 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
562 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
565 echo "Reintegrating CLIENTs/OST"
570 client_df || return 1
571 client_touch testfile2 || return 2
574 echo "Wait 1 minutes"
577 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
578 ###################################################
581 ############### Ninth Failure Mode ###############
# test_9 (fragment): two successive rounds of client failures, verifying the
# live client can still list/clean $MOUNT after each, then reintegrate.
586 echo "Verify Lustre filesystem is up and running"
588 client_touch testfile || return 1
# Round 1.
591 echo "Failing CLIENTs"
595 echo "Test Lustre stability after CLIENTs failure"
597 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
598 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
601 echo "Wait 1 minutes"
605 echo "Verify Lustre filesystem is up and running"
606 $PDSH $LIVE_CLIENT df $MOUNT || return 3
607 client_touch testfile || return 4
# Round 2.
610 echo "Failing CLIENTs"
614 echo "Test Lustre stability after CLIENTs failure"
616 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
617 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
620 echo "Reintegrating CLIENTs/CLIENTs"
622 client_df || return 7
625 echo "Wait 1 minutes"
628 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
629 ###################################################
632 #Run availability after all failures
# Default load-test duration. NOTE(review): 2 * 60 * 60 is 2 hours, but this
# line's comment previously said "6 hours" and the run_test message below
# still says "6 hours" — confirm which duration is intended.
633 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default
634 LOADTEST=${LOADTEST:-metadata-load.py}
# Drive availability.sh against all clients for $DURATION seconds.
635 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
637 run_test 10 "Running Availability for 6 hours..."
639 equals_msg "Done, cleaning up"
640 # we need to force cleanup for the stale MDS conns until bug 5921 is fixed
# Invoke the cleanup hook with FORCE=--force in its environment.
641 FORCE=--force $CLEANUP