# Multiple-failure ("insanity") test script for Lustre, historically "Test 17".
# NOTE(review): this listing is a sampled excerpt with embedded original line
# numbers; intervening lines are elided, so comments describe only what is
# visible here.
2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script, then pull in the shared
# test harness and the insanity-lmv cluster configuration.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-lmv.sh}
# Allow callers to override the setup/cleanup entry points by name.
15 SETUP=${SETUP:-"setup"}
16 CLEANUP=${CLEANUP:-"cleanup"}
# Abort early unless the required cluster-topology variables are defined.
20 assert_env MDSCOUNT mds1_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
23 # Initialize all the ostN_HOST
# Register any extra OST hosts beyond the base config, numbering them
# sequentially into ostN_HOST variables (loop/if closers elided here).
25 if [ "$EXTRA_OSTS" ]; then
26 for host in $EXTRA_OSTS; do
27 NUMOST=$((NUMOST + 1))
# eval builds the dynamic variable name; OST is presumably set to
# "ost$NUMOST" on an elided line -- TODO confirm.
29 eval ${OST}_HOST=$host
33 # This can be a regexp, to allow more clients
# Full client list: the always-up LIVE_CLIENT plus failable/extra clients.
34 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
39 # fail clients round robin
41 # list of failable clients
# FAIL_LIST/FAIL_NUM drive a round-robin choice of which client to fail
# next; DOWN_NUM tracks how many clients are currently down.
42 FAIL_LIST=($FAIL_CLIENTS)
43 FAIL_NUM=${#FAIL_LIST[*]}
46 DOWN_NUM=0 # number of nodes currently down
48 # set next client to fail
# Pick the next victim and advance the cursor modulo the list length.
# NOTE(review): FAIL_NEXT's initialization is on an elided line -- confirm
# it starts at 0.
50 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
51 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
52 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Client failure helpers (function headers elided in this excerpt).
# HARD mode: power the node off and wait until it stops answering pings;
# SOFT mode: force-unmount the Lustre client instead.
57 if [ "$FAILURE_MODE" = HARD ]; then
59 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
60 echo "waiting for node $client to fail"
63 elif [ "$FAILURE_MODE" = SOFT ]; then
64 zconf_umount $client $MOUNT -f
# Fail up to $num clients, capped by how many failable clients are still up.
70 if [ "$FAILURE_MODE" = HARD ]; then
77 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
78 num=$((FAIL_NUM - DOWN_NUM))
81 if [ -z "$num" ] || [ "$num" -le 0 ]; then
# Shut down each chosen client and record it in DOWN_CLIENTS.
87 for i in `seq $num`; do
90 DOWN_CLIENTS="$DOWN_CLIENTS $client"
91 shutdown_client $client
94 echo "down clients: $DOWN_CLIENTS"
96 for client in $DOWN_CLIENTS; do
# Recompute the down count from the (space-separated) DOWN_CLIENTS list.
99 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring previously failed clients back: wait for each host to respond,
# then remount Lustre on it.  (Closing brace elided from this excerpt.)
103 reintegrate_clients() {
104 for client in $DOWN_CLIENTS; do
105 wait_for_host $client
106 echo "Restarting $client"
107 zconf_mount $client $MOUNT || return 1
# Configuration generation (enclosing function header elided): with
# MDSCOUNT > 1 create one MDS per entry under an LMV; otherwise a single
# MDS with an optional failover twin.  Then add the OSTs to lov1 and a
# client mount.
115 if [ "$MDSCOUNT" -gt 1 ]; then
117 for mds in `mds_list`; do
118 MDSDEV=$TMP/${mds}-`hostname`
119 add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1_svc
121 add_lov_to_lmv lov1 lmv1_svc --stripe_sz $STRIPE_BYTES \
122 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
125 add_mds mds1 --dev $MDSDEV --size $MDSSIZE
126 if [ ! -z "$mds1failover_HOST" ]; then
127 add_mdsfailover mds1 --dev $MDSDEV --size $MDSSIZE
129 add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
130 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
# One OST per index; OSTDEV is a printf pattern taking the index.
134 for i in `seq $NUMOST`; do
135 dev=`printf $OSTDEV $i`
136 add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
137 --journal-size $OSTJOURNALSIZE
140 add_client client $MDS --lov lov1 --path $MOUNT
# Bring the filesystem up: Kerberos KDC and GSS daemons, all OSTs, all
# MDSes, then mount every client (enclosing function header elided).
146 start_krb5_kdc || exit 1
148 for i in `seq $NUMOST`; do
150 start ost$i ${REFORMAT} $OSTLCONFARGS
152 start_lsvcgssd || exit 2
153 start_lgssd || exit 3
# Optionally stream debug logs to a daemon file.
154 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
155 for mds in `mds_list`; do
157 start $mds $MDSLCONFARGS ${REFORMAT}
# Poll until the clients can see the Lustre tree, then mount if not mounted.
159 while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
160 grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
# Tear down in reverse order: unmount clients, stop MDSes, stop OSTs.
# Stop failures are deliberately ignored (|| :) so cleanup always proceeds.
165 zconf_umount $CLIENTS $MOUNT
167 for mds in `mds_list`; do
168 stop $mds ${FORCE} $MDSLCONFARGS || :
172 for i in `seq $NUMOST`; do
173 stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
# Per-client filesystem helpers (function headers elided; names below are
# inferred from the operations -- TODO confirm against the full file).
# Touch helper: create $MOUNT/<client>_<file> from every client that is
# not currently recorded as down.
181 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
182 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
183 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove helper: delete the per-client files created above.
189 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
190 $PDSH $c rm $MOUNT/${c}_$file
# mkdir helper: each client creates and lists its own directory.
195 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
196 echo "$c mkdir $MOUNT/$c"
197 $PDSH $c "mkdir $MOUNT/$c"
198 $PDSH $c "ls -l $MOUNT/$c"
# rmdir helper: the live client removes every per-client directory.
203 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
204 echo "rmdir $MOUNT/$c"
205 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Ask clients to run OSC recovery for the given OST facet (body largely
# elided in this excerpt; the commented-out lctl invocation hints at the
# mechanism).
209 clients_recover_osts() {
211 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Reverse lookup: map a node name to its ostN index by scanning the
# dynamically named ostN_HOST variables via indirect expansion.
217 for i in `seq $NUMOST`; do
218 ostvar="ost${i}_HOST"
219 if [ "${!ostvar}" == $node ]; then
# NOTE(review): message punctuation looks off ("node; $node" was probably
# meant to be "node: $node") -- runtime string left unchanged here.
224 echo "No ost found for node; $node"
# Shortcut entry points: ONLY=cleanup / ONLY=setup run just that phase;
# EVAL lets the caller inject an arbitrary command (bodies elided).
231 if [ "$ONLY" == "cleanup" ]; then
236 if [ ! -z "$EVAL" ]; then
243 if [ "$ONLY" == "setup" ]; then
247 # 9 Different Failure Modes Combinations
248 echo "Starting Test 17 at `date`"
# test_0: fail and recover each server type independently; a background df
# (pid in DFPID, started on elided lines) must unblock after each recovery.
# NOTE(review): "$?" inside the braces reports wait's status at echo time,
# and the && means return is skipped if echo fails -- harmless, but worth
# confirming the intent upstream.
253 echo "Waiting for df pid: $DFPID"
254 wait $DFPID || { echo "df returned $?" && return 1; }
258 echo "Waiting for df pid: $DFPID"
259 wait $DFPID || { echo "df returned $?" && return 2; }
263 echo "Waiting for df pid: $DFPID"
264 wait $DFPID || { echo "df returned $?" && return 3; }
267 run_test 0 "Fail all nodes, independently"
269 ############### First Failure Mode ###############
# test_1: an MDS/MDS double failure is intentionally a no-op placeholder.
271 echo "Don't do a MDS - MDS Failure Case"
272 echo "This makes no sense"
274 run_test 1 "MDS/MDS failure"
275 ###################################################
277 ############### Second Failure Mode ###############
# test_2: fail the MDS, then an OST (failure steps elided), reintegrate
# the OST, and verify clients can still stat the filesystem.
279 echo "Verify Lustre filesystem is up and running"
286 # prepare for MDS failover
297 echo "Reintegrating OST"
308 clients_recover_osts ost1
309 echo "Verify reintegration"
310 client_df || return 1
313 run_test 2 "Second Failure Mode: MDS/OST `date`"
314 ###################################################
317 ############### Third Failure Mode ###############
# test_3: MDS failover followed by failing two clients, then reintegrate
# the clients and verify the mount (failure steps elided).
320 echo "Verify Lustre filesystem is up and running"
# df failure is reported but tolerated here -- only recovery is fatal.
324 wait $DFPID || echo df failed: $?
327 echo "Test Lustre stability after MDS failover"
331 echo "Failing 2 CLIENTS"
335 echo "Test Lustre stability after CLIENT failure"
339 echo "Reintegrating CLIENTS"
340 reintegrate_clients || return 1
342 client_df || return 3
# Fix typo in the test description: "Thirdb" -> "Third" (matches the
# "Third Failure Mode" banner and the ordinal naming of sibling tests).
344 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
345 ###################################################
347 ############### Fourth Failure Mode ###############
349 echo "Fourth Failure Mode: OST/MDS `date`"
352 echo "Failing OST ost1"
356 echo "Test Lustre stability after OST failure"
364 # prepare for MDS failover
373 echo "Reintegrating OST"
384 clients_recover_osts ost1
385 echo "Test Lustre stability after MDS failover"
386 client_df || return 1
388 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
389 ###################################################
391 ############### Fifth Failure Mode ###############
393 echo "Fifth Failure Mode: OST/OST `date`"
396 echo "Verify Lustre filesystem is up and running"
405 echo "Test Lustre stability after OST failure"
414 echo "Test Lustre stability after OST failure"
418 echo "Reintegrating OSTs"
424 clients_recover_osts ost1
425 clients_recover_osts ost2
428 client_df || return 2
430 run_test 5 "Fifth Failure Mode: OST/OST `date`"
431 ###################################################
433 ############### Sixth Failure Mode ###############
# test_6: fail an OST, then some clients (steps elided), bring everything
# back, and verify the mount.
435 echo "Sixth Failure Mode: OST/CLIENT `date`"
438 echo "Verify Lustre filesystem is up and running"
439 client_df || return 1
440 client_touch testfile || return 2
448 echo "Test Lustre stability after OST failure"
452 echo "Failing CLIENTs"
456 echo "Test Lustre stability after CLIENTs failure"
460 echo "Reintegrating OST/CLIENTs"
466 echo "Verifying mount"
467 client_df || return 3
469 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
470 ###################################################
473 ############### Seventh Failure Mode ###############
# test_7: fail a client, verify stability from the live client, then fail
# over the MDS while clients are down, verify again, and reintegrate.
475 echo "Seventh Failure Mode: CLIENT/MDS `date`"
478 echo "Verify Lustre filesystem is up and running"
480 client_touch testfile || return 1
483 echo "Part 1: Failing CLIENT"
487 echo "Test Lustre stability after CLIENTs failure"
489 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
490 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
493 echo "Wait 1 minutes"
497 echo "Verify Lustre filesystem is up and running"
506 echo "Test Lustre stability after MDS failover"
# NOTE(review): the trailing "|| return 1" below is unreachable -- echo
# succeeds, so the third branch never runs.  Probably intended as
# { echo ...; return 1; }.  Confirm intent before changing behavior.
507 wait $DFPID || echo "df on down clients fails " || return 1
508 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
509 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
512 echo "Reintegrating CLIENTs"
514 client_df || return 2
517 echo "wait 1 minutes"
520 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
521 ###################################################
524 ############### Eighth Failure Mode ###############
# test_8: fail clients first, then an OST (steps elided), recover both,
# and verify with a fresh touch.
526 echo "Eighth Failure Mode: CLIENT/OST `date`"
529 echo "Verify Lustre filesystem is up and running"
531 client_touch testfile
534 echo "Failing CLIENTs"
538 echo "Test Lustre stability after CLIENTs failure"
540 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
541 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
544 echo "Wait 1 minutes"
548 echo "Verify Lustre filesystem is up and running"
550 client_touch testfile
559 echo "Test Lustre stability after OST failure"
561 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
562 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
565 echo "Reintegrating CLIENTs/OST"
# A new file name (testfile2) proves create works post-recovery, not just stat.
569 client_df || return 1
570 client_touch testfile2 || return 2
573 echo "Wait 1 minutes"
576 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
577 ###################################################
580 ############### Ninth Failure Mode ###############
# test_9: fail one batch of clients, verify, fail a second batch, verify
# again, then reintegrate all of them (failure steps elided).
585 echo "Verify Lustre filesystem is up and running"
587 client_touch testfile || return 1
590 echo "Failing CLIENTs"
594 echo "Test Lustre stability after CLIENTs failure"
596 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
597 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
600 echo "Wait 1 minutes"
604 echo "Verify Lustre filesystem is up and running"
605 $PDSH $LIVE_CLIENT df $MOUNT || return 3
606 client_touch testfile || return 4
609 echo "Failing CLIENTs"
613 echo "Test Lustre stability after CLIENTs failure"
615 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
616 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
619 echo "Reintegrating CLIENTs/CLIENTs"
621 client_df || return 7
624 echo "Wait 1 minutes"
627 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
628 ###################################################
631 #Run availability after all failures
# test_10: run the availability/load driver across all clients for
# DURATION seconds using LOADTEST as the workload.
# NOTE(review): the default below is 2*60*60 = 2 hours, but the run_test
# banner advertises "6 hours" -- the two should be reconciled upstream.
632 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default (banner says 6)
633 LOADTEST=${LOADTEST:-metadata-load.py}
634 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
636 run_test 10 "Running Availability for 6 hours..."
638 equals_msg "Done, cleaning up"