lustre/tests/insanity.sh

   1 #!/bin/sh
   2 # Test multiple failures, AKA Test 17
   3
   4 set -e
   5
   6 LUSTRE=${LUSTRE:-`dirname $0`/..}
   7 . $LUSTRE/tests/test-framework.sh
   8
   9 init_test_env $@
  10
  11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
  12
  13 ALWAYS_EXCEPT="10"
  14
  15 SETUP=${SETUP:-"setup"}
  16 CLEANUP=${CLEANUP:-"cleanup"}
  17 UPCALL=${UPCALL:-DEFAULT}
  18
  19 build_test_filter
  20
  21 assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
  22
  23 ####
  24 # Initialize all the ostN_HOST
  25 NUMOST=2
  26 if [ "$EXTRA_OSTS" ]; then
  27     for host in $EXTRA_OSTS; do
  28         NUMOST=$((NUMOST + 1))
  29         OST=ost$NUMOST
  30         eval ${OST}_HOST=$host
  31     done
  32 fi
  33
  34 # This can be a regexp, to allow more clients
  35 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
  36
  37 DIR=${DIR:-$MOUNT}
  38
  39 #####
  40 # fail clients round robin
  41
  42 # list of failable clients
  43 FAIL_LIST=($FAIL_CLIENTS)
  44 FAIL_NUM=${#FAIL_LIST[*]}
  45 FAIL_NEXT=0
  46 typeset -i  FAIL_NEXT
  47 DOWN_NUM=0   # number of nodes currently down
  48
  49 # set next client to fail
  50 set_fail_client() {
  51     FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
  52     FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
  53     echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
  54 }
  55
  56 shutdown_client() {
  57     client=$1
  58     if [ "$FAILURE_MODE" = HARD ]; then
  59        $POWER_DOWN $client
  60        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
  61            echo "waiting for node $client to fail"
  62            sleep 1
  63        done
  64     elif [ "$FAILURE_MODE" = SOFT ]; then
  65        zconf_umount $client $MOUNT -f
  66     fi
  67 }
  68
  69 reboot_node() {
  70     NODE=$1
  71     if [ "$FAILURE_MODE" = HARD ]; then
  72        $POWER_UP $NODE
  73     fi
  74 }
  75
  76 fail_clients() {
  77     num=$1
  78     if [ -z "$num"  ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
  79         num=$((FAIL_NUM - DOWN_NUM))
  80     fi
  81
  82     if [ -z "$num" ] || [ "$num" -le 0 ]; then
  83         return
  84     fi
  85
  86     client_mkdirs
  87
  88     for i in `seq $num`; do
  89        set_fail_client
  90        client=$FAIL_CLIENT
  91        DOWN_CLIENTS="$DOWN_CLIENTS $client"
  92        shutdown_client $client
  93     done
  94
  95     echo "down clients: $DOWN_CLIENTS"
  96
  97     for client in $DOWN_CLIENTS; do
  98         reboot_node $client
  99     done
 100     DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
 101     client_rmdirs
 102 }
 103
 104 reintegrate_clients() {
 105     for client in $DOWN_CLIENTS; do
 106         wait_for_host $client
 107         echo "Restarting $client"
 108         zconf_mount $client $MOUNT || return 1
 109     done
 110     DOWN_CLIENTS=""
 111     DOWN_NUM=0
 112 }
 113
 114 gen_config() {
 115     rm -f $XMLCONFIG
 116     add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
 117
 118     if [ ! -z "$mdsfailover_HOST" ]; then
 119          add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
 120     fi
 121
 122     add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
 123         --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
 124     for i in `seq $NUMOST`; do
 125         dev=`printf $OSTDEV $i`
 126         add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
 127             --journal-size $OSTJOURNALSIZE
 128     done
 129
 130
 131     add_client client mds --lov lov1 --path $MOUNT
 132 }
 133
 134 setup() {
 135     gen_config
 136
 137     rm -rf logs/*
 138     for i in `seq $NUMOST`; do
 139         wait_for ost$i
 140         start ost$i ${REFORMAT} $OSTLCONFARGS
 141     done
 142     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
 143     wait_for mds
 144     start mds $MDSLCONFARGS ${REFORMAT}
 145     while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
 146     grep " $MOUNT " /proc/mounts || zconf_mount $CLIENTS $MOUNT
 147
 148 }
 149
 150 cleanup() {
 151     zconf_umount $CLIENTS $MOUNT
 152
 153     stop mds ${FORCE} $MDSLCONFARGS || :
 154     for i in `seq $NUMOST`; do
 155         stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS  || :
 156     done
 157 }
 158
 159 trap exit INT
 160
 161 client_touch() {
 162     file=$1
 163     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 164         if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
 165         $PDSH $c touch $MOUNT/${c}_$file || return 1
 166     done
 167 }
 168
 169 client_rm() {
 170     file=$1
 171     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 172         $PDSH $c rm $MOUNT/${c}_$file
 173     done
 174 }
 175
 176 client_mkdirs() {
 177     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 178         echo "$c mkdir $MOUNT/$c"
 179         $PDSH $c "mkdir $MOUNT/$c"
 180         $PDSH $c "ls -l $MOUNT/$c"
 181     done
 182 }
 183
 184 client_rmdirs() {
 185     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 186         echo "rmdir $MOUNT/$c"
 187         $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
 188     done
 189 }
 190
 191 clients_recover_osts() {
 192     facet=$1
 193 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
 194 }
 195
 196 node_to_ost() {
 197     node=$1
 198     retvar=$2
 199     for i in `seq $NUMOST`; do
 200         ostvar="ost${i}_HOST"
 201         if [ "${!ostvar}" == $node ]; then
 202             eval $retvar=ost${i}
 203             return 0
 204         fi
 205     done
 206     echo "No ost found for node; $node"
 207     return 1
 208
 209 }
 210
 211
 212
 213 if [ "$ONLY" == "cleanup" ]; then
 214     $CLEANUP
 215     exit
 216 fi
 217
 218 if [ ! -z "$EVAL" ]; then
 219     eval "$EVAL"
 220     exit $?
 221 fi
 222
 223 $SETUP
 224
 225 if [ "$ONLY" == "setup" ]; then
 226     exit 0
 227 fi
 228
 229 # 9 Different Failure Modes Combinations
 230 echo "Starting Test 17 at `date`"
 231
 232 test_0() {
 233     echo "Failover MDS"
 234     facet_failover mds
 235     echo "Waiting for df pid: $DFPID"
 236     wait $DFPID || { echo "df returned $?" && return 1; }
 237
 238     echo "Failing OST1"
 239     facet_failover ost1
 240     echo "Waiting for df pid: $DFPID"
 241     wait $DFPID || { echo "df returned $?" && return 2; }
 242
 243     echo "Failing OST2"
 244     facet_failover ost2
 245     echo "Waiting for df pid: $DFPID"
 246     wait $DFPID || { echo "df returned $?" && return 3; }
 247     return 0
 248 }
 249 run_test 0 "Fail all nodes, independently"
 250
 251 ############### First Failure Mode ###############
 252 test_1() {
 253 echo "Don't do a MDS - MDS Failure Case"
 254 echo "This makes no sense"
 255 }
 256 run_test 1 "MDS/MDS failure"
 257 ###################################################
 258
 259 ############### Second Failure Mode ###############
 260 test_2() {
 261     echo "Verify Lustre filesystem is up and running"
 262     client_df
 263
 264     echo "Failing MDS"
 265     shutdown_facet mds
 266     reboot_facet mds
 267
 268     # prepare for MDS failover
 269     change_active mds
 270     reboot_facet mds
 271
 272     client_df &
 273     DFPID=$!
 274     sleep 5
 275
 276     echo "Failing OST"
 277     shutdown_facet ost1
 278
 279     echo "Reintegrating OST"
 280     reboot_facet ost1
 281     wait_for ost1
 282     start ost1
 283
 284     echo "Failover MDS"
 285     wait_for mds
 286     start mds
 287
 288     #Check FS
 289     wait $DFPID
 290     clients_recover_osts ost1
 291     echo "Verify reintegration"
 292     client_df || return 1
 293
 294 }
 295 run_test 2 "Second Failure Mode: MDS/OST `date`"
 296 ###################################################
 297
 298
 299 ############### Third Failure Mode ###############
 300 test_3() {
 301     #Create files
 302     echo "Verify Lustre filesystem is up and running"
 303
 304     #MDS Portion
 305     facet_failover mds
 306     wait $DFPID || echo df failed: $?
 307     #Check FS
 308
 309     echo "Test Lustre stability after MDS failover"
 310     client_df
 311
 312     #CLIENT Portion
 313     echo "Failing 2 CLIENTS"
 314     fail_clients 2
 315
 316     #Check FS
 317     echo "Test Lustre stability after CLIENT failure"
 318     client_df
 319
 320     #Reintegration
 321     echo "Reintegrating CLIENTS"
 322     reintegrate_clients || return 1
 323
 324     client_df || return 3
 325 }
 326 run_test 3  "Thirdb Failure Mode: MDS/CLIENT `date`"
 327 ###################################################
 328
 329 ############### Fourth Failure Mode ###############
 330 test_4() {
 331     echo "Fourth Failure Mode: OST/MDS `date`"
 332
 333     #OST Portion
 334     echo "Failing OST ost1"
 335     shutdown_facet ost1
 336
 337     #Check FS
 338     echo "Test Lustre stability after OST failure"
 339     client_df
 340
 341     #MDS Portion
 342     echo "Failing MDS"
 343     shutdown_facet mds
 344     reboot_facet mds
 345
 346     # prepare for MDS failover
 347     change_active mds
 348     reboot_facet mds
 349
 350     client_df &
 351     DFPID=$!
 352     sleep 5
 353
 354     #Reintegration
 355     echo "Reintegrating OST"
 356     reboot_facet ost1
 357     wait_for ost1
 358     start ost1
 359
 360     echo "Failover MDS"
 361     wait_for mds
 362     start mds
 363     #Check FS
 364
 365     wait $DFPID
 366     clients_recover_osts ost1
 367     echo "Test Lustre stability after MDS failover"
 368     client_df || return 1
 369 }
 370 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
 371 ###################################################
 372
 373 ############### Fifth Failure Mode ###############
 374 test_5() {
 375     echo "Fifth Failure Mode: OST/OST `date`"
 376
 377     #Create files
 378     echo "Verify Lustre filesystem is up and running"
 379     client_df
 380
 381     #OST Portion
 382     echo "Failing OST"
 383     shutdown_facet ost1
 384     reboot_facet ost1
 385
 386     #Check FS
 387     echo "Test Lustre stability after OST failure"
 388     client_df
 389
 390     #OST Portion
 391     echo "Failing OST"
 392     shutdown_facet ost2
 393     reboot_facet ost2
 394
 395     #Check FS
 396     echo "Test Lustre stability after OST failure"
 397     client_df
 398
 399     #Reintegration
 400     echo "Reintegrating OSTs"
 401     wait_for ost1
 402     start ost1
 403     wait_for ost2
 404     start ost2
 405
 406     clients_recover_osts ost1
 407     clients_recover_osts ost2
 408     sleep $TIMEOUT
 409
 410     client_df || return 2
 411 }
 412 run_test 5 "Fifth Failure Mode: OST/OST `date`"
 413 ###################################################
 414
 415 ############### Sixth Failure Mode ###############
 416 test_6() {
 417     echo "Sixth Failure Mode: OST/CLIENT `date`"
 418
 419     #Create files
 420     echo "Verify Lustre filesystem is up and running"
 421     client_df || return 1
 422     client_touch testfile || return 2
 423
 424     #OST Portion
 425     echo "Failing OST"
 426     shutdown_facet ost1
 427     reboot_facet ost1
 428
 429     #Check FS
 430     echo "Test Lustre stability after OST failure"
 431     client_df
 432
 433     #CLIENT Portion
 434     echo "Failing CLIENTs"
 435     fail_clients
 436
 437     #Check FS
 438     echo "Test Lustre stability after CLIENTs failure"
 439     client_df
 440
 441     #Reintegration
 442     echo "Reintegrating OST/CLIENTs"
 443     wait_for ost1
 444     start ost1
 445     reintegrate_clients
 446     sleep 5
 447
 448     echo "Verifying mount"
 449     client_df || return 3
 450 }
 451 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
 452 ###################################################
 453
 454
 455 ############### Seventh Failure Mode ###############
 456 test_7() {
 457     echo "Seventh Failure Mode: CLIENT/MDS `date`"
 458
 459     #Create files
 460     echo "Verify Lustre filesystem is up and running"
 461     client_df
 462     client_touch testfile  || return 1
 463
 464     #CLIENT Portion
 465     echo "Part 1: Failing CLIENT"
 466     fail_clients 2
 467
 468     #Check FS
 469     echo "Test Lustre stability after CLIENTs failure"
 470     client_df
 471     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 472     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 473
 474     #Sleep
 475     echo "Wait 1 minutes"
 476     sleep 60
 477
 478     #Create files
 479     echo "Verify Lustre filesystem is up and running"
 480     client_df
 481     client_rm testfile
 482
 483     #MDS Portion
 484     echo "Failing MDS"
 485     facet_failover mds
 486
 487     #Check FS
 488     echo "Test Lustre stability after MDS failover"
 489     wait $DFPID || echo "df on down clients fails " || return 1
 490     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 491     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 492
 493     #Reintegration
 494     echo "Reintegrating CLIENTs"
 495     reintegrate_clients
 496     client_df || return 2
 497
 498     #Sleep
 499     echo "wait 1 minutes"
 500     sleep 60
 501 }
 502 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
 503 ###################################################
 504
 505
 506 ############### Eighth Failure Mode ###############
 507 test_8() {
 508     echo "Eighth Failure Mode: CLIENT/OST `date`"
 509
 510     #Create files
 511     echo "Verify Lustre filesystem is up and running"
 512     client_df
 513     client_touch testfile
 514
 515     #CLIENT Portion
 516     echo "Failing CLIENTs"
 517     fail_clients 2
 518
 519     #Check FS
 520     echo "Test Lustre stability after CLIENTs failure"
 521     client_df
 522     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 523     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 524
 525     #Sleep
 526     echo "Wait 1 minutes"
 527     sleep 60
 528
 529     #Create files
 530     echo "Verify Lustre filesystem is up and running"
 531     client_df
 532     client_touch testfile
 533
 534
 535     #OST Portion
 536     echo "Failing OST"
 537     shutdown_facet ost1
 538     reboot_facet ost1
 539
 540     #Check FS
 541     echo "Test Lustre stability after OST failure"
 542     client_df
 543     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 544     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 545
 546     #Reintegration
 547     echo "Reintegrating CLIENTs/OST"
 548     reintegrate_clients
 549     wait_for ost1
 550     start ost1
 551     client_df || return 1
 552     client_touch testfile2 || return 2
 553
 554     #Sleep
 555     echo "Wait 1 minutes"
 556     sleep 60
 557 }
 558 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
 559 ###################################################
 560
 561
 562 ############### Ninth Failure Mode ###############
 563 test_9() {
 564     echo
 565
 566     #Create files
 567     echo "Verify Lustre filesystem is up and running"
 568     client_df
 569     client_touch testfile || return 1
 570
 571     #CLIENT Portion
 572     echo "Failing CLIENTs"
 573     fail_clients 2
 574
 575     #Check FS
 576     echo "Test Lustre stability after CLIENTs failure"
 577     client_df
 578     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
 579     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
 580
 581     #Sleep
 582     echo "Wait 1 minutes"
 583     sleep 60
 584
 585     #Create files
 586     echo "Verify Lustre filesystem is up and running"
 587     $PDSH $LIVE_CLIENT df $MOUNT || return 3
 588     client_touch testfile || return 4
 589
 590     #CLIENT Portion
 591     echo "Failing CLIENTs"
 592     fail_clients 2
 593
 594     #Check FS
 595     echo "Test Lustre stability after CLIENTs failure"
 596     client_df
 597     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
 598     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
 599
 600     #Reintegration
 601     echo "Reintegrating  CLIENTs/CLIENTs"
 602     reintegrate_clients
 603     client_df || return 7
 604
 605     #Sleep
 606     echo "Wait 1 minutes"
 607     sleep 60
 608 }
 609 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
 610 ###################################################
 611
 612 test_10() {
 613     #Run availability after all failures
 614     DURATION=${DURATION:-$((2 * 60 * 60))} # 6 hours default
 615     LOADTEST=${LOADTEST:-metadata-load.py}
 616     $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
 617 }
 618 run_test 10 "Running Availability for 6 hours..."
 619
 620 equals_msg "Done, cleaning up"
 621 $CLEANUP