lustre/tests/insanity.sh

   1 #!/bin/sh
   2 # Test multiple failures, AKA Test 17
   3
   4 set -e
   5
   6 LUSTRE=${LUSTRE:-`dirname $0`/..}
   7 . $LUSTRE/tests/test-framework.sh
   8
   9 init_test_env $@
  10
  11 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  12 #
  13 ALWAYS_EXCEPT="10  $INSANITY_EXCEPT"
  14
  15 if [ "$FAILURE_MODE" = "HARD" ]; then
  16     mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \
  17         echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " && \
  18         echo "Except the tests: $CONFIG_EXCEPTIONS" && \
  19         ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
  20 fi
  21
  22 #
  23 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
  24
  25 SETUP=${SETUP:-""}
  26 CLEANUP=${CLEANUP:-""}
  27
  28 build_test_filter
  29
  30 SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME}
  31 LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT}
  32 FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS}
  33
  34 assert_env mds_HOST MDS_MKFS_OPTS
  35 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
  36 assert_env LIVE_CLIENT FSNAME
  37
  38 # FAIL_CLIENTS list should not contain the LIVE_CLIENT
  39 FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g")
  40
  41 DIR=${DIR:-$MOUNT}
  42
  43 #####
  44 # fail clients round robin
  45
  46 # list of failable clients
  47 FAIL_LIST=($FAIL_CLIENTS)
  48 FAIL_NUM=${#FAIL_LIST[*]}
  49 FAIL_NEXT=0
  50 typeset -i  FAIL_NEXT
  51 DOWN_NUM=0   # number of nodes currently down
  52
  53 # set next client to fail
  54 set_fail_client() {
  55     FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
  56     FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
  57     echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
  58 }
  59
  60 shutdown_client() {
  61     client=$1
  62     if [ "$FAILURE_MODE" = HARD ]; then
  63        $POWER_DOWN $client
  64        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
  65            echo "waiting for node $client to fail"
  66            sleep 1
  67        done
  68     elif [ "$FAILURE_MODE" = SOFT ]; then
  69        zconf_umount $client $MOUNT -f
  70     fi
  71 }
  72
  73 reboot_node() {
  74     NODE=$1
  75     if [ "$FAILURE_MODE" = HARD ]; then
  76        $POWER_UP $NODE
  77     fi
  78 }
  79
  80 fail_clients() {
  81     num=$1
  82
  83     log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM"
  84     if [ -z "$num"  ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
  85         num=$((FAIL_NUM - DOWN_NUM))
  86     fi
  87
  88     if [ -z "$num" ] || [ "$num" -le 0 ]; then
  89         log "No clients failed!"
  90         return
  91     fi
  92
  93     client_mkdirs
  94
  95     for i in `seq $num`; do
  96        set_fail_client
  97        client=$FAIL_CLIENT
  98        DOWN_CLIENTS="$DOWN_CLIENTS $client"
  99        shutdown_client $client
 100     done
 101
 102     echo "down clients: $DOWN_CLIENTS"
 103
 104     for client in $DOWN_CLIENTS; do
 105         reboot_node $client
 106     done
 107     DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
 108     client_rmdirs
 109 }
 110
 111 reintegrate_clients() {
 112     for client in $DOWN_CLIENTS; do
 113         wait_for_host $client
 114         echo "Restarting $client"
 115         zconf_mount $client $MOUNT || return 1
 116     done
 117     DOWN_CLIENTS=""
 118     DOWN_NUM=0
 119 }
 120
 121 start_ost() {
 122     start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
 123 }
 124
 125 trap exit INT
 126
 127 client_touch() {
 128     file=$1
 129     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 130         if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
 131         $PDSH $c touch $MOUNT/${c}_$file || return 1
 132     done
 133 }
 134
 135 client_rm() {
 136     file=$1
 137     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 138         $PDSH $c rm $MOUNT/${c}_$file
 139     done
 140 }
 141
 142 client_mkdirs() {
 143     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 144         echo "$c mkdir $MOUNT/$c"
 145         $PDSH $c "mkdir $MOUNT/$c"
 146         $PDSH $c "ls -l $MOUNT/$c"
 147     done
 148 }
 149
 150 client_rmdirs() {
 151     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 152         echo "rmdir $MOUNT/$c"
 153         $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
 154     done
 155 }
 156
 157 clients_recover_osts() {
 158     facet=$1
 159 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
 160 }
 161
 162 cleanup_and_setup_lustre
 163
 164 # 9 Different Failure Modes Combinations
 165 echo "Starting Test 17 at `date`"
 166
 167 test_0() {
 168     facet_failover $SINGLEMDS
 169     echo "Waiting for df pid: $DFPID"
 170     wait $DFPID || { echo "df returned $?" && return 1; }
 171
 172     for i in $(seq $OSTCOUNT) ; do
 173         facet_failover ost$i || return 4
 174         echo "Waiting for df pid: $DFPID"
 175         wait $DFPID || { echo "df returned $?" && return 3; }
 176     done
 177     return 0
 178 }
 179 run_test 0 "Fail all nodes, independently"
 180
 181 ############### First Failure Mode ###############
 182 test_1() {
 183 echo "Don't do a MDS - MDS Failure Case"
 184 echo "This makes no sense"
 185 }
 186 run_test 1 "MDS/MDS failure"
 187 ###################################################
 188
 189 ############### Second Failure Mode ###############
 190 test_2() {
 191     echo "Verify Lustre filesystem is up and running"
 192     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 193
 194     client_df
 195
 196     shutdown_facet $SINGLEMDS
 197     reboot_facet $SINGLEMDS
 198
 199     # prepare for MDS failover
 200     change_active $SINGLEMDS
 201     reboot_facet $SINGLEMDS
 202
 203     client_df &
 204     DFPID=$!
 205     sleep 5
 206
 207     shutdown_facet ost1
 208
 209     echo "Reintegrating OST"
 210     reboot_facet ost1
 211     wait_for ost1
 212     start_ost 1 || return 2
 213
 214     wait_for $SINGLEMDS
 215     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
 216
 217     #Check FS
 218     wait $DFPID
 219     clients_recover_osts ost1
 220     echo "Verify reintegration"
 221     client_df || return 1
 222
 223 }
 224 run_test 2 "Second Failure Mode: MDS/OST `date`"
 225 ###################################################
 226
 227
 228 ############### Third Failure Mode ###############
 229 test_3() {
 230     #Create files
 231     echo "Verify Lustre filesystem is up and running"
 232     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 233
 234     #MDS Portion
 235     facet_failover $SINGLEMDS
 236     wait $DFPID || echo df failed: $?
 237     #Check FS
 238
 239     echo "Test Lustre stability after MDS failover"
 240     client_df
 241
 242     #CLIENT Portion
 243     echo "Failing 2 CLIENTS"
 244     fail_clients 2
 245
 246     #Check FS
 247     echo "Test Lustre stability after CLIENT failure"
 248     client_df
 249
 250     #Reintegration
 251     echo "Reintegrating CLIENTS"
 252     reintegrate_clients || return 1
 253
 254     client_df || return 3
 255     sleep 2 # give it a little time for fully recovered before next test
 256 }
 257 run_test 3  "Thirdb Failure Mode: MDS/CLIENT `date`"
 258 ###################################################
 259
 260 ############### Fourth Failure Mode ###############
 261 test_4() {
 262     echo "Fourth Failure Mode: OST/MDS `date`"
 263
 264     #OST Portion
 265     shutdown_facet ost1
 266
 267     #Check FS
 268     echo "Test Lustre stability after OST failure"
 269     client_df &
 270     DFPIDA=$!
 271     sleep 5
 272
 273     #MDS Portion
 274     shutdown_facet $SINGLEMDS
 275     reboot_facet $SINGLEMDS
 276
 277     # prepare for MDS failover
 278     change_active $SINGLEMDS
 279     reboot_facet $SINGLEMDS
 280
 281     client_df &
 282     DFPIDB=$!
 283     sleep 5
 284
 285     #Reintegration
 286     echo "Reintegrating OST"
 287     reboot_facet ost1
 288     wait_for ost1
 289     start_ost 1
 290
 291     wait_for $SINGLEMDS
 292     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
 293     #Check FS
 294
 295     wait $DFPIDA
 296     wait $DFPIDB
 297     clients_recover_osts ost1
 298     echo "Test Lustre stability after MDS failover"
 299     client_df || return 1
 300 }
 301 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
 302 ###################################################
 303
 304 ############### Fifth Failure Mode ###############
 305 test_5() {
 306     [ $OSTCOUNT -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0
 307
 308     echo "Fifth Failure Mode: OST/OST `date`"
 309
 310     #Create files
 311     echo "Verify Lustre filesystem is up and running"
 312     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 313
 314     client_df
 315
 316     #OST Portion
 317     shutdown_facet ost1
 318     reboot_facet ost1
 319
 320     #Check FS
 321     echo "Test Lustre stability after OST failure"
 322     client_df &
 323     DFPIDA=$!
 324     sleep 5
 325
 326     #OST Portion
 327     shutdown_facet ost2
 328     reboot_facet ost2
 329
 330     #Check FS
 331     echo "Test Lustre stability after OST failure"
 332     client_df &
 333     DFPIDB=$!
 334     sleep 5
 335
 336     #Reintegration
 337     echo "Reintegrating OSTs"
 338     wait_for ost1
 339     start_ost 1
 340     wait_for ost2
 341     start_ost 2
 342
 343     clients_recover_osts ost1
 344     clients_recover_osts ost2
 345     sleep $TIMEOUT
 346
 347     wait $DFPIDA
 348     wait $DFPIDB
 349     client_df || return 2
 350 }
 351 run_test 5 "Fifth Failure Mode: OST/OST `date`"
 352 ###################################################
 353
 354 ############### Sixth Failure Mode ###############
 355 test_6() {
 356     echo "Sixth Failure Mode: OST/CLIENT `date`"
 357
 358     #Create files
 359     echo "Verify Lustre filesystem is up and running"
 360     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 361
 362     client_df
 363     client_touch testfile || return 2
 364
 365     #OST Portion
 366     shutdown_facet ost1
 367     reboot_facet ost1
 368
 369     #Check FS
 370     echo "Test Lustre stability after OST failure"
 371     client_df &
 372     DFPIDA=$!
 373     echo DFPIDA=$DFPIDA
 374     sleep 5
 375
 376     #CLIENT Portion
 377     echo "Failing CLIENTs"
 378     fail_clients
 379
 380     #Check FS
 381     echo "Test Lustre stability after CLIENTs failure"
 382     client_df &
 383     DFPIDB=$!
 384     echo DFPIDB=$DFPIDB
 385     sleep 5
 386
 387     #Reintegration
 388     echo "Reintegrating OST/CLIENTs"
 389     wait_for ost1
 390     start_ost 1
 391     reintegrate_clients || return 1
 392     sleep 5
 393
 394     wait_remote_prog df $((TIMEOUT * 3 + 10))
 395     wait $DFPIDA
 396     wait $DFPIDB
 397
 398     echo "Verifying mount"
 399     [ -z "$(mounted_lustre_filesystems)" ] && return 3
 400     client_df
 401 }
 402 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
 403 ###################################################
 404
 405
 406 ############### Seventh Failure Mode ###############
 407 test_7() {
 408     echo "Seventh Failure Mode: CLIENT/MDS `date`"
 409
 410     #Create files
 411     echo "Verify Lustre filesystem is up and running"
 412     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 413
 414     client_df
 415     client_touch testfile  || return 1
 416
 417     #CLIENT Portion
 418     echo "Part 1: Failing CLIENT"
 419     fail_clients 2
 420
 421     #Check FS
 422     echo "Test Lustre stability after CLIENTs failure"
 423     client_df
 424     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 425     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 426
 427     #Sleep
 428     echo "Wait 1 minutes"
 429     sleep 60
 430
 431     #Create files
 432     echo "Verify Lustre filesystem is up and running"
 433     [ -z "$(mounted_lustre_filesystems)" ] && return 2
 434
 435     client_df
 436     client_rm testfile
 437
 438     #MDS Portion
 439     facet_failover $SINGLEMDS
 440
 441     #Check FS
 442     echo "Test Lustre stability after MDS failover"
 443     wait $DFPID || echo "df on down clients fails " || return 1
 444     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 445     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 446
 447     #Reintegration
 448     echo "Reintegrating CLIENTs"
 449     reintegrate_clients || return 2
 450     client_df
 451
 452     #Sleep
 453     echo "wait 1 minutes"
 454     sleep 60
 455 }
 456 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
 457 ###################################################
 458
 459
 460 ############### Eighth Failure Mode ###############
 461 test_8() {
 462     echo "Eighth Failure Mode: CLIENT/OST `date`"
 463
 464     #Create files
 465     echo "Verify Lustre filesystem is up and running"
 466     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 467
 468     client_df
 469     client_touch testfile
 470
 471     #CLIENT Portion
 472     echo "Failing CLIENTs"
 473     fail_clients 2
 474
 475     #Check FS
 476     echo "Test Lustre stability after CLIENTs failure"
 477     client_df
 478     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 479     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 480
 481     #Sleep
 482     echo "Wait 1 minutes"
 483     sleep 60
 484
 485     #Create files
 486     echo "Verify Lustre filesystem is up and running"
 487     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 488
 489     client_df
 490     client_touch testfile
 491
 492
 493     #OST Portion
 494     shutdown_facet ost1
 495     reboot_facet ost1
 496
 497     #Check FS
 498     echo "Test Lustre stability after OST failure"
 499     client_df &
 500     DFPID=$!
 501     sleep 5
 502     #non-failout hangs forever here
 503     #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
 504     #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 505
 506     #Reintegration
 507     echo "Reintegrating CLIENTs/OST"
 508     reintegrate_clients || return 3
 509     wait_for ost1
 510     start_ost 1
 511     wait $DFPID
 512     client_df || return 1
 513     client_touch testfile2 || return 2
 514
 515     #Sleep
 516     echo "Wait 1 minutes"
 517     sleep 60
 518 }
 519 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
 520 ###################################################
 521
 522
 523 ############### Ninth Failure Mode ###############
 524 test_9() {
 525     echo
 526
 527     #Create files
 528     echo "Verify Lustre filesystem is up and running"
 529     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 530
 531     client_df
 532     client_touch testfile || return 1
 533
 534     #CLIENT Portion
 535     echo "Failing CLIENTs"
 536     fail_clients 2
 537
 538     #Check FS
 539     echo "Test Lustre stability after CLIENTs failure"
 540     client_df
 541     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
 542     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
 543
 544     #Sleep
 545     echo "Wait 1 minutes"
 546     sleep 60
 547
 548     #Create files
 549     echo "Verify Lustre filesystem is up and running"
 550     $PDSH $LIVE_CLIENT df $MOUNT || return 3
 551     client_touch testfile || return 4
 552
 553     #CLIENT Portion
 554     echo "Failing CLIENTs"
 555     fail_clients 2
 556
 557     #Check FS
 558     echo "Test Lustre stability after CLIENTs failure"
 559     client_df
 560     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
 561     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
 562
 563     #Reintegration
 564     echo "Reintegrating  CLIENTs/CLIENTs"
 565     reintegrate_clients || return 7
 566     client_df
 567
 568     #Sleep
 569     echo "Wait 1 minutes"
 570     sleep 60
 571 }
 572 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
 573 ###################################################
 574
 575 test_10() {
 576     #Run availability after all failures
 577     DURATION=${DURATION:-$((2 * 60 * 60))} # 6 hours default
 578     LOADTEST=${LOADTEST:-metadata-load.py}
 579     $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
 580 }
 581 run_test 10 "Running Availability for 6 hours..."
 582
 583 equals_msg `basename $0`: test complete, cleaning up
 584 check_and_cleanup_lustre
 585 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true