lustre/tests/insanity.sh

   1 #!/bin/sh
   2 # Test multiple failures, AKA Test 17
   3
   4 set -e
   5
   6 LUSTRE=${LUSTRE:-`dirname $0`/..}
   7 . $LUSTRE/tests/test-framework.sh
   8
   9 init_test_env $@
  10
  11 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  12 #
  13 ALWAYS_EXCEPT="10  $INSANITY_EXCEPT"
  14
  15 if [ "$FAILURE_MODE" = "HARD" ]; then
  16     mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \
  17         echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " && \
  18         echo "Except the tests: $CONFIG_EXCEPTIONS" && \
  19         ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
  20 fi
  21
  22 #
  23 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
  24
  25 SETUP=${SETUP:-""}
  26 CLEANUP=${CLEANUP:-""}
  27
  28 build_test_filter
  29
  30 SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME}
  31 LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT}
  32 FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS}
  33
  34 assert_env mds_HOST MDS_MKFS_OPTS
  35 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
  36 assert_env LIVE_CLIENT FSNAME
  37
  38 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
  39 remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
  40
  41 # FAIL_CLIENTS list should not contain the LIVE_CLIENT
  42 FAIL_CLIENTS=$(echo " $FAIL_CLIENTS " | sed -re "s/\s+$LIVE_CLIENT\s+/ /g")
  43
  44 DIR=${DIR:-$MOUNT}
  45
  46 #####
  47 # fail clients round robin
  48
  49 # list of failable clients
  50 FAIL_LIST=($FAIL_CLIENTS)
  51 FAIL_NUM=${#FAIL_LIST[*]}
  52 FAIL_NEXT=0
  53 typeset -i  FAIL_NEXT
  54 DOWN_NUM=0   # number of nodes currently down
  55
  56 # set next client to fail
  57 set_fail_client() {
  58     FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
  59     FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
  60     echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
  61 }
  62
  63 shutdown_client() {
  64     client=$1
  65     if [ "$FAILURE_MODE" = HARD ]; then
  66        $POWER_DOWN $client
  67        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
  68            echo "waiting for node $client to fail"
  69            sleep 1
  70        done
  71     elif [ "$FAILURE_MODE" = SOFT ]; then
  72        zconf_umount $client $MOUNT -f
  73     fi
  74 }
  75
  76 fail_clients() {
  77     num=$1
  78
  79     log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM"
  80     if [ -z "$num"  ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
  81         num=$((FAIL_NUM - DOWN_NUM))
  82     fi
  83
  84     if [ -z "$num" ] || [ "$num" -le 0 ]; then
  85         log "No clients failed!"
  86         return
  87     fi
  88
  89     client_mkdirs
  90
  91     for i in `seq $num`; do
  92        set_fail_client
  93        client=$FAIL_CLIENT
  94        DOWN_CLIENTS="$DOWN_CLIENTS $client"
  95        shutdown_client $client
  96     done
  97
  98     echo "down clients: $DOWN_CLIENTS"
  99
 100     for client in $DOWN_CLIENTS; do
 101         boot_node $client
 102     done
 103     DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
 104     client_rmdirs
 105 }
 106
 107 reintegrate_clients() {
 108     for client in $DOWN_CLIENTS; do
 109         wait_for_host $client
 110         echo "Restarting $client"
 111         zconf_mount $client $MOUNT || return 1
 112     done
 113     DOWN_CLIENTS=""
 114     DOWN_NUM=0
 115 }
 116
 117 start_ost() {
 118     start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
 119 }
 120
 121 trap exit INT
 122
 123 client_touch() {
 124     file=$1
 125     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 126         if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
 127         $PDSH $c touch $MOUNT/${c}_$file || return 1
 128     done
 129 }
 130
 131 client_rm() {
 132     file=$1
 133     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 134         $PDSH $c rm $MOUNT/${c}_$file
 135     done
 136 }
 137
 138 client_mkdirs() {
 139     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 140         echo "$c mkdir $MOUNT/$c"
 141         $PDSH $c "mkdir $MOUNT/$c"
 142         $PDSH $c "ls -l $MOUNT/$c"
 143     done
 144 }
 145
 146 client_rmdirs() {
 147     for c in $LIVE_CLIENT $FAIL_CLIENTS;  do
 148         echo "rmdir $MOUNT/$c"
 149         $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
 150     done
 151 }
 152
 153 clients_recover_osts() {
 154     facet=$1
 155 #    do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
 156 }
 157
 158 check_and_setup_lustre
 159
 160 # 9 Different Failure Modes Combinations
 161 echo "Starting Test 17 at `date`"
 162
 163 test_0() {
 164     facet_failover $SINGLEMDS
 165     echo "Waiting for df pid: $DFPID"
 166     wait $DFPID || { echo "df returned $?" && return 1; }
 167
 168     for i in $(seq $OSTCOUNT) ; do
 169         facet_failover ost$i || return 4
 170         echo "Waiting for df pid: $DFPID"
 171         wait $DFPID || { echo "df returned $?" && return 3; }
 172     done
 173     return 0
 174 }
 175 run_test 0 "Fail all nodes, independently"
 176
 177 ############### First Failure Mode ###############
 178 test_1() {
 179 echo "Don't do a MDS - MDS Failure Case"
 180 echo "This makes no sense"
 181 }
 182 run_test 1 "MDS/MDS failure"
 183 ###################################################
 184
 185 ############### Second Failure Mode ###############
 186 test_2() {
 187     echo "Verify Lustre filesystem is up and running"
 188     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 189
 190     client_df
 191
 192     shutdown_facet $SINGLEMDS
 193     reboot_facet $SINGLEMDS
 194
 195     # prepare for MDS failover
 196     change_active $SINGLEMDS
 197     reboot_facet $SINGLEMDS
 198
 199     client_df &
 200     DFPID=$!
 201     sleep 5
 202
 203     shutdown_facet ost1
 204
 205     echo "Reintegrating OST"
 206     reboot_facet ost1
 207     wait_for ost1
 208     start_ost 1 || return 2
 209
 210     wait_for $SINGLEMDS
 211     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
 212
 213     #Check FS
 214     wait $DFPID
 215     clients_recover_osts ost1
 216     echo "Verify reintegration"
 217     client_df || return 1
 218
 219 }
 220 run_test 2 "Second Failure Mode: MDS/OST `date`"
 221 ###################################################
 222
 223
 224 ############### Third Failure Mode ###############
 225 test_3() {
 226     #Create files
 227     echo "Verify Lustre filesystem is up and running"
 228     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 229
 230     #MDS Portion
 231     facet_failover $SINGLEMDS
 232     wait $DFPID || echo df failed: $?
 233     #Check FS
 234
 235     echo "Test Lustre stability after MDS failover"
 236     client_df
 237
 238     #CLIENT Portion
 239     echo "Failing 2 CLIENTS"
 240     fail_clients 2
 241
 242     #Check FS
 243     echo "Test Lustre stability after CLIENT failure"
 244     client_df
 245
 246     #Reintegration
 247     echo "Reintegrating CLIENTS"
 248     reintegrate_clients || return 1
 249
 250     client_df || return 3
 251     sleep 2 # give it a little time for fully recovered before next test
 252 }
 253 run_test 3  "Thirdb Failure Mode: MDS/CLIENT `date`"
 254 ###################################################
 255
 256 ############### Fourth Failure Mode ###############
 257 test_4() {
 258     echo "Fourth Failure Mode: OST/MDS `date`"
 259
 260     #OST Portion
 261     shutdown_facet ost1
 262
 263     #Check FS
 264     echo "Test Lustre stability after OST failure"
 265     client_df &
 266     DFPIDA=$!
 267     sleep 5
 268
 269     #MDS Portion
 270     shutdown_facet $SINGLEMDS
 271     reboot_facet $SINGLEMDS
 272
 273     # prepare for MDS failover
 274     change_active $SINGLEMDS
 275     reboot_facet $SINGLEMDS
 276
 277     client_df &
 278     DFPIDB=$!
 279     sleep 5
 280
 281     #Reintegration
 282     echo "Reintegrating OST"
 283     reboot_facet ost1
 284     wait_for ost1
 285     start_ost 1
 286
 287     wait_for $SINGLEMDS
 288     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
 289     #Check FS
 290
 291     wait $DFPIDA
 292     wait $DFPIDB
 293     clients_recover_osts ost1
 294     echo "Test Lustre stability after MDS failover"
 295     client_df || return 1
 296 }
 297 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
 298 ###################################################
 299
 300 ############### Fifth Failure Mode ###############
 301 test_5() {
 302     [ $OSTCOUNT -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0
 303
 304     echo "Fifth Failure Mode: OST/OST `date`"
 305
 306     #Create files
 307     echo "Verify Lustre filesystem is up and running"
 308     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 309
 310     client_df
 311
 312     #OST Portion
 313     shutdown_facet ost1
 314     reboot_facet ost1
 315
 316     #Check FS
 317     echo "Test Lustre stability after OST failure"
 318     client_df &
 319     DFPIDA=$!
 320     sleep 5
 321
 322     #OST Portion
 323     shutdown_facet ost2
 324     reboot_facet ost2
 325
 326     #Check FS
 327     echo "Test Lustre stability after OST failure"
 328     client_df &
 329     DFPIDB=$!
 330     sleep 5
 331
 332     #Reintegration
 333     echo "Reintegrating OSTs"
 334     wait_for ost1
 335     start_ost 1
 336     wait_for ost2
 337     start_ost 2
 338
 339     clients_recover_osts ost1
 340     clients_recover_osts ost2
 341     sleep $TIMEOUT
 342
 343     wait $DFPIDA
 344     wait $DFPIDB
 345     client_df || return 2
 346 }
 347 run_test 5 "Fifth Failure Mode: OST/OST `date`"
 348 ###################################################
 349
 350 ############### Sixth Failure Mode ###############
 351 test_6() {
 352     echo "Sixth Failure Mode: OST/CLIENT `date`"
 353
 354     #Create files
 355     echo "Verify Lustre filesystem is up and running"
 356     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 357
 358     client_df
 359     client_touch testfile || return 2
 360
 361     #OST Portion
 362     shutdown_facet ost1
 363     reboot_facet ost1
 364
 365     #Check FS
 366     echo "Test Lustre stability after OST failure"
 367     client_df &
 368     DFPIDA=$!
 369     echo DFPIDA=$DFPIDA
 370     sleep 5
 371
 372     #CLIENT Portion
 373     echo "Failing CLIENTs"
 374     fail_clients
 375
 376     #Check FS
 377     echo "Test Lustre stability after CLIENTs failure"
 378     client_df &
 379     DFPIDB=$!
 380     echo DFPIDB=$DFPIDB
 381     sleep 5
 382
 383     #Reintegration
 384     echo "Reintegrating OST/CLIENTs"
 385     wait_for ost1
 386     start_ost 1
 387     reintegrate_clients || return 1
 388     sleep 5
 389
 390     wait_remote_prog df $((TIMEOUT * 3 + 10))
 391     wait $DFPIDA
 392     wait $DFPIDB
 393
 394     echo "Verifying mount"
 395     [ -z "$(mounted_lustre_filesystems)" ] && return 3
 396     client_df
 397 }
 398 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
 399 ###################################################
 400
 401
 402 ############### Seventh Failure Mode ###############
 403 test_7() {
 404     echo "Seventh Failure Mode: CLIENT/MDS `date`"
 405
 406     #Create files
 407     echo "Verify Lustre filesystem is up and running"
 408     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 409
 410     client_df
 411     client_touch testfile  || return 1
 412
 413     #CLIENT Portion
 414     echo "Part 1: Failing CLIENT"
 415     fail_clients 2
 416
 417     #Check FS
 418     echo "Test Lustre stability after CLIENTs failure"
 419     client_df
 420     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 421     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 422
 423     #Sleep
 424     echo "Wait 1 minutes"
 425     sleep 60
 426
 427     #Create files
 428     echo "Verify Lustre filesystem is up and running"
 429     [ -z "$(mounted_lustre_filesystems)" ] && return 2
 430
 431     client_df
 432     client_rm testfile
 433
 434     #MDS Portion
 435     facet_failover $SINGLEMDS
 436
 437     #Check FS
 438     echo "Test Lustre stability after MDS failover"
 439     wait $DFPID || echo "df on down clients fails " || return 1
 440     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 441     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 442
 443     #Reintegration
 444     echo "Reintegrating CLIENTs"
 445     reintegrate_clients || return 2
 446     client_df
 447
 448     #Sleep
 449     echo "wait 1 minutes"
 450     sleep 60
 451 }
 452 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
 453 ###################################################
 454
 455
 456 ############### Eighth Failure Mode ###############
 457 test_8() {
 458     echo "Eighth Failure Mode: CLIENT/OST `date`"
 459
 460     #Create files
 461     echo "Verify Lustre filesystem is up and running"
 462     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 463
 464     client_df
 465     client_touch testfile
 466
 467     #CLIENT Portion
 468     echo "Failing CLIENTs"
 469     fail_clients 2
 470
 471     #Check FS
 472     echo "Test Lustre stability after CLIENTs failure"
 473     client_df
 474     $PDSH $LIVE_CLIENT "ls -l $MOUNT"
 475     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 476
 477     #Sleep
 478     echo "Wait 1 minutes"
 479     sleep 60
 480
 481     #Create files
 482     echo "Verify Lustre filesystem is up and running"
 483     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 484
 485     client_df
 486     client_touch testfile
 487
 488
 489     #OST Portion
 490     shutdown_facet ost1
 491     reboot_facet ost1
 492
 493     #Check FS
 494     echo "Test Lustre stability after OST failure"
 495     client_df &
 496     DFPID=$!
 497     sleep 5
 498     #non-failout hangs forever here
 499     #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
 500     #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
 501
 502     #Reintegration
 503     echo "Reintegrating CLIENTs/OST"
 504     reintegrate_clients || return 3
 505     wait_for ost1
 506     start_ost 1
 507     wait $DFPID
 508     client_df || return 1
 509     client_touch testfile2 || return 2
 510
 511     #Sleep
 512     echo "Wait 1 minutes"
 513     sleep 60
 514 }
 515 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
 516 ###################################################
 517
 518
 519 ############### Ninth Failure Mode ###############
 520 test_9() {
 521     echo
 522
 523     #Create files
 524     echo "Verify Lustre filesystem is up and running"
 525     [ -z "$(mounted_lustre_filesystems)" ] && error "Lustre is not running"
 526
 527     client_df
 528     client_touch testfile || return 1
 529
 530     #CLIENT Portion
 531     echo "Failing CLIENTs"
 532     fail_clients 2
 533
 534     #Check FS
 535     echo "Test Lustre stability after CLIENTs failure"
 536     client_df
 537     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
 538     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
 539
 540     #Sleep
 541     echo "Wait 1 minutes"
 542     sleep 60
 543
 544     #Create files
 545     echo "Verify Lustre filesystem is up and running"
 546     $PDSH $LIVE_CLIENT df $MOUNT || return 3
 547     client_touch testfile || return 4
 548
 549     #CLIENT Portion
 550     echo "Failing CLIENTs"
 551     fail_clients 2
 552
 553     #Check FS
 554     echo "Test Lustre stability after CLIENTs failure"
 555     client_df
 556     $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
 557     $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
 558
 559     #Reintegration
 560     echo "Reintegrating  CLIENTs/CLIENTs"
 561     reintegrate_clients || return 7
 562     client_df
 563
 564     #Sleep
 565     echo "Wait 1 minutes"
 566     sleep 60
 567 }
 568 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
 569 ###################################################
 570
 571 test_10() {
 572     #Run availability after all failures
 573     DURATION=${DURATION:-$((2 * 60 * 60))} # 6 hours default
 574     LOADTEST=${LOADTEST:-metadata-load.py}
 575     $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
 576 }
 577 run_test 10 "Running Availability for 6 hours..."
 578
 579 equals_msg `basename $0`: test complete, cleaning up
 580 check_and_cleanup_lustre
 581 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true