2 # Test multiple failures, AKA Test 17
# Locate the Lustre tree relative to this script (default: parent directory)
# and pull in the shared test framework.
6 LUSTRE=${LUSTRE:-`dirname $0`/..}
7 . $LUSTRE/tests/test-framework.sh
# Source the cluster configuration; CONFIG may be overridden by the caller.
11 . ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
# Tests always skipped: 10 (the long availability run) plus any
# site-specific exclusions passed in via $INSANITY_EXCEPT.
13 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
# With SLOW=no, clear the slow-test exclusion list.
16 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
19 CLEANUP=${CLEANUP:-""}
# Required environment: server hosts, mkfs options, devices, and clients.
23 assert_env mds_HOST MDS_MKFS_OPTS MDSDEV
24 assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
25 assert_env LIVE_CLIENT FSNAME
28 # This can be a regexp, to allow more clients
29 CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
34 # fail clients round robin
36 # list of failable clients
# Split the whitespace-separated FAIL_CLIENTS into an array so clients can
# be failed in round-robin order.
37 FAIL_LIST=($FAIL_CLIENTS)
38 FAIL_NUM=${#FAIL_LIST[*]}
41 DOWN_NUM=0 # number of nodes currently down
43 # set next client to fail
# Pick the current victim and advance the cursor, wrapping at FAIL_NUM.
# NOTE(review): the enclosing function definition is elided in this excerpt.
45 FAIL_CLIENT=${FAIL_LIST[$FAIL_NEXT]}
46 FAIL_NEXT=$(( (FAIL_NEXT+1) % FAIL_NUM ))
47 echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
# Take a client node down.  HARD mode: wait (loop body elided) until the
# node stops answering pings; SOFT mode: just force-unmount Lustre on it.
52 if [ "$FAILURE_MODE" = HARD ]; then
54 while ping -w 3 -c 1 $client > /dev/null 2>&1; do
55 echo "waiting for node $client to fail"
58 elif [ "$FAILURE_MODE" = SOFT ]; then
# -f forces the umount even if the filesystem is busy.
59 zconf_umount $client $MOUNT -f
# Fail up to $num clients; defaults to (and is capped at) the number of
# FAIL_CLIENTS that are still up.
65 if [ "$FAILURE_MODE" = HARD ]; then
# Clamp the request to the number of clients not already down.
72 if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then
73 num=$((FAIL_NUM - DOWN_NUM))
# Nothing (left) to fail: bail out early.
76 if [ -z "$num" ] || [ "$num" -le 0 ]; then
82 for i in `seq $num`; do
# Record each victim so reintegrate_clients can bring it back later.
85 DOWN_CLIENTS="$DOWN_CLIENTS $client"
86 shutdown_client $client
89 echo "down clients: $DOWN_CLIENTS"
91 for client in $DOWN_CLIENTS; do
# Recompute the down count from the authoritative DOWN_CLIENTS list.
94 DOWN_NUM=`echo $DOWN_CLIENTS | wc -w`
# Bring every failed client back: wait for each node in DOWN_CLIENTS to
# respond again, then remount Lustre on it.  Returns 1 on a remount failure.
98 reintegrate_clients() {
99 for client in $DOWN_CLIENTS; do
100 wait_for_host $client
101 echo "Restarting $client"
102 zconf_mount $client $MOUNT || return 1
# Start OST number $1 on its device with the standard OST mount options.
109 start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
# Have every client create its own marker file ${c}_$file in $MOUNT;
# clients currently in DOWN_CLIENTS are skipped.  Returns 1 on first failure.
116 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
117 if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
118 $PDSH $c touch $MOUNT/${c}_$file || return 1
# Remove each client's marker file; rm failures are deliberately ignored.
124 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
125 $PDSH $c rm $MOUNT/${c}_$file
# Each client creates (and lists) its own per-client directory in $MOUNT.
130 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
131 echo "$c mkdir $MOUNT/$c"
132 $PDSH $c "mkdir $MOUNT/$c"
133 $PDSH $c "ls -l $MOUNT/$c"
# Remove the per-client directories.  All rmdirs are issued from
# $LIVE_CLIENT, the one node guaranteed to be up.
138 for c in $LIVE_CLIENT $FAIL_CLIENTS; do
139 echo "rmdir $MOUNT/$c"
140 $PDSH $LIVE_CLIENT "rmdir $MOUNT/$c"
# Trigger client-side recovery against the given OST facet.  The explicit
# per-OSC lctl "recover" command is disabled; kept for reference.
144 clients_recover_osts() {
146 # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
# Top level: start every run from a clean, freshly set-up filesystem.
149 cleanup_and_setup_lustre
151 # 9 Different Failure Modes Combinations
152 echo "Starting Test 17 at `date`"
# A df was launched in the background (elided) as $DFPID; waiting for it
# proves clients recovered from the preceding failover.  Note: "$?" in the
# message expands to wait's exit status, i.e. the df result.
156 echo "Waiting for df pid: $DFPID"
157 wait $DFPID || { echo "df returned $?" && return 1; }
# Fail over OST 1 and verify clients recover.
159 facet_failover ost1 || return 4
160 echo "Waiting for df pid: $DFPID"
161 wait $DFPID || { echo "df returned $?" && return 2; }
# With more than one OST configured, repeat for OST 2.
163 if [ $OSTCOUNT -gt 1 ]; then
164 facet_failover ost2 || return 5
165 echo "Waiting for df pid: $DFPID"
166 wait $DFPID || { echo "df returned $?" && return 3; }
170 run_test 0 "Fail all nodes, independently"
172 ############### First Failure Mode ###############
# MDS/MDS double failure is intentionally a no-op: this configuration has a
# single MDS, so the combination cannot occur.
174 echo "Don't do a MDS - MDS Failure Case"
175 echo "This makes no sense"
177 run_test 1 "MDS/MDS failure"
178 ###################################################
180 ############### Second Failure Mode ###############
# Failure mode 2: fail the MDS while an OST is down (failure steps elided),
# then bring both back and verify the filesystem.
182 echo "Verify Lustre filesystem is up and running"
188 # prepare for MDS failover
198 echo "Reintegrating OST"
201 start_ost 1 || return 2
# Restart the MDS; propagate its exact exit status on failure.
204 start mds $MDSDEV $MDS_MOUNT_OPTS || return $?
# Let clients reconnect to the recovered OST, then verify the mount.
208 clients_recover_osts ost1
209 echo "Verify reintegration"
210 client_df || return 1
213 run_test 2 "Second Failure Mode: MDS/OST `date`"
214 ###################################################
217 ############### Third Failure Mode ###############
# Failure mode 3: fail the MDS, then fail clients while it recovers, then
# remount everything and verify the filesystem.
220 echo "Verify Lustre filesystem is up and running"
# Reap the background df; a failure here is only reported, not fatal.
224 wait $DFPID || echo df failed: $?
227 echo "Test Lustre stability after MDS failover"
231 echo "Failing 2 CLIENTS"
235 echo "Test Lustre stability after CLIENT failure"
# Remount the failed clients and confirm the mount is usable again.
239 echo "Reintegrating CLIENTS"
240 reintegrate_clients || return 1
242 client_df || return 3
244 run_test 3 "Third Failure Mode: MDS/CLIENT `date`"
245 ###################################################
247 ############### Fourth Failure Mode ###############
249 echo "Fourth Failure Mode: OST/MDS `date`"
# The OST is failed first (elided), then the MDS while the OST recovers.
255 echo "Test Lustre stability after OST failure"
264 # prepare for MDS failover
273 echo "Reintegrating OST"
# NOTE(review): unlike test 2, this mds restart is not error-checked.
279 start mds $MDSDEV $MDS_MOUNT_OPTS
284 clients_recover_osts ost1
285 echo "Test Lustre stability after MDS failover"
286 client_df || return 1
288 run_test 4 "Fourth Failure Mode: OST/MDS `date`"
289 ###################################################
291 ############### Fifth Failure Mode ###############
# Failure mode 5 fails over BOTH ost1 and ost2, so it needs at least two
# OSTs.  (The previous "-lt 1" guard could never skip a runnable config and
# did not protect the ost2 failover below.)
293 [ $OSTCOUNT -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0
295 echo "Fifth Failure Mode: OST/OST `date`"
298 echo "Verify Lustre filesystem is up and running"
# First OST failure (elided); confirm the mount survives.
306 echo "Test Lustre stability after OST failure"
# Second OST failure (elided); confirm again.
316 echo "Test Lustre stability after OST failure"
322 echo "Reintegrating OSTs"
# Let clients reconnect to both recovered OSTs, then verify the mount.
328 clients_recover_osts ost1
329 clients_recover_osts ost2
334 client_df || return 2
336 run_test 5 "Fifth Failure Mode: OST/OST `date`"
337 ###################################################
339 ############### Sixth Failure Mode ###############
341 echo "Sixth Failure Mode: OST/CLIENT `date`"
# Baseline: the mount answers df and a test file can be created.
344 echo "Verify Lustre filesystem is up and running"
345 client_df || return 1
346 client_touch testfile || return 2
# OST failure (elided), then stability check.
353 echo "Test Lustre stability after OST failure"
359 echo "Failing CLIENTs"
363 echo "Test Lustre stability after CLIENTs failure"
# Bring the OST and the failed clients back, then re-verify the mount.
369 echo "Reintegrating OST/CLIENTs"
377 echo "Verifying mount"
378 client_df || return 3
380 run_test 6 "Sixth Failure Mode: OST/CLIENT `date`"
381 ###################################################
384 ############### Seventh Failure Mode ###############
386 echo "Seventh Failure Mode: CLIENT/MDS `date`"
389 echo "Verify Lustre filesystem is up and running"
391 client_touch testfile || return 1
# Part 1: fail a client, then confirm the surviving client still works.
394 echo "Part 1: Failing CLIENT"
398 echo "Test Lustre stability after CLIENTs failure"
400 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
401 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
404 echo "Wait 1 minutes"
408 echo "Verify Lustre filesystem is up and running"
# Part 2 (elided): fail the MDS while the client is still down.
416 echo "Test Lustre stability after MDS failover"
# df on the downed clients is expected to fail, so the failure is only
# echoed.  NOTE(review): the trailing "|| return 1" is unreachable — echo
# always succeeds, so the second || can never fire.
417 wait $DFPID || echo "df on down clients fails " || return 1
418 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
419 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
422 echo "Reintegrating CLIENTs"
424 client_df || return 2
# Allow a settle period before the next failure mode.
427 echo "wait 1 minutes"
430 run_test 7 "Seventh Failure Mode: CLIENT/MDS `date`"
431 ###################################################
434 ############### Eighth Failure Mode ###############
436 echo "Eighth Failure Mode: CLIENT/OST `date`"
439 echo "Verify Lustre filesystem is up and running"
441 client_touch testfile
# Fail clients first; confirm the live client is unaffected.
444 echo "Failing CLIENTs"
448 echo "Test Lustre stability after CLIENTs failure"
450 $PDSH $LIVE_CLIENT "ls -l $MOUNT"
451 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
454 echo "Wait 1 minutes"
458 echo "Verify Lustre filesystem is up and running"
460 client_touch testfile
# Then fail an OST (elided) while the clients are down.
468 echo "Test Lustre stability after OST failure"
# On a non-failout OST these ops would block until recovery completes, so
# they are left disabled.
472 #non-failout hangs forever here
473 #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
474 #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
477 echo "Reintegrating CLIENTs/OST"
# End-to-end check after reintegration: df plus a fresh file create.
482 client_df || return 1
483 client_touch testfile2 || return 2
486 echo "Wait 1 minutes"
489 run_test 8 "Eighth Failure Mode: CLIENT/OST `date`"
490 ###################################################
493 ############### Ninth Failure Mode ###############
498 echo "Verify Lustre filesystem is up and running"
500 client_touch testfile || return 1
# First batch of client failures; the live client must keep working.
503 echo "Failing CLIENTs"
507 echo "Test Lustre stability after CLIENTs failure"
509 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 1
510 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 2
513 echo "Wait 1 minutes"
517 echo "Verify Lustre filesystem is up and running"
518 $PDSH $LIVE_CLIENT df $MOUNT || return 3
519 client_touch testfile || return 4
# Second batch of client failures while the first batch is still down.
522 echo "Failing CLIENTs"
526 echo "Test Lustre stability after CLIENTs failure"
528 $PDSH $LIVE_CLIENT "ls -l $MOUNT" || return 5
529 $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile" || return 6
# Remount all failed clients and verify the mount.
532 echo "Reintegrating CLIENTs/CLIENTs"
534 client_df || return 7
537 echo "Wait 1 minutes"
540 run_test 9 "Ninth Failure Mode: CLIENT/CLIENT `date`"
541 ###################################################
544 #Run availability after all failures
# Default run length is 2 hours; override with DURATION=<seconds>.  (The
# old comment and test description said "6 hours", contradicting the
# $((2 * 60 * 60)) default; the text now reflects the actual value.)
545 DURATION=${DURATION:-$((2 * 60 * 60))} # 2 hours default
546 LOADTEST=${LOADTEST:-metadata-load.py}
547 $PWD/availability.sh $CONFIG $DURATION $CLIENTS || return 1
549 run_test 10 "Running Availability for $((${DURATION:-7200} / 3600)) hours..."
# Final summary and teardown: report completion, tear down the filesystem,
# and dump the suite log if one was produced.
551 equals_msg `basename $0`: test complete, cleaning up
552 check_and_cleanup_lustre
# "|| true" keeps the script's exit status 0 when no log file exists.
553 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true