lustre/tests/ha.sh

   1 #!/bin/bash
   2 # -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
   3 # vim:shiftwidth=4:softtabstop=4:tabstop=4:
   4 #
   5 # NAME
   6 #
   7 #   ha.sh - test Lustre HA (aka failover) configurations
   8 #
   9 # SYNOPSIS
  10 #
  11 #   ha.sh [OPTIONS]
  12 #
  13 # DESCRIPTION
  14 #
  15 #   ha.sh tests Lustre HA (aka failover) configurations with a CRM.
  16 #
  17 # OPTIONS
  18 #
  19 #   -h
  20 #       Help.
  21 #
  22 #   -c HOST[,...]
  23 #       Specify client nodes.
  24 #
  25 #   -s HOST[,...]
  26 #       Specify server nodes.
  27 #
  28 #   -v HOST[,...]
  29 #       Specify victim nodes to be rebooted.
  30 #
  31 #   -d DIRECTORY
  32 #       Choose a parent of the test directory.  "/mnt/lustre" if not specified.
  33 #
  34 #   -u SECONDS
  35 #       Define a duration for the test. 86400 seconds if not specified.
  36 #
  37 #   -p SECONDS
#       Define a max failover period. 10 seconds if not set.
  39 #
  40 #   -w
  41 #       Only run the workloads; no failure will be introduced.
  42 #       -v, -s are ignored in this case.
  43 #   -r
  44 #       Workloads dry run for several seconds; no failures will be introduced.
  45 #       This option is useful to verify the loads.
  46 #       -u is ignored in this case
  47 #   -m
  48 #       Reboot victim nodes simultaneously.
  49 #
  50 #
  51 # ASSUMPTIONS
  52 #
  53 #   A Lustre file system is up and mounted on all client nodes.  This script
  54 #   does not mount or unmount any Lustre targets or clients, let alone format
  55 #   anything.
  56 #
  57 #   Each target has a failnode, so that workloads can continue after a power
  58 #   failure.
  59 #
#   CRM can be configured in 2 ways:
  61 #   1.
  62 #   Targets are automatically failed back when their primary node is back.  This
  63 #   assumption avoids calling CRM-specific commands to trigger failbacks, making
#   this script more CRM-neutral.
  65 #   2.
  66 #   Targets are not automatically failed back when their primary node is back.
  67 #   CRM-specific command is executed to trigger failbacks.
  68 #
  69 #   A crash dump mechanism is configured to catch LBUGs, panics, etc.
  70 #
  71 # WORKLOADS
  72 #
#   Each client runs a set of MPI and non-MPI workloads. These
  74 #   applications are run in short loops so that their exit status can be waited
  75 #   for and checked within reasonable time by ha_wait_loads.
  76 #   The set of MPI and non-MPI workloads are configurable by parameters:
  77 #       ha_nonmpi_loads
  78 #               default set: dd, tar, iozone
  79 #       ha_mpi_loads
  80 #               default set: ior, simul, mdtest
  81 #
#   The number of clients that run MPI loads is configured by parameter
  83 #   ha_mpi_instances. Only one client runs MPI workloads by default.
  84 #
  85 #   MPI workloads can be run from several users. The list of users to use is
  86 #   configured by parameter ha_mpi_users, default is "mpiuser".
  87 #
  88 # PROCESS STRUCTURE AND IPC
  89 #
  90 #   On the node where this script is run, the processes look like this:
  91 #
  92 #       ~ ha.sh (ha_killer)
  93 #
  94 #           ~ ha.sh (ha_repeat_mpi_load ior)
  95 #               ~ mpirun IOR
  96 #           ~ ha.sh (ha_repeat_mpi_load simul)
  97 #               ~ mpirun simul
  98 #           ~ ha.sh (ha_repeat_mpi_load mdtest)
  99 #               ~ mpirun mdtest
 100 #           ~ ... (one for each MPI load)
 101 #
 102 #           ~ ha.sh (ha_repeat_nonmpi_load client2 dbench)
 103 #               ~ pdsh client2 dbench
 104 #           ~ ha.sh (ha_repeat_nonmpi_load client2 iozone)
 105 #               ~ pdsh client2 iozone
 106 #           ~ ha.sh (ha_repeat_nonmpi_load client5 iozone)
 107 #               ~ pdsh client5 iozone
 108 #           ~ ... (one for each non-MPI load on each client)
 109 #
 110 #   Each tilde represents a process.  Indentations imply parent-children
 111 #   relation.
 112 #
 113 #   IPC is done by files in the temporary directory.
 114 #
 115
 116 #set -x
 117
 118 SIMUL=${SIMUL:-$(which simul 2> /dev/null || true)}
 119 IOR=${IOR:-$(which IOR 2> /dev/null || true)}
 120 MDTEST=${MDTEST:-$(which mdtest 2> /dev/null || true)}
 121
 122 ior_blockSize=${ior_blockSize:-6g}
 123 mpi_threads_per_client=${mpi_threads_per_client:-2}
 124
 125 iozone_SIZE=${iozone_SIZE:-262144} # 256m
 126
 127 mpirun=${MPIRUN:-$(which mpirun)}
 128 LFS=${LFS:-$(which lfs)}
 129
 130 ha_check_env()
 131 {
 132         for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
 133                 local tag=${ha_mpi_load_tags[$load]}
 134                 local bin=$(echo $tag | tr '[:lower:]' '[:upper:]')
 135                 if [ x${!bin} = x ]; then
 136                         ha_error ha_mpi_loads: ${ha_mpi_loads}, $bin is not set
 137                         exit 1
 138                 fi
 139         done
 140 }
 141
 142 ha_info()
 143 {
 144         echo "$0: $(date +%H:%M:%S' '%s):" "$@"
 145 }
 146
 147 ha_log()
 148 {
 149         local nodes=${1// /,}
 150         shift
 151         ha_on $nodes "lctl mark $*"
 152 }
 153
 154 ha_error()
 155 {
 156     ha_info "$@" >&2
 157 }
 158
 159 ha_trap_err()
 160 {
 161     local i
 162
 163     ha_error "Trap ERR triggered by:"
 164     ha_error "    $BASH_COMMAND"
 165     ha_error "Call trace:"
 166     for ((i = 0; i < ${#FUNCNAME[@]}; i++)); do
 167         ha_error "    ${FUNCNAME[$i]} [${BASH_SOURCE[$i]}:${BASH_LINENO[$i]}]"
 168     done
 169 }
 170
 171 trap ha_trap_err ERR
 172 set -eE
 173
 174 declare     ha_tmp_dir=/tmp/$(basename $0)-$$
 175 declare     ha_stop_file=$ha_tmp_dir/stop
 176 declare     ha_fail_file=$ha_tmp_dir/fail
 177 declare     ha_status_file_prefix=$ha_tmp_dir/status
 178 declare -a  ha_status_files
 179 declare     ha_machine_file=$ha_tmp_dir/machine_file
 180 declare     ha_lfsck_log=$ha_tmp_dir/lfsck.log
 181 declare     ha_lfsck_lock=$ha_tmp_dir/lfsck.lock
 182 declare     ha_lfsck_stop=$ha_tmp_dir/lfsck.stop
 183 declare     ha_lfsck_bg=${LFSCK_BG:-false}
 184 declare     ha_lfsck_after=${LFSCK_AFTER:-false}
 185 declare     ha_lfsck_node=${LFSCK_NODE:-""}
 186 declare     ha_lfsck_device=${LFSCK_DEV:-""}
 187 declare     ha_lfsck_types=${LFSCK_TYPES:-"namespace layout"}
 188 declare     ha_lfsck_custom_params=${LFSCK_CUSTOM_PARAMS:-""}
 189 declare     ha_lfsck_wait=${LFSCK_WAIT:-1200}
 190 declare     ha_lfsck_fail_on_repaired=${LFSCK_FAIL_ON_REPAIRED:-false}
 191 declare     ha_power_down_cmd=${POWER_DOWN:-"pm -0"}
 192 declare     ha_power_up_cmd=${POWER_UP:-"pm -1"}
 193 declare     ha_power_delay=${POWER_DELAY:-60}
 194 declare     ha_failback_delay=${DELAY:-5}
 195 declare     ha_failback_cmd=${FAILBACK:-""}
 196 declare     ha_stripe_params=${STRIPEPARAMS:-"-c 0"}
 197 declare     ha_dir_stripe_count=${DSTRIPECOUNT:-"1"}
 198 declare     ha_mdt_index=${MDTINDEX:-"0"}
 199 declare     ha_mdt_index_random=${MDTINDEXRAND:-false}
 200 declare -a  ha_clients
 201 declare -a  ha_servers
 202 declare -a  ha_victims
 203 declare     ha_test_dir=/mnt/lustre/$(basename $0)-$$
 204 declare     ha_start_time=$(date +%s)
 205 declare     ha_expected_duration=$((60 * 60 * 24))
 206 declare     ha_max_failover_period=10
 207 declare     ha_nr_loops=0
 208 declare     ha_stop_signals="SIGINT SIGTERM SIGHUP"
 209 declare     ha_load_timeout=$((60 * 10))
 210 declare     ha_workloads_only=false
 211 declare     ha_workloads_dry_run=false
 212 declare     ha_simultaneous=false
 213
# Number of clients that start MPI loads.
declare     ha_mpi_instances=${ha_mpi_instances:-1}

# MPI load tags and the users the MPI loads are run as.
declare     ha_mpi_loads=${ha_mpi_loads="ior simul mdtest"}
declare -a  ha_mpi_load_tags=($ha_mpi_loads)
declare -a  ha_mpiusers=(${ha_mpi_users="mpiuser"})

# Each of these holds a list of double-quoted parameter sets; the single
# quotes delay expansion until the eval below.
declare     ha_ior_params=${IORP:-'" -b $ior_blockSize -t 2m -w -W -T 1"'}
declare     ha_simul_params=${SIMULP:-'" -n 10"'}
declare     ha_mdtest_params=${MDTESTP:-'" -i 1 -n 1000"'}
declare     ha_mpirun_options=${MPIRUN_OPTIONS:-""}
# NOTE(review): the default expands the STRIPEPARAMS environment variable
# at eval time, not ha_stripe_params - confirm this is intended.
declare     ha_clients_stripe=${CLIENTSSTRIPE:-'"$STRIPEPARAMS"'}
# Number of client sets, i.e. of machine files.
declare     ha_nclientsset=${NCLIENTSSET:-1}

# Environment assignment prepended to the racer command line.
declare     ha_racer_params=${RACERP:-"MDSCOUNT=1"}

# Split the quoted lists into arrays, one element per parameter set.
eval ha_params_ior=($ha_ior_params)
eval ha_params_simul=($ha_simul_params)
eval ha_params_mdtest=($ha_mdtest_params)
eval ha_stripe_clients=($ha_clients_stripe)

# Number of parameter sets per load; clients cycle through them.
declare ha_nparams_ior=${#ha_params_ior[@]}
declare ha_nparams_simul=${#ha_params_simul[@]}
declare ha_nparams_mdtest=${#ha_params_mdtest[@]}
declare ha_nstripe_clients=${#ha_stripe_clients[@]}

# MPI command templates: "{}" is replaced by the load directory and
# "{params}" by the selected parameter set.
declare -A  ha_mpi_load_cmds=(
        [ior]="$IOR -o {}/f.ior {params}"
        [simul]="$SIMUL {params} -d {}"
        [mdtest]="$MDTEST {params} -d {}"
)

declare racer=${RACER:-"$(dirname $0)/racer/racer.sh"}

# Non-MPI loads and their command templates ("{}" is the load directory).
declare     ha_nonmpi_loads=${ha_nonmpi_loads="dd tar iozone"}
declare -a  ha_nonmpi_load_tags=($ha_nonmpi_loads)
declare -A  ha_nonmpi_load_cmds=(
        [dd]="dd if=/dev/zero of={}/f.dd bs=1M count=256"
        [tar]="tar cf - /etc | tar xf - -C {}"
        [iozone]="iozone -a -e -+d -s $iozone_SIZE {}/f.iozone"
        [racer]="$ha_racer_params $racer {}"
)
 255
 256 ha_usage()
 257 {
 258     ha_info "Usage: $0 -c HOST[,...] -s HOST[,...]"                         \
 259             "-v HOST[,...] [-d DIRECTORY] [-u SECONDS]"
 260 }
 261
 262 ha_process_arguments()
 263 {
 264     local opt
 265
 266         while getopts hc:s:v:d:p:u:wrm opt; do
 267         case $opt in
 268         h)
 269             ha_usage
 270             exit 0
 271             ;;
 272         c)
 273             ha_clients=(${OPTARG//,/ })
 274             ;;
 275         s)
 276             ha_servers=(${OPTARG//,/ })
 277             ;;
 278         v)
 279             ha_victims=(${OPTARG//,/ })
 280             ;;
 281         d)
 282             ha_test_dir=$OPTARG/$(basename $0)-$$
 283             ;;
 284         u)
 285             ha_expected_duration=$OPTARG
 286             ;;
 287         p)
 288                 ha_max_failover_period=$OPTARG
 289                 ;;
 290         w)
 291                 ha_workloads_only=true
 292                 ;;
 293         r)
 294                 ha_workloads_dry_run=true
 295                 ;;
 296         m)
 297                 ha_simultaneous=true
 298                 ;;
 299         \?)
 300             ha_usage
 301             exit 1
 302             ;;
 303         esac
 304     done
 305
 306         if [ -z "${ha_clients[*]}" ]; then
 307                 ha_error "-c is mandatory"
 308                 ha_usage
 309                 exit 1
 310         fi
 311         if ! ($ha_workloads_dry_run ||
 312                         $ha_workloads_only) &&
 313                         ([ -z "${ha_servers[*]}" ] ||
 314                         [ -z "${ha_victims[*]}" ]); then
 315                 ha_error "-s, and -v are all mandatory"
 316                 ha_usage
 317                 exit 1
 318         fi
 319 }
 320
 321 ha_on()
 322 {
 323         local nodes=$1
 324         local rc=0
 325
 326         shift
 327
 328         #
 329         # -S is to be used here to track the
 330         # remote command return values
 331         #
 332         pdsh -S -w $nodes PATH=/usr/local/sbin:/usr/local/bin:/sbin:\
 333 /bin:/usr/sbin:/usr/bin "$@" ||
 334                 rc=$?
 335         return $rc
 336 }
 337
 338 ha_trap_exit()
 339 {
 340         touch "$ha_stop_file"
 341         trap 0
 342         if [ -e "$ha_fail_file" ]; then
 343                 ha_info "Test directory $ha_test_dir not removed"
 344                 ha_info "Temporary directory $ha_tmp_dir not removed"
 345         else
 346                 ha_on ${ha_clients[0]} rm -rf "$ha_test_dir"
 347                 ha_info "Please find the results in the directory $ha_tmp_dir"
 348         fi
 349 }
 350
 351 ha_trap_stop_signals()
 352 {
 353     ha_info "${ha_stop_signals// /,} received"
 354     touch "$ha_stop_file"
 355 }
 356
 357 ha_sleep()
 358 {
 359     local n=$1
 360
 361     ha_info "Sleeping for ${n}s"
 362     #
 363     # sleep(1) could interrupted.
 364     #
 365     sleep $n || true
 366 }
 367
 368 ha_wait_unlock()
 369 {
 370         local lock=$1
 371
 372         while [ -e $lock ]; do
 373                 sleep 1
 374         done
 375 }
 376
 377 ha_lock()
 378 {
 379     local lock=$1
 380
 381     until mkdir "$lock" >/dev/null 2>&1; do
 382         ha_sleep 1 >/dev/null
 383     done
 384 }
 385
 386 ha_unlock()
 387 {
 388     local lock=$1
 389
 390     rm -r "$lock"
 391 }
 392
 393 ha_dump_logs()
 394 {
 395         local nodes=${1// /,}
 396         local file=/tmp/$(basename $0)-$$-$(date +%s).dk
 397         local lock=$ha_tmp_dir/lock-dump-logs
 398         local rc=0
 399
 400         ha_lock "$lock"
 401         ha_info "Dumping lctl log to $file"
 402
 403         #
 404         # some nodes could crash, so
 405         # do not exit with error if not all logs are dumped
 406         #
 407         ha_on $nodes "lctl dk >>$file" || rc=$?
 408
 409         [ $rc -eq 0 ] ||
 410                 ha_error "not all logs are dumped! Some nodes are unreachable."
 411         ha_unlock "$lock"
 412 }
 413
#
# ha_repeat_mpi_load CLIENT LOAD STATUS PARAMETER MACHINES STRIPEPARAMS MPIUSER
#
# Run one MPI workload from CLIENT in a loop until $ha_stop_file appears
# or an iteration fails.
#
#   CLIENT        node on which mpirun is started
#   LOAD          index into ha_mpi_load_tags
#   STATUS        file receiving the exit status of every iteration
#   PARAMETER     text substituted for "{params}" in the load command
#   MACHINES      machine file handed to mpirun with -machinefile
#   STRIPEPARAMS  "lfs setstripe" options applied to the load directory
#   MPIUSER       user the load is run as (through su)
#
# Each iteration creates the load directory, runs the load and removes
# the directory again; all output is appended to $ha_tmp_dir/CLIENT-TAG.
# On failure the fail and stop files are created and lctl logs are dumped.
#
ha_repeat_mpi_load()
{
        local client=$1
        local load=$2
        local status=$3
        local parameter=$4
        local machines=$5
        local stripeparams=$6
        local mpiuser=$7
        local tag=${ha_mpi_load_tags[$load]}
        local cmd=${ha_mpi_load_cmds[$tag]}
        local dir=$ha_test_dir/$client-$tag
        local log=$ha_tmp_dir/$client-$tag
        local rc=0
        local nr_loops=0
        local avg_loop_time=0
        local start_time=$(date +%s)

        # fill in the placeholders of the command template
        cmd=${cmd//"{}"/$dir}
        cmd=${cmd//"{params}"/$parameter}

        ha_info "Starting $tag"

        machines="-machinefile $machines"
        while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
                # The whole group logs to $log; the first failing step ends
                # the && chain and its status is kept in rc.
                {
                local mdt_index
                # a random MDT index below ha_mdt_index when requested
                if $ha_mdt_index_random && [ $ha_mdt_index -ne 0 ]; then
                        mdt_index=$(ha_rand $ha_mdt_index)
                else
                        mdt_index=$ha_mdt_index
                fi
                ha_on $client $LFS mkdir -i$mdt_index -c$ha_dir_stripe_count "$dir" &&
                ha_on $client $LFS getdirstripe "$dir" &&
                ha_on $client $LFS setstripe $stripeparams $dir &&
                ha_on $client $LFS getstripe $dir &&
                ha_on $client chmod a+xwr $dir &&
                ha_on $client "su $mpiuser sh -c \" $mpirun $ha_mpirun_options \
                        -np $((${#ha_clients[@]} * mpi_threads_per_client )) \
                        $machines $cmd \" " &&
                        ha_on $client rm -rf "$dir";
                } >>"$log" 2>&1 || rc=$?

                ha_info rc=$rc

                if ((rc != 0)); then
                        touch "$ha_fail_file"
                        touch "$ha_stop_file"
                        ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
                fi
                # publish the result of this iteration for ha_wait_loads
                echo $rc >"$status"

                nr_loops=$((nr_loops + 1))
        done

        # nr_loops is 0 when the stop file existed before the first iteration
        [ $nr_loops -ne 0 ] &&
                avg_loop_time=$((($(date +%s) - start_time) / nr_loops))

        ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
}
 474
 475 ha_start_mpi_loads()
 476 {
 477         local client
 478         local load
 479         local tag
 480         local status
 481         local n
 482         local nparam
 483         local machines
 484         local m
 485         local -a mach
 486         local mpiuser
 487
 488         # ha_mpi_instances defines the number of
 489         # clients start mpi loads; should be <= ${#ha_clients[@]}
 490         # do nothing if
 491         #    ha_mpi_instances = 0
 492         # or
 493         #    ${#ha_mpi_load_tags[@]} =0
 494         local inst=$ha_mpi_instances
 495         (( inst == 0 )) || (( ${#ha_mpi_load_tags[@]} == 0 )) &&
 496                 ha_info "no mpi load to start" &&
 497                 return 0
 498
 499         (( inst <= ${#ha_clients[@]} )) || inst=${#ha_clients[@]}
 500
 501         # Define names for machinefiles for each client set
 502         for (( n=0; n < $ha_nclientsset; n++ )); do
 503                 mach[$n]=$ha_machine_file$n
 504         done
 505
 506         for ((n = 0; n < ${#ha_clients[@]}; n++)); do
 507                 m=$(( n % ha_nclientsset))
 508                 machines=${mach[m]}
 509                 ha_info machine_file=$machines
 510                 echo ${ha_clients[n]} >> $machines
 511         done
 512         local dirname=$(dirname $ha_machine_file)
 513         for client in ${ha_clients[@]}; do
 514                 ha_on $client mkdir -p $dirname
 515                 scp $ha_machine_file* $client:$dirname
 516         done
 517
 518         for ((n = 0; n < $inst; n++)); do
 519                 client=${ha_clients[n]}
 520                 mpiuser=${ha_mpiusers[$((n % ${#ha_mpiusers[@]}))]}
 521                 for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
 522                         tag=${ha_mpi_load_tags[$load]}
 523                         status=$ha_status_file_prefix-$tag-$client
 524                         # ha_nparams_ior
 525                         # ha_nparams_simul
 526                         local num=ha_nparams_$tag
 527                         nparam=$((n % num))
 528                         local aref=ha_params_$tag[nparam]
 529                         local parameter=${!aref}
 530                         local nstripe=$((n % ha_nstripe_clients))
 531                         aref=ha_stripe_clients[nstripe]
 532                         local stripe=${!aref}
 533                         local m=$(( n % ha_nclientsset))
 534                         machines=${mach[m]}
 535                         ha_repeat_mpi_load $client $load $status "$parameter" $machines "$stripe" "$mpiuser" &
 536                                 ha_status_files+=("$status")
 537                 done
 538         done
 539 }
 540
 541 ha_repeat_nonmpi_load()
 542 {
 543         local client=$1
 544         local load=$2
 545         local status=$3
 546         local tag=${ha_nonmpi_load_tags[$load]}
 547         local cmd=${ha_nonmpi_load_cmds[$tag]}
 548         local dir=$ha_test_dir/$client-$tag
 549         local log=$ha_tmp_dir/$client-$tag
 550         local rc=0
 551         local nr_loops=0
 552         local avg_loop_time=0
 553         local start_time=$(date +%s)
 554
 555     cmd=${cmd//"{}"/$dir}
 556
 557     ha_info "Starting $tag on $client"
 558
 559         while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
 560                 ha_on $client "mkdir -p $dir &&                              \
 561                         $cmd &&                                              \
 562                         rm -rf $dir" >>"$log" 2>&1 || rc=$?
 563
 564                 if ((rc != 0)); then
 565                         ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
 566                         touch "$ha_fail_file"
 567                         touch "$ha_stop_file"
 568                 fi
 569                 echo $rc >"$status"
 570
 571                 nr_loops=$((nr_loops + 1))
 572         done
 573
 574         [ $nr_loops -ne 0 ] &&
 575                 avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
 576
 577         ha_info "$tag on $client stopped: rc $rc avg loop time ${avg_loop_time}s"
 578 }
 579
 580 ha_start_nonmpi_loads()
 581 {
 582     local client
 583     local load
 584     local tag
 585     local status
 586
 587     for client in ${ha_clients[@]}; do
 588         for ((load = 0; load < ${#ha_nonmpi_load_tags[@]}; load++)); do
 589             tag=${ha_nonmpi_load_tags[$load]}
 590             status=$ha_status_file_prefix-$tag-$client
 591             ha_repeat_nonmpi_load $client $load $status &
 592             ha_status_files+=("$status")
 593         done
 594     done
 595 }
 596
 597 ha_lfsck_bg () {
 598         rm -f $ha_lfsck_log
 599         rm -f $ha_lfsck_stop
 600
 601         ha_info "LFSCK BG"
 602         while [ true ]; do
 603                 [ -f $ha_lfsck_stop ] && ha_info "LFSCK stopped" && break
 604                 [ -f $ha_stop_file ] &&
 605                         ha_info "$ha_stop_file found! LFSCK not started" &&
 606                         break
 607                 ha_start_lfsck 2>&1 | tee -a $ha_lfsck_log
 608                 sleep 1
 609         done &
 610         LFSCK_BG_PID=$!
 611         ha_info LFSCK BG PID: $LFSCK_BG_PID
 612 }
 613
 614 ha_wait_lfsck_completed () {
 615         local -a status
 616         local -a types=($ha_lfsck_types)
 617         local type
 618         local s
 619
 620         local nodes="${ha_servers[@]}"
 621         nodes=${nodes// /,}
 622
 623         # -A start LFSCK on all nodes
 624         # -t default all
 625         [ ${#types[@]} -eq 0 ] && types=(namespace layout)
 626         ha_info "Waiting LFSCK completed in $ha_lfsck_wait sec: types ${types[@]}"
 627         for type in ${types[@]}; do
 628                 eval var_$type=0
 629                 for (( i=0; i<=ha_lfsck_wait; i++)); do
 630                         status=($(ha_on $nodes lctl get_param -n *.*.lfsck_$type 2>/dev/null | \
 631                                 awk '/status/ { print $3 }'))
 632                         for (( s=0; s<${#status[@]}; s++ )); do
 633                                 # "partial" is expected after HARD failover
 634                                 [[ "${status[s]}" = "completed" ]] ||
 635                                 [[ "${status[s]}" = "partial" ]] ||  break
 636                         done
 637                         [[ $s -eq ${#status[@]} ]] && eval var_$type=1 && break
 638                         sleep 1
 639                 done
 640                 ha_info "LFSCK $type status in $i sec:"
 641                 ha_on $nodes lctl get_param -n *.*.lfsck_$type 2>/dev/null | grep status
 642
 643         done
 644
 645         for type in ${types[@]}; do
 646                 local var=var_$type
 647                 ha_on $nodes lctl get_param -n *.*.lfsck_$type 2>/dev/null
 648                 [[ ${!var} -eq 1 ]] ||
 649                         { ha_info "lfsck not completed in $ha_lfsck_wait sec";
 650                         return 1; }
 651         done
 652         return 0
 653 }
 654
 655 ha_start_lfsck()
 656 {
 657         local -a types=($ha_lfsck_types)
 658         local rc=0
 659
 660         # -A: start LFSCK on all nodes via the specified MDT device
 661         # (see "-M" option) by single LFSCK command
 662         local params=" -A -r $ha_lfsck_custom_params"
 663
 664         # use specified device if set
 665         [ -n "$ha_lfsck_device" ] && params="-M $ha_lfsck_device $params"
 666
 667         # -t: check type(s) to be performed (default all)
 668         # check only specified types if set
 669         if [ ${#types[@]} -ne 0 ]; then
 670                 local type="${types[@]}"
 671                 params="$params -t ${type// /,}"
 672         fi
 673
 674         ha_info "LFSCK start $params"
 675         ha_on $ha_lfsck_node "lctl lfsck_start $params" || rc=1
 676         if [ $rc -ne 0 ]; then
 677                 if [ -e $ha_lfsck_lock ]; then
 678                         rc=0
 679                         ha_wait_unlock $ha_lfsck_lock
 680                         ha_sleep 120
 681                         ha_on $ha_lfsck_node "lctl lfsck_start $params" || rc=1
 682                 fi
 683         fi
 684
 685         [ $rc -eq 0 ] ||
 686                 { touch "$ha_fail_file"; touch "$ha_stop_file";
 687                 touch $ha_lfsck_stop; return 1; }
 688
 689         ha_wait_lfsck_completed ||
 690                 { touch "$ha_fail_file"; touch "$ha_stop_file";
 691                 touch $ha_lfsck_stop; return 1; }
 692
 693         return 0
 694 }
 695
 696 ha_lfsck_repaired()
 697 {
 698         local n=0
 699
 700         n=$(cat $ha_lfsck_log | awk '/repaired/ {print $3}' |\
 701                 awk '{sum += $1} END { print sum }')
 702         [ $n -eq 0] ||
 703                 { ha_info "Total repaired: $n";
 704                 touch "$ha_fail_file"; return 1; }
 705         return 0
 706 }
 707
 708 ha_start_loads()
 709 {
 710         $ha_lfsck_bg && ha_lfsck_bg
 711         trap ha_trap_stop_signals $ha_stop_signals
 712         ha_start_nonmpi_loads
 713         ha_start_mpi_loads
 714 }
 715
 716 ha_stop_loads()
 717 {
 718         touch $ha_stop_file
 719         # true because of lfsck_bg could be stopped already
 720         $ha_lfsck_bg && wait $LFSCK_BG_PID || true
 721         trap - $ha_stop_signals
 722         ha_info "Waiting for workloads to stop"
 723         wait
 724 }
 725
 726 ha_wait_loads()
 727 {
 728     local file
 729     local end=$(($(date +%s) + ha_load_timeout))
 730
 731     ha_info "Waiting for workload status"
 732     rm -f "${ha_status_files[@]}"
 733
 734         #
 735         # return immediately if ha_stop_file exists,
 736         # all status_files not needed to be checked
 737         #
 738         for file in "${ha_status_files[@]}"; do
 739                 if [ -e "$ha_stop_file" ]; then
 740                         ha_info "$ha_stop_file found! Stop."
 741                         break
 742                 fi
 743                 #
 744                 # Wait status file created during ha_load_timeout.
 745                 # Existing file guarantees that some application
 746                 # is completed. If no status file was created
 747                 # this function guarantees that we allow
 748                 # applications to continue after/before
 749                 # failover/failback during ha_load_timeout time.
 750                 #
 751                 until [ -e "$file" ] || (($(date +%s) >= end)); do
 752                         #
 753                         # check ha_stop_file again, it could appear
 754                         # during ha_load_timeout
 755                         #
 756                         if [ -e "$ha_stop_file" ]; then
 757                                 ha_info "$ha_stop_file found! Stop."
 758                                 break
 759                         fi
 760                         ha_sleep 1 >/dev/null
 761                 done
 762         done
 763 }
 764
 765 ha_power_down()
 766 {
 767         local nodes=$1
 768         local rc=1
 769         local i
 770
 771         if $ha_lfsck_bg && [[ ${nodes//,/ /} =~ $ha_lfsck_node ]]; then
 772                 ha_info "$ha_lfsck_node down, delay start LFSCK"
 773                 ha_lock $ha_lfsck_lock
 774         fi
 775
 776         ha_info "Powering down $nodes"
 777         for i in $(seq 1 5); do
 778                 $ha_power_down_cmd $nodes && rc=0 && break
 779                 sleep $ha_power_delay
 780         done
 781
 782         [ $rc -eq 0 ] || ha_info "Failed Powering down in $i attempts"
 783 }
 784
 785 ha_power_up()
 786 {
 787         local nodes=$1
 788         local rc=1
 789         local i
 790
 791         ha_info "Powering up $nodes"
 792         for i in $(seq 1 5); do
 793                 $ha_power_up_cmd $nodes && rc=0 && break
 794                 sleep $ha_power_delay
 795         done
 796
 797         [ $rc -eq 0 ] || ha_info "Failed Powering up in $i attempts"
 798 }
 799
 800 #
 801 # rand MAX
 802 #
 803 # Print a random integer within [0, MAX).
 804 #
 805 ha_rand()
 806 {
 807     local max=$1
 808
 809     #
 810     # See "5.2 Bash Variables" from "info bash".
 811     #
 812     echo -n $((RANDOM * max / 32768))
 813 }
 814
 815 ha_aim()
 816 {
 817         local i
 818         local nodes
 819
 820         if $ha_simultaneous ; then
 821                 nodes=$(echo ${ha_victims[@]})
 822                 nodes=${nodes// /,}
 823         else
 824                 i=$(ha_rand ${#ha_victims[@]})
 825                 nodes=${ha_victims[$i]}
 826         fi
 827
 828         echo -n $nodes
 829 }
 830
 831 ha_wait_nodes()
 832 {
 833         local nodes=$1
 834         local end=$(($(date +%s) + 10 * 60))
 835
 836         ha_info "Waiting for $nodes to boot up"
 837         until ha_on $nodes hostname >/dev/null 2>&1 ||
 838                 [ -e "$ha_stop_file" ] ||
 839                         (($(date +%s) >= end)); do
 840                 ha_sleep 1 >/dev/null
 841         done
 842 }
 843
 844 ha_failback()
 845 {
 846         local nodes=$1
 847         ha_info "Failback resources on $nodes in $ha_failback_delay sec"
 848
 849         ha_sleep $ha_failback_delay
 850         [ "$ha_failback_cmd" ] ||
 851         {
 852                 ha_info "No failback command set, skiping"
 853                 return 0
 854         }
 855
 856         $ha_failback_cmd $nodes
 857         [ -e $ha_lfsck_lock ] && ha_unlock $ha_lfsck_lock || true
 858 }
 859
 860 ha_summarize()
 861 {
 862     ha_info "---------------8<---------------"
 863     ha_info "Summary:"
 864     ha_info "    Duration: $(($(date +%s) - $ha_start_time))s"
 865     ha_info "    Loops: $ha_nr_loops"
 866 }
 867
 868 ha_killer()
 869 {
 870         local nodes
 871
 872         while (($(date +%s) < ha_start_time + ha_expected_duration)) &&
 873                         [ ! -e "$ha_stop_file" ]; do
 874                 ha_info "---------------8<---------------"
 875
 876                 $ha_workloads_only || nodes=$(ha_aim)
 877
 878                 ha_info "Failing $nodes"
 879                 $ha_workloads_only && ha_info "    is skipped: workload only..."
 880
 881                 ha_sleep $(ha_rand $ha_max_failover_period)
 882                 $ha_workloads_only || ha_power_down $nodes
 883                 ha_sleep 10
 884                 ha_wait_loads || return
 885
 886                 if [ -e $ha_stop_file ]; then
 887                         $ha_workloads_only || ha_power_up $nodes
 888                         break
 889                 fi
 890
 891                 ha_info "Bringing $nodes back"
 892                 ha_sleep $(ha_rand 10)
 893                 $ha_workloads_only ||
 894                 {
 895                         ha_power_up $nodes
 896                         ha_wait_nodes $nodes
 897                         ha_failback $nodes
 898                 }
 899
 900                 #
 901                 # Wait for the failback to start.
 902                 #
 903                 ha_sleep 60
 904                 ha_wait_loads || return
 905
 906                 ha_sleep $(ha_rand 20)
 907
 908                 ha_nr_loops=$((ha_nr_loops + 1))
 909                 ha_info "Loop $ha_nr_loops done"
 910         done
 911         ha_summarize
 912 }
 913
 914 ha_main()
 915 {
 916         ha_process_arguments "$@"
 917         ha_check_env
 918
 919         ha_log "${ha_clients[*]} ${ha_servers[*]}" \
 920                 "START: $0: $(date +%H:%M:%S' '%s)"
 921         trap ha_trap_exit EXIT
 922         mkdir "$ha_tmp_dir"
 923         ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
 924         ha_on ${ha_clients[0]} " \
 925                 $LFS setstripe $ha_stripe_params $ha_test_dir"
 926
 927         ha_start_loads
 928         ha_wait_loads
 929
 930         if $ha_workloads_dry_run; then
 931                 ha_sleep 5
 932         else
 933                 ha_killer
 934                 ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
 935         fi
 936
 937         ha_stop_loads
 938
 939         $ha_lfsck_after && ha_start_lfsck | tee -a $ha_lfsck_log
 940
 941         $ha_lfsck_fail_on_repaired && ha_lfsck_repaired
 942
 943         if [ -e "$ha_fail_file" ]; then
 944                 exit 1
 945         else
 946                 ha_log "${ha_clients[*]} ${ha_servers[*]}" \
 947                         "END: $0: $(date +%H:%M:%S' '%s)"
 948                 exit 0
 949         fi
 950 }
 951
 952 ha_main "$@"