lustre/tests/ha.sh

   1 #!/bin/bash
   2 # vim: expandtab tabstop=4 softtabstop=4 shiftwidth=4 textwidth=80
   3 #
   4 # NAME
   5 #
   6 #   ha.sh - test Lustre HA (aka failover) configurations
   7 #
   8 # SYNOPSIS
   9 #
  10 #   ha.sh [OPTIONS]
  11 #
  12 # DESCRIPTION
  13 #
  14 #   ha.sh tests Lustre HA (aka failover) configurations with a CRM.
  15 #
  16 # OPTIONS
  17 #
  18 #   -h
  19 #       Help.
  20 #
  21 #   -c HOST[,...]
  22 #       Specify client nodes.
  23 #
  24 #   -s HOST[,...]
  25 #       Specify server nodes.
  26 #
  27 #   -v HOST[,...]
  28 #       Specify victim nodes to be rebooted.
  29 #
  30 #   -d DIRECTORY
  31 #       Choose a parent of the test directory.  "/mnt/lustre" if not specified.
  32 #
  33 #   -u SECONDS
  34 #       Define a duration for the test. 86400 seconds if not specified.
  35 #
  36 #   -w
  37 #       Only run the workloads; no failure will be introduced.
  38 #
  39 # ASSUMPTIONS
  40 #
  41 #   A Lustre file system is up and mounted on all client nodes.  This script
  42 #   does not mount or unmount any Lustre targets or clients, let alone format
  43 #   anything.
  44 #
  45 #   Each target has a failnode, so that workloads can continue after a power
  46 #   failure.
  47 #
  48 #   Targets are automatically failed back when their primary node is back.  This
  49 #   assumption avoids calling CRM-specific commands to trigger failbacks, making
   50 #   this script more CRM-neutral.
  51 #
  52 #   A crash dump mechanism is configured to catch LBUGs, panics, etc.
  53 #
  54 # WORKLOADS
  55 #
  56 #   Each client runs the same set of MPI and non-MPI workloads.  These
  57 #   applications are run in short loops so that their exit status can be waited
  58 #   for and checked within reasonable time by ha_wait_loads.
  59 #
  60 # PROCESS STRUCTURE AND IPC
  61 #
  62 #   On the node where this script is run, the processes look like this:
  63 #
  64 #       ~ ha.sh (ha_killer)
  65 #
  66 #           ~ ha.sh (ha_repeat_mpi_load ior)
  67 #               ~ mpirun IOR
  68 #           ~ ha.sh (ha_repeat_mpi_load simul)
  69 #               ~ mpirun simul
  70 #           ~ ... (one for each MPI load)
  71 #
  72 #           ~ ha.sh (ha_repeat_nonmpi_load client2 dbench)
  73 #               ~ pdsh client2 dbench
  74 #           ~ ha.sh (ha_repeat_nonmpi_load client2 iozone)
  75 #               ~ pdsh client2 iozone
  76 #           ~ ha.sh (ha_repeat_nonmpi_load client5 iozone)
  77 #               ~ pdsh client5 iozone
  78 #           ~ ... (one for each non-MPI load on each client)
  79 #
  80 #   Each tilde represents a process.  Indentations imply parent-children
  81 #   relation.
  82 #
  83 #   IPC is done by files in the temporary directory.
  84 #
  85
  86 ha_info()
  87 {
  88     echo "$0: $(date +%s):" "$@"
  89 }
  90
  91 ha_error()
  92 {
  93     ha_info "$@" >&2
  94 }
  95
  96 ha_trap_err()
  97 {
  98     local i
  99
 100     ha_error "Trap ERR triggered by:"
 101     ha_error "    $BASH_COMMAND"
 102     ha_error "Call trace:"
 103     for ((i = 0; i < ${#FUNCNAME[@]}; i++)); do
 104         ha_error "    ${FUNCNAME[$i]} [${BASH_SOURCE[$i]}:${BASH_LINENO[$i]}]"
 105     done
 106 }
 107
 108 trap ha_trap_err ERR
 109 set -eE
 110
 111 declare     ha_tmp_dir=/tmp/$(basename $0)-$$
 112 declare     ha_stop_file=$ha_tmp_dir/stop
 113 declare     ha_fail_file=$ha_tmp_dir/fail
 114 declare     ha_status_file_prefix=$ha_tmp_dir/status
 115 declare -a  ha_status_files
 116 declare     ha_machine_file=$ha_tmp_dir/machine_file
 117 declare     ha_power_down_cmd=${POWER_DOWN:-pm -0}
 118 declare     ha_power_up_cmd=${POWER_UP:-pm -1}
 119 declare -a  ha_clients
 120 declare -a  ha_servers
 121 declare -a  ha_victims
 122 declare     ha_test_dir=/mnt/lustre/$(basename $0)-$$
 123 declare     ha_start_time=$(date +%s)
 124 declare     ha_expected_duration=$((60 * 60 * 24))
 125 declare     ha_nr_loops=0
 126 declare     ha_stop_signals="SIGINT SIGTERM SIGHUP"
 127 declare     ha_load_timeout=$((60 * 10))
 128 declare     ha_workloads_only=false
 129 declare -a  ha_mpi_load_tags=(
 130     ior
 131     simul
 132 )
 133 declare -a  ha_mpi_load_cmds=(
 134     "/testsuite/tests/x86_64/rhel5/IOR/src/C/IOR -b 256m -o {}/f.ior -t 2m
 135                                                  -w -W -T 1"
 136     "/testsuite/tests/x86_64/rhel5/simul/simul -d {}"
 137 )
 138 declare -a  ha_nonmpi_load_tags=(
 139     dd
 140     tar
 141 )
 142 declare -a  ha_nonmpi_load_cmds=(
 143     "dd if=/dev/zero of={}/f.dd bs=1M count=256"
 144     "tar cf - /etc/fonts | tar xf - -C {}"
 145 )
 146
 147 ha_usage()
 148 {
 149     ha_info "Usage: $0 -c HOST[,...] -s HOST[,...]"                         \
 150             "-v HOST[,...] [-d DIRECTORY] [-u SECONDS]"
 151 }
 152
 153 ha_process_arguments()
 154 {
 155     local opt
 156
 157     while getopts hc:s:v:d:u:w opt; do
 158         case $opt in
 159         h)
 160             ha_usage
 161             exit 0
 162             ;;
 163         c)
 164             ha_clients=(${OPTARG//,/ })
 165             ;;
 166         s)
 167             ha_servers=(${OPTARG//,/ })
 168             ;;
 169         v)
 170             ha_victims=(${OPTARG//,/ })
 171             ;;
 172         d)
 173             ha_test_dir=$OPTARG/$(basename $0)-$$
 174             ;;
 175         u)
 176             ha_expected_duration=$OPTARG
 177             ;;
 178         w)
 179             ha_workloads_only=true
 180             ;;
 181         \?)
 182             ha_usage
 183             exit 1
 184             ;;
 185         esac
 186     done
 187
 188     if [ -z "${ha_clients[*]}" ] ||                                         \
 189        [ -z "${ha_servers[*]}" ] ||                                         \
 190        [ -z "${ha_victims[*]}" ]; then
 191         ha_error "-c, -s, and -v are all mandatory"
 192         ha_usage
 193         exit 1
 194     fi
 195 }
 196
 197 ha_on()
 198 {
 199     local nodes=$1
 200
 201     shift
 202     pdsh -w $nodes PATH=/usr/kerberos/sbin:/usr/kerberos/bin:/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin "$@"
 203 }
 204
 205 ha_trap_exit()
 206 {
 207     if [ -e "$ha_fail_file" ]; then
 208         ha_info "Test directory $ha_test_dir not removed"
 209         ha_info "Temporary directory $ha_tmp_dir not removed"
 210     else
 211         ha_on ${ha_clients[0]} rm -rf "$ha_test_dir"
 212         rm -rf "$ha_tmp_dir"
 213     fi
 214 }
 215
 216 ha_trap_stop_signals()
 217 {
 218     ha_info "${ha_stop_signals// /,} received"
 219     touch "$ha_stop_file"
 220 }
 221
 222 ha_sleep()
 223 {
 224     local n=$1
 225
 226     ha_info "Sleeping for ${n}s"
 227     #
 228     # sleep(1) could interrupted.
 229     #
 230     sleep $n || true
 231 }
 232
 233 ha_lock()
 234 {
 235     local lock=$1
 236
 237     until mkdir "$lock" >/dev/null 2>&1; do
 238         ha_sleep 1 >/dev/null
 239     done
 240 }
 241
 242 ha_unlock()
 243 {
 244     local lock=$1
 245
 246     rm -r "$lock"
 247 }
 248
 249 ha_dump_logs()
 250 {
 251     local nodes=${1// /,}
 252     local file=/tmp/$(basename $0)-$$-$(date +%s).dk
 253     local lock=$ha_tmp_dir/lock-dump-logs
 254
 255     ha_lock "$lock"
 256     ha_info "Dumping lctl log to $file"
 257     ha_on $nodes "lctl dk >$file"
 258     ha_unlock "$lock"
 259 }
 260
 261 ha_repeat_mpi_load()
 262 {
 263     local load=$1
 264     local status=$2
 265     local tag=${ha_mpi_load_tags[$load]}
 266     local cmd=${ha_mpi_load_cmds[$load]}
 267     local dir=$ha_test_dir/$tag
 268     local log=$ha_tmp_dir/$tag
 269     local rc=0
 270     local nr_loops=0
 271     local start_time=$(date +%s)
 272
 273     cmd=${cmd//"{}"/$dir}
 274
 275     ha_info "Starting $tag"
 276
 277     while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
 278         {
 279             ha_on ${ha_clients[0]} mkdir -p "$dir" &&                       \
 280             mpirun -np ${#ha_clients[@]} -machinefile "$ha_machine_file"    \
 281                    $cmd &&                                                  \
 282             ha_on ${ha_clients[0]} rm -rf "$dir"
 283         } >>"$log" 2>&1 || rc=$?
 284
 285         if ((rc != 0)); then
 286             ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
 287             touch "$ha_fail_file"
 288             touch "$ha_stop_file"
 289         fi
 290         echo $rc >"$status"
 291
 292         nr_loops=$((nr_loops + 1))
 293     done
 294
 295     avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
 296
 297     ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
 298 }
 299
 300 ha_start_mpi_loads()
 301 {
 302     local client
 303     local load
 304     local tag
 305     local status
 306
 307     for client in ${ha_clients[@]}; do
 308         echo $client >>"$ha_machine_file"
 309     done
 310
 311     for ((load = 0; load < ${#ha_mpi_load_tags[@]}; load++)); do
 312         tag=${ha_mpi_load_tags[$load]}
 313         status=$ha_status_file_prefix-$tag
 314         ha_repeat_mpi_load $load $status &
 315         ha_status_files+=("$status")
 316     done
 317 }
 318
 319 ha_repeat_nonmpi_load()
 320 {
 321     local client=$1
 322     local load=$2
 323     local status=$3
 324     local tag=${ha_nonmpi_load_tags[$load]}
 325     local cmd=${ha_nonmpi_load_cmds[$load]}
 326     local dir=$ha_test_dir/$client-$tag
 327     local log=$ha_tmp_dir/$client-$tag
 328     local rc=0
 329     local nr_loops=0
 330     local start_time=$(date +%s)
 331
 332     cmd=${cmd//"{}"/$dir}
 333
 334     ha_info "Starting $tag on $client"
 335
 336     while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
 337         ha_on $client "mkdir -p $dir &&                                     \
 338                        $cmd &&                                              \
 339                        rm -rf $dir" >>"$log" 2>&1 || rc=$?
 340
 341         if ((rc != 0)); then
 342             ha_dump_logs "$client ${ha_servers[*]}"
 343             touch "$ha_fail_file"
 344             touch "$ha_stop_file"
 345         fi
 346         echo $rc >"$status"
 347
 348         nr_loops=$((nr_loops + 1))
 349     done
 350
 351     avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
 352
 353     ha_info "$tag on $client stopped: rc $rc avg loop time ${avg_loop_time}s"
 354 }
 355
 356 ha_start_nonmpi_loads()
 357 {
 358     local client
 359     local load
 360     local tag
 361     local status
 362
 363     for client in ${ha_clients[@]}; do
 364         for ((load = 0; load < ${#ha_nonmpi_load_tags[@]}; load++)); do
 365             tag=${ha_nonmpi_load_tags[$load]}
 366             status=$ha_status_file_prefix-$tag-$client
 367             ha_repeat_nonmpi_load $client $load $status &
 368             ha_status_files+=("$status")
 369         done
 370     done
 371 }
 372
 373 ha_start_loads()
 374 {
 375     trap ha_trap_stop_signals $ha_stop_signals
 376     ha_start_nonmpi_loads
 377     ha_start_mpi_loads
 378 }
 379
 380 ha_stop_loads()
 381 {
 382     touch $ha_stop_file
 383     trap - $ha_stop_signals
 384     ha_info "Waiting for workloads to stop"
 385     wait
 386 }
 387
 388 ha_wait_loads()
 389 {
 390     local file
 391     local end=$(($(date +%s) + ha_load_timeout))
 392
 393     ha_info "Waiting for workload status"
 394     rm -f "${ha_status_files[@]}"
 395     for file in "${ha_status_files[@]}"; do
 396         until [ -e "$ha_stop_file" ] ||
 397               [ -e "$file" ] ||
 398               (($(date +%s) >= end)); do
 399             ha_sleep 1 >/dev/null
 400         done
 401     done
 402 }
 403
 404 ha_power_down()
 405 {
 406     local node=$1
 407
 408     ha_info "Powering down $node"
 409     $ha_power_down_cmd $node
 410 }
 411
 412 ha_power_up()
 413 {
 414     local node=$1
 415
 416     ha_info "Powering up $node"
 417     $ha_power_up_cmd $node
 418 }
 419
 420 #
 421 # rand MAX
 422 #
 423 # Print a random integer within [0, MAX).
 424 #
 425 ha_rand()
 426 {
 427     local max=$1
 428
 429     #
 430     # See "5.2 Bash Variables" from "info bash".
 431     #
 432     echo -n $((RANDOM * max / 32768))
 433 }
 434
 435 ha_aim()
 436 {
 437     local i=$(ha_rand ${#ha_victims[@]})
 438
 439     echo -n ${ha_victims[$i]}
 440 }
 441
 442 ha_wait_node()
 443 {
 444     local node=$1
 445     local end=$(($(date +%s) + 5 * 60))
 446
 447     ha_info "Waiting for $node to boot up"
 448     until pdsh -w $node -S hostname >/dev/null 2>&1 ||
 449           [ -e "$ha_stop_file" ] ||
 450           (($(date +%s) >= end)); do
 451         ha_sleep 1 >/dev/null
 452     done
 453 }
 454
 455 ha_summarize()
 456 {
 457     ha_info "---------------8<---------------"
 458     ha_info "Summary:"
 459     ha_info "    Duration: $(($(date +%s) - $ha_start_time))s"
 460     ha_info "    Loops: $ha_nr_loops"
 461 }
 462
 463 ha_killer()
 464 {
 465     local node
 466
 467     while (($(date +%s) < ha_start_time + ha_expected_duration)) &&
 468           [ ! -e "$ha_stop_file" ]; do
 469         ha_info "---------------8<---------------"
 470
 471         node=$(ha_aim)
 472
 473         ha_info "Failing $node"
 474         ha_sleep $(ha_rand 10)
 475         ha_power_down $node
 476         ha_sleep 10
 477         ha_wait_loads || return
 478
 479         if [ -e $ha_stop_file ]; then
 480             ha_power_up $node
 481             break;
 482         fi
 483
 484         ha_info "Bringing $node back"
 485         ha_sleep $(ha_rand 10)
 486         ha_power_up $node
 487         ha_wait_node $node
 488         #
 489         # Wait for the failback to start.
 490         #
 491         ha_sleep 60
 492         ha_wait_loads || return
 493
 494         ha_sleep $(ha_rand 20)
 495
 496         ha_nr_loops=$((ha_nr_loops + 1))
 497         ha_info "Loop $ha_nr_loops done"
 498     done
 499     ha_summarize
 500 }
 501
 502 ha_main()
 503 {
 504     ha_process_arguments "$@"
 505
 506     trap ha_trap_exit EXIT
 507     mkdir "$ha_tmp_dir"
 508     ha_on ${ha_clients[0]} mkdir "$ha_test_dir"
 509
 510     ha_start_loads
 511     ha_wait_loads
 512
 513     if $ha_workloads_only; then
 514         ha_sleep $((60 * 60))
 515     else
 516         ha_killer
 517         ha_dump_logs
 518     fi
 519
 520     ha_stop_loads
 521
 522     if [ -e "$ha_fail_file" ]; then
 523         exit 1
 524     else
 525         exit 0
 526     fi
 527 }
 528
 529 ha_main "$@"