lustre/tests/parallel-scale.sh

   1 #!/bin/bash
   2 #
   3 #set -vx
   4
   5 set -e
   6
   7 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
   8 . $LUSTRE/tests/test-framework.sh
   9 init_test_env $@
  10 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
  11
  12 #
  13 # compilbench
  14 #
  15 # Boulder cluster compilebench default location
  16 cbench_DIR=${cbench_DIR:-/testsuite/tests/$(arch)/compilebench}
  17 cbench_IDIRS=${cbench_IDIRS:-10}
  18 cbench_RUNS=${cbench_RUNS:-10}  # FIXME: wiki page requirements is 30, do we really need 30 ?
  19
  20 if [ "$SLOW" = "no" ]; then
  21     cbench_IDIRS=2
  22     cbench_RUNS=2
  23 fi
  24
  25 #
  26 # metabench
  27 #
  28 # Boulder cluster metabench default location
  29 METABENCH=${METABENCH:-/testsuite/tests/$(arch)/METABENCH/src/metabench}
  30 mbench_NFILES=${mbench_NFILES:-30400}
  31 [ "$SLOW" = "no" ] && mbench_NFILES=10000
  32 MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  33 # threads per client
  34 mbench_THREADS=${mbench_THREADS:-4}
  35
  36 #
  37 # simul
  38 #
  39 # Boulder cluster default location
  40 SIMUL=${SIMUL:-/testsuite/tests/$(arch)/simul/simul}
  41 # threads per client
  42 simul_THREADS=${simul_THREADS:-2}
  43 simul_REP=${simul_REP:-20}
  44 [ "$SLOW" = "no" ] && simul_REP=2
  45
  46 #
  47 # connectathon
  48 #
  49 # Boulder cluster default location
  50 cnt_DIR=${cnt_DIR:-/testsuite/tests/$(arch)/connectathon}
  51 cnt_NRUN=${cnt_NRUN:-10}
  52 [ "$SLOW" = "no" ] && cnt_NRUN=2
  53
  54 #
  55 # cascading rw
  56 #
  57 # Boulder cluster default location
  58 CASC_RW=${CASC_RW:-/testsuite/tests/$(arch)/parallel/cascading_rw}
  59 # threads per client
  60 casc_THREADS=${casc_THREADS:-2}
  61 casc_REP=${casc_REP:-300}
  62 [ "$SLOW" = "no" ] && casc_REP=10
  63
  64 #
  65 # IOR
  66 #
  67 # Boulder cluster default location
  68 IOR=${IOR:-/testsuite/tests/$(arch)/IOR/src/C/IOR}
  69 # threads per client
  70 ior_THREADS=${ior_THREADS:-2}
  71 ior_blockSize=${ior_blockSize:-6}       # Gb
  72 ior_DURATION=${ior_DURATION:-30}        # minutes
  73 [ "$SLOW" = "no" ] && ior_DURATION=5
  74
  75 #
  76 # write_append_truncate
  77 #
  78 # threads per client
  79 write_THREADS=${write_THREADS:-8}
  80 write_REP=${write_REP:-10000}
  81 [ "$SLOW" = "no" ] && write_REP=100
  82
  83 #
  84 # write_disjoint
  85 #
  86 # Boulder cluster default location
  87 WRITE_DISJOINT=${WRITE_DISJOINT:-/testsuite/tests/x86_64/lustre/lustre/tests/write_disjoint}
  88 # threads per client
  89 wdisjoint_THREADS=${wdisjoint_THREADS:-4}
  90 wdisjoint_REP=${wdisjoint_REP:-10000}
  91 [ "$SLOW" = "no" ] && wdisjoint_REP=100
  92
  93 build_test_filter
  94 check_and_setup_lustre
  95
  96 print_opts () {
  97     local var
  98
  99     echo OPTIONS:
 100
 101     for i in $@; do
 102         var=$i
 103         echo "${var}=${!var}"
 104     done
 105     [ -e $MACHINEFILE ] && cat $MACHINEFILE
 106 }
 107
 108 # Takes:
 109 # 5 min * cbench_RUNS
 110 #        SLOW=no     10 mins
 111 #        SLOW=yes    50 mins
 112 # Space estimation:
 113 #        compile dir kernel-1 680MB
 114 #        required space       680MB * cbench_IDIRS = ~7 Gb
 115
 116 test_compilebench() {
 117     print_opts cbench_DIR cbench_IDIRS cbench_RUNS
 118
 119     [ -d $cbench_DIR ] || \
 120         { skip "No compilebench found" && return; }
 121
 122     [ -e $cbench_DIR/compilebench ] || \
 123         { skip "No compilebench build" && return; }
 124
 125     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 126     if [ $space -le $((680 * 1024 * cbench_IDIRS)) ]; then
 127         cbench_IDIRS=$(( space / 680 / 1024))
 128         [ $cbench_IDIRS = 0 ] && \
 129             skip "Need free space atleast 680 Mb, have $space" && return
 130
 131         log free space=$space, reducing initial dirs to $cbench_IDIRS
 132     fi
 133     # FIXME:
 134     # t-f _base needs to be modifyed to set properly tdir
 135     # for new "test_foo" functions names
 136     # local testdir=$DIR/$tdir
 137     local testdir=$DIR/d0.compilebench
 138     mkdir -p $testdir
 139
 140     local savePWD=$PWD
 141     cd $cbench_DIR
 142     local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej"
 143
 144     log "$cmd"
 145
 146     local rc=0
 147     eval $cmd
 148     rc=$?
 149
 150     cd $savePWD
 151     [ $rc = 0 ] || error "compilebench failed: $rc"
 152     rm -rf $testdir
 153 }
 154 run_test compilebench "compilebench"
 155
 156 test_metabench() {
 157     [ -e $METABENCH ] || \
 158         { skip "metabench not found" && return; }
 159
 160     local clients=$CLIENTS
 161     [ -z $clients ] && clients=$(hostname)
 162
 163     num_clients=$(get_node_count ${clients//,/ })
 164
 165     # FIXME
 166     # Need space estimation here.
 167
 168     generate_machine_file $clients $MACHINEFILE || \
 169         error "can not generate machinefile $MACHINEFILE"
 170
 171     print_opts METABENCH clients mbench_NFILES mbench_THREADS
 172
 173     local testdir=$DIR/d0.metabench
 174     mkdir -p $testdir
 175     # mpi_run uses mpiuser
 176     chmod 0777 $testdir
 177
 178     # -C             Run the file creation tests.
 179     # -S             Run the file stat tests.
 180     # -c nfile       Number of files to be used in each test.
 181     # -k             Cleanup.  Remove the test directories.
 182     local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
 183     echo "+ $cmd"
 184     mpi_run -np $((num_clients * $mbench_THREADS)) -machinefile ${MACHINEFILE} $cmd
 185     local rc=$?
 186     if [ $rc != 0 ] ; then
 187         error "metabench failed! $rc"
 188     fi
 189     rm -rf $testdir
 190 }
 191 run_test metabench "metabench"
 192
 193 test_simul() {
 194     [ -e $SIMUL ] || \
 195         { skip "simul not found" && return; }
 196
 197     local clients=$CLIENTS
 198     [ -z $clients ] && clients=$(hostname)
 199
 200     local num_clients=$(get_node_count ${clients//,/ })
 201
 202     # FIXME
 203     # Need space estimation here.
 204
 205     generate_machine_file $clients $MACHINEFILE || \
 206         error "can not generate machinefile $MACHINEFILE"
 207
 208     print_opts SIMUL clients simul_REP simul_THREADS
 209
 210     local testdir=$DIR/d0.simul
 211     mkdir -p $testdir
 212     # mpi_run uses mpiuser
 213     chmod 0777 $testdir
 214
 215     # -n # : repeat each test # times
 216     # -N # : repeat the entire set of tests # times
 217
 218     local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
 219
 220     echo "+ $cmd"
 221     mpi_run -np $((num_clients * $simul_THREADS)) -machinefile ${MACHINEFILE} $cmd
 222
 223     local rc=$?
 224     if [ $rc != 0 ] ; then
 225         error "simul failed! $rc"
 226     fi
 227     rm -rf $testdir
 228 }
 229 run_test simul "simul"
 230
 231 test_connectathon() {
 232     print_opts cnt_DIR cnt_NRUN
 233
 234     [ -d $cnt_DIR ] || \
 235         { skip "No connectathon dir found" && return; }
 236
 237     [ -e $cnt_DIR/runtests ] || \
 238         { skip "No connectathon runtests found" && return; }
 239
 240     local testdir=$DIR/d0.connectathon
 241     mkdir -p $testdir
 242
 243     local savePWD=$PWD
 244     cd $cnt_DIR
 245
 246     # -f      a quick functionality test
 247     # -a      run basic, general, special, and lock tests
 248     # -N numpasses - will be passed to the runtests script.  This argument
 249     #         is optional.  It specifies the number of times to run
 250     #         through the tests.
 251
 252     local cmd="./runtests -N $cnt_NRUN -a -f $testdir"
 253
 254     log "$cmd"
 255
 256     local rc=0
 257     eval $cmd
 258     rc=$?
 259
 260     cd $savePWD
 261     [ $rc = 0 ] || error "connectathon failed: $rc"
 262     rm -rf $testdir
 263 }
 264 run_test connectathon "connectathon"
 265
 266 test_ior() {
 267     [ -e $IOR ] || \
 268         { skip "IOR not found" && return; }
 269
 270     local clients=$CLIENTS
 271     [ -z $clients ] && clients=$(hostname)
 272
 273     local num_clients=$(get_node_count ${clients//,/ })
 274
 275     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 276     echo "+ $ior_blockSize * 1024 * 1024 * $num_clients * $ior_THREADS "
 277     if [ $((space / 2)) -le $(( ior_blockSize * 1024 * 1024 * num_clients * ior_THREADS)) ]; then
 278         echo "+ $space * 9/10 / 1024 / 1024 / $num_clients / $ior_THREADS"
 279         ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
 280         [ $ior_blockSize = 0 ] && \
 281             skip "Need free space more than ($num_clients * $ior_THREADS )Gb: $((num_clients*ior_THREADS *1024 *1024*2)), have $space" && return
 282
 283         echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)"
 284     fi
 285
 286     generate_machine_file $clients $MACHINEFILE || \
 287         error "can not generate machinefile $MACHINEFILE"
 288
 289     print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
 290
 291     local testdir=$DIR/d0.ior
 292     mkdir -p $testdir
 293     # mpi_run uses mpiuser
 294     chmod 0777 $testdir
 295
 296     #
 297     # -b N  blockSize -- contiguous bytes to write per task  (e.g.: 8, 4k, 2m, 1g)"
 298     # -o S  testFileName
 299     # -t N  transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)"
 300     # -w    writeFile -- write file"
 301     # -r    readFile -- read existing file"
 302     # -T    maxTimeDuration -- max time in minutes to run tests"
 303     # -k    keepFile -- keep testFile(s) on program exit
 304     local cmd="$IOR -a POSIX -b ${ior_blockSize}g -o $testdir/iorData -t 2m -v -w -r -T $ior_DURATION -k"
 305
 306     echo "+ $cmd"
 307     mpi_run -np $((num_clients * $ior_THREADS)) -machinefile ${MACHINEFILE} $cmd
 308
 309     local rc=$?
 310     if [ $rc != 0 ] ; then
 311         error "ior failed! $rc"
 312     fi
 313     rm -rf $testdir
 314 }
 315 run_test ior "ior"
 316
 317 test_cascading_rw() {
 318     [ -e $CASC_RW ] || \
 319         { skip "cascading_rw not found" && return; }
 320
 321     local clients=$CLIENTS
 322     [ -z $clients ] && clients=$(hostname)
 323
 324     num_clients=$(get_node_count ${clients//,/ })
 325
 326     # FIXME
 327     # Need space estimation here.
 328
 329     generate_machine_file $clients $MACHINEFILE || \
 330         error "can not generate machinefile $MACHINEFILE"
 331
 332     print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
 333
 334     local testdir=$DIR/d0.cascading_rw
 335     mkdir -p $testdir
 336     # mpi_run uses mpiuser
 337     chmod 0777 $testdir
 338
 339     # -g: debug mode
 340     # -n: repeat test # times
 341
 342     local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
 343
 344     echo "+ $cmd"
 345     mpi_run -np $((num_clients * $casc_THREADS)) -machinefile ${MACHINEFILE} $cmd
 346
 347     local rc=$?
 348     if [ $rc != 0 ] ; then
 349         error "cascading_rw failed! $rc"
 350     fi
 351     rm -rf $testdir
 352 }
 353 run_test cascading_rw "cascading_rw"
 354
 355 test_write_append_truncate() {
 356     # location is lustre/tests dir
 357     if ! which write_append_truncate > /dev/null 2>&1 ; then
 358         skip "write_append_truncate not found"
 359         return
 360     fi
 361
 362     local clients=$CLIENTS
 363     [ -z $clients ] && clients=$(hostname)
 364
 365     local num_clients=$(get_node_count ${clients//,/ })
 366
 367     # FIXME
 368     # Need space estimation here.
 369
 370     generate_machine_file $clients $MACHINEFILE || \
 371         error "can not generate machinefile $MACHINEFILE"
 372
 373     local testdir=$DIR/d0.write_append_truncate
 374     local file=$testdir/f0.wat
 375
 376     print_opts clients write_REP write_THREADS MACHINEFILE
 377
 378     mkdir -p $testdir
 379     # mpi_run uses mpiuser
 380     chmod 0777 $testdir
 381
 382     local cmd="write_append_truncate -n $write_REP $file"
 383
 384     echo "+ $cmd"
 385     mpi_run -np $((num_clients * $write_THREADS)) -machinefile ${MACHINEFILE} $cmd
 386
 387     local rc=$?
 388     if [ $rc != 0 ] ; then
 389         error "write_append_truncate failed! $rc"
 390         return $rc
 391     fi
 392     rm -rf $testdir
 393 }
 394 run_test write_append_truncate "write_append_truncate"
 395
 396 test_write_disjoint() {
 397     [ -e $WRITE_DISJOINT ] || \
 398         { skip "write_disjoint not found" && return; }
 399
 400     local clients=$CLIENTS
 401     [ -z $clients ] && clients=$(hostname)
 402
 403     local num_clients=$(get_node_count ${clients//,/ })
 404
 405     # FIXME
 406     # Need space estimation here.
 407
 408     generate_machine_file $clients $MACHINEFILE || \
 409         error "can not generate machinefile $MACHINEFILE"
 410
 411     print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP MACHINEFILE
 412     local testdir=$DIR/d0.write_disjoint
 413     mkdir -p $testdir
 414     # mpi_run uses mpiuser
 415     chmod 0777 $testdir
 416
 417     local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
 418
 419     echo "+ $cmd"
 420     mpi_run -np $((num_clients * $wdisjoint_THREADS)) -machinefile ${MACHINEFILE} $cmd
 421
 422     local rc=$?
 423     if [ $rc != 0 ] ; then
 424         error "write_disjoint failed! $rc"
 425     fi
 426     rm -rf $testdir
 427 }
 428 run_test write_disjoint "write_disjoint"
 429
 430 equals_msg `basename $0`: test complete, cleaning up
 431 check_and_cleanup_lustre
 432 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true