lustre/tests/parallel-scale.sh

   1 #!/bin/bash
   2 #
   3 #set -vx
   4
   5 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
   6 . $LUSTRE/tests/test-framework.sh
   7 init_test_env $@
   8 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
   9
  10 #
  11 # compilbench
  12 #
  13 cbench_DIR=${cbench_DIR:-""}
  14 cbench_IDIRS=${cbench_IDIRS:-10}
  15 cbench_RUNS=${cbench_RUNS:-10}  # FIXME: wiki page requirements is 30, do we really need 30 ?
  16
  17 if [ "$SLOW" = "no" ]; then
  18     cbench_IDIRS=2
  19     cbench_RUNS=2
  20 fi
  21
  22 #
  23 # metabench
  24 #
  25 METABENCH=${METABENCH:-$(which metabench 2> /dev/null || true)}
  26 mbench_NFILES=${mbench_NFILES:-30400}
  27 [ "$SLOW" = "no" ] && mbench_NFILES=10000
  28 MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  29 # threads per client
  30 mbench_THREADS=${mbench_THREADS:-4}
  31
  32 #
  33 # simul
  34 #
  35 SIMUL=${SIMUL:=$(which simul 2> /dev/null || true)}
  36 # threads per client
  37 simul_THREADS=${simul_THREADS:-2}
  38 simul_REP=${simul_REP:-20}
  39 [ "$SLOW" = "no" ] && simul_REP=2
  40
  41 #
  42 # connectathon
  43 #
  44 cnt_DIR=${cnt_DIR:-""}
  45 cnt_NRUN=${cnt_NRUN:-10}
  46 [ "$SLOW" = "no" ] && cnt_NRUN=2
  47
  48 #
  49 # cascading rw
  50 #
  51 CASC_RW=${CASC_RW:-$(which cascading_rw 2> /dev/null || true)}
  52 # threads per client
  53 casc_THREADS=${casc_THREADS:-2}
  54 casc_REP=${casc_REP:-300}
  55 [ "$SLOW" = "no" ] && casc_REP=10
  56
  57 #
  58 # IOR
  59 #
  60 IOR=${IOR:-$(which IOR 2> /dev/null || true)}
  61 # threads per client
  62 ior_THREADS=${ior_THREADS:-2}
  63 ior_blockSize=${ior_blockSize:-6}       # Gb
  64 ior_DURATION=${ior_DURATION:-30}        # minutes
  65 [ "$SLOW" = "no" ] && ior_DURATION=5
  66
  67 #
  68 # write_append_truncate
  69 #
  70 # threads per client
  71 write_THREADS=${write_THREADS:-8}
  72 write_REP=${write_REP:-10000}
  73 [ "$SLOW" = "no" ] && write_REP=100
  74
  75 #
  76 # write_disjoint
  77 #
  78 WRITE_DISJOINT=${WRITE_DISJOINT:-$(which write_disjoint 2> /dev/null || true)}
  79 # threads per client
  80 wdisjoint_THREADS=${wdisjoint_THREADS:-4}
  81 wdisjoint_REP=${wdisjoint_REP:-10000}
  82 [ "$SLOW" = "no" ] && wdisjoint_REP=100
  83
  84 build_test_filter
  85 check_and_setup_lustre
  86
  87 print_opts () {
  88     local var
  89
  90     echo OPTIONS:
  91
  92     for i in $@; do
  93         var=$i
  94         echo "${var}=${!var}"
  95     done
  96     [ -e $MACHINEFILE ] && cat $MACHINEFILE
  97 }
  98
  99 # Takes:
 100 # 5 min * cbench_RUNS
 101 #        SLOW=no     10 mins
 102 #        SLOW=yes    50 mins
 103 # Space estimation:
 104 #        compile dir kernel-1 680MB
 105 #        required space       680MB * cbench_IDIRS = ~7 Gb
 106
 107 test_compilebench() {
 108     print_opts cbench_DIR cbench_IDIRS cbench_RUNS
 109
 110     [ x$cbench_DIR = x ] &&
 111         { skip "compilebench not found" && return; }
 112
 113     [ -e $cbench_DIR/compilebench ] || \
 114         { skip "No compilebench build" && return; }
 115
 116     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 117     if [ $space -le $((680 * 1024 * cbench_IDIRS)) ]; then
 118         cbench_IDIRS=$(( space / 680 / 1024))
 119         [ $cbench_IDIRS = 0 ] && \
 120             skip "Need free space atleast 680 Mb, have $space" && return
 121
 122         log free space=$space, reducing initial dirs to $cbench_IDIRS
 123     fi
 124     # FIXME:
 125     # t-f _base needs to be modifyed to set properly tdir
 126     # for new "test_foo" functions names
 127     # local testdir=$DIR/$tdir
 128     local testdir=$DIR/d0.compilebench
 129     mkdir -p $testdir
 130
 131     local savePWD=$PWD
 132     cd $cbench_DIR
 133     local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej"
 134
 135     log "$cmd"
 136
 137     local rc=0
 138     eval $cmd
 139     rc=$?
 140
 141     cd $savePWD
 142     [ $rc = 0 ] || error "compilebench failed: $rc"
 143     rm -rf $testdir
 144 }
 145 run_test compilebench "compilebench"
 146
 147 test_metabench() {
 148     [ x$METABENCH = x ] &&
 149         { skip "metabench not found" && return; }
 150
 151     local clients=$CLIENTS
 152     [ -z $clients ] && clients=$(hostname)
 153
 154     num_clients=$(get_node_count ${clients//,/ })
 155
 156     # FIXME
 157     # Need space estimation here.
 158
 159     generate_machine_file $clients $MACHINEFILE || \
 160         error "can not generate machinefile $MACHINEFILE"
 161
 162     print_opts METABENCH clients mbench_NFILES mbench_THREADS
 163
 164     local testdir=$DIR/d0.metabench
 165     mkdir -p $testdir
 166     # mpi_run uses mpiuser
 167     chmod 0777 $testdir
 168
 169     # -C             Run the file creation tests.
 170     # -S             Run the file stat tests.
 171     # -c nfile       Number of files to be used in each test.
 172     # -k             Cleanup.  Remove the test directories.
 173     local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
 174     echo "+ $cmd"
 175     mpi_run -np $((num_clients * $mbench_THREADS)) -machinefile ${MACHINEFILE} $cmd
 176     local rc=$?
 177     if [ $rc != 0 ] ; then
 178         error "metabench failed! $rc"
 179     fi
 180     rm -rf $testdir
 181 }
 182 run_test metabench "metabench"
 183
 184 test_simul() {
 185     [ x$SIMUL = x ] &&
 186         { skip "simul not found" && return; }
 187
 188     local clients=$CLIENTS
 189     [ -z $clients ] && clients=$(hostname)
 190
 191     local num_clients=$(get_node_count ${clients//,/ })
 192
 193     # FIXME
 194     # Need space estimation here.
 195
 196     generate_machine_file $clients $MACHINEFILE || \
 197         error "can not generate machinefile $MACHINEFILE"
 198
 199     print_opts SIMUL clients simul_REP simul_THREADS
 200
 201     local testdir=$DIR/d0.simul
 202     mkdir -p $testdir
 203     # mpi_run uses mpiuser
 204     chmod 0777 $testdir
 205
 206     # -n # : repeat each test # times
 207     # -N # : repeat the entire set of tests # times
 208
 209     local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
 210
 211     echo "+ $cmd"
 212     mpi_run -np $((num_clients * $simul_THREADS)) -machinefile ${MACHINEFILE} $cmd
 213
 214     local rc=$?
 215     if [ $rc != 0 ] ; then
 216         error "simul failed! $rc"
 217     fi
 218     rm -rf $testdir
 219 }
 220 run_test simul "simul"
 221
 222 test_connectathon() {
 223     print_opts cnt_DIR cnt_NRUN
 224
 225     [ x$cnt_DIR = x ] &&
 226         { skip "connectathon dir not found" && return; }
 227
 228     [ -e $cnt_DIR/runtests ] || \
 229         { skip "No connectathon runtests found" && return; }
 230
 231     local testdir=$DIR/d0.connectathon
 232     mkdir -p $testdir
 233
 234     local savePWD=$PWD
 235     cd $cnt_DIR
 236
 237     # -f      a quick functionality test
 238     # -a      run basic, general, special, and lock tests
 239     # -N numpasses - will be passed to the runtests script.  This argument
 240     #         is optional.  It specifies the number of times to run
 241     #         through the tests.
 242
 243     local cmd="./runtests -N $cnt_NRUN -a -f $testdir"
 244
 245     log "$cmd"
 246
 247     local rc=0
 248     eval $cmd
 249     rc=$?
 250
 251     cd $savePWD
 252     [ $rc = 0 ] || error "connectathon failed: $rc"
 253     rm -rf $testdir
 254 }
 255 run_test connectathon "connectathon"
 256
 257 test_ior() {
 258     [ x$IOR = x ] &&
 259         { skip "IOR not found" && return; }
 260
 261     local clients=$CLIENTS
 262     [ -z $clients ] && clients=$(hostname)
 263
 264     local num_clients=$(get_node_count ${clients//,/ })
 265
 266     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 267     echo "+ $ior_blockSize * 1024 * 1024 * $num_clients * $ior_THREADS "
 268     if [ $((space / 2)) -le $(( ior_blockSize * 1024 * 1024 * num_clients * ior_THREADS)) ]; then
 269         echo "+ $space * 9/10 / 1024 / 1024 / $num_clients / $ior_THREADS"
 270         ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
 271         [ $ior_blockSize = 0 ] && \
 272             skip "Need free space more than ($num_clients * $ior_THREADS )Gb: $((num_clients*ior_THREADS *1024 *1024*2)), have $space" && return
 273
 274         echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)"
 275     fi
 276
 277     generate_machine_file $clients $MACHINEFILE || \
 278         error "can not generate machinefile $MACHINEFILE"
 279
 280     print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
 281
 282     local testdir=$DIR/d0.ior
 283     mkdir -p $testdir
 284     # mpi_run uses mpiuser
 285     chmod 0777 $testdir
 286
 287     #
 288     # -b N  blockSize -- contiguous bytes to write per task  (e.g.: 8, 4k, 2m, 1g)"
 289     # -o S  testFileName
 290     # -t N  transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)"
 291     # -w    writeFile -- write file"
 292     # -r    readFile -- read existing file"
 293     # -T    maxTimeDuration -- max time in minutes to run tests"
 294     # -k    keepFile -- keep testFile(s) on program exit
 295     local cmd="$IOR -a POSIX -b ${ior_blockSize}g -o $testdir/iorData -t 2m -v -w -r -T $ior_DURATION -k"
 296
 297     echo "+ $cmd"
 298     mpi_run -np $((num_clients * $ior_THREADS)) -machinefile ${MACHINEFILE} $cmd
 299
 300     local rc=$?
 301     if [ $rc != 0 ] ; then
 302         error "ior failed! $rc"
 303     fi
 304     rm -rf $testdir
 305 }
 306 run_test ior "ior"
 307
 308 test_cascading_rw() {
 309     [ x$CASC_RW = x ] &&
 310         { skip "cascading_rw not found" && return; }
 311
 312     local clients=$CLIENTS
 313     [ -z $clients ] && clients=$(hostname)
 314
 315     num_clients=$(get_node_count ${clients//,/ })
 316
 317     # FIXME
 318     # Need space estimation here.
 319
 320     generate_machine_file $clients $MACHINEFILE || \
 321         error "can not generate machinefile $MACHINEFILE"
 322
 323     print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
 324
 325     local testdir=$DIR/d0.cascading_rw
 326     mkdir -p $testdir
 327     # mpi_run uses mpiuser
 328     chmod 0777 $testdir
 329
 330     # -g: debug mode
 331     # -n: repeat test # times
 332
 333     local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
 334
 335     echo "+ $cmd"
 336     mpi_run -np $((num_clients * $casc_THREADS)) -machinefile ${MACHINEFILE} $cmd
 337
 338     local rc=$?
 339     if [ $rc != 0 ] ; then
 340         error "cascading_rw failed! $rc"
 341     fi
 342     rm -rf $testdir
 343 }
 344 run_test cascading_rw "cascading_rw"
 345
 346 test_write_append_truncate() {
 347     # location is lustre/tests dir
 348     if ! which write_append_truncate > /dev/null 2>&1 ; then
 349         skip "write_append_truncate not found"
 350         return
 351     fi
 352
 353     local clients=$CLIENTS
 354     [ -z $clients ] && clients=$(hostname)
 355
 356     local num_clients=$(get_node_count ${clients//,/ })
 357
 358     # FIXME
 359     # Need space estimation here.
 360
 361     generate_machine_file $clients $MACHINEFILE || \
 362         error "can not generate machinefile $MACHINEFILE"
 363
 364     local testdir=$DIR/d0.write_append_truncate
 365     local file=$testdir/f0.wat
 366
 367     print_opts clients write_REP write_THREADS MACHINEFILE
 368
 369     mkdir -p $testdir
 370     # mpi_run uses mpiuser
 371     chmod 0777 $testdir
 372
 373     local cmd="write_append_truncate -n $write_REP $file"
 374
 375     echo "+ $cmd"
 376     mpi_run -np $((num_clients * $write_THREADS)) -machinefile ${MACHINEFILE} $cmd
 377
 378     local rc=$?
 379     if [ $rc != 0 ] ; then
 380         error "write_append_truncate failed! $rc"
 381         return $rc
 382     fi
 383     rm -rf $testdir
 384 }
 385 run_test write_append_truncate "write_append_truncate"
 386
 387 test_write_disjoint() {
 388     [ x$WRITE_DISJOINT = x ] &&
 389         { skip "write_disjoint not found" && return; }
 390
 391     local clients=$CLIENTS
 392     [ -z $clients ] && clients=$(hostname)
 393
 394     local num_clients=$(get_node_count ${clients//,/ })
 395
 396     # FIXME
 397     # Need space estimation here.
 398
 399     generate_machine_file $clients $MACHINEFILE || \
 400         error "can not generate machinefile $MACHINEFILE"
 401
 402     print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP MACHINEFILE
 403     local testdir=$DIR/d0.write_disjoint
 404     mkdir -p $testdir
 405     # mpi_run uses mpiuser
 406     chmod 0777 $testdir
 407
 408     local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
 409
 410     echo "+ $cmd"
 411     mpi_run -np $((num_clients * $wdisjoint_THREADS)) -machinefile ${MACHINEFILE} $cmd
 412
 413     local rc=$?
 414     if [ $rc != 0 ] ; then
 415         error "write_disjoint failed! $rc"
 416     fi
 417     rm -rf $testdir
 418 }
 419 run_test write_disjoint "write_disjoint"
 420
 421 equals_msg `basename $0`: test complete, cleaning up
 422 check_and_cleanup_lustre
 423 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true