lustre/tests/parallel-scale.sh

   1 #!/bin/bash
   2 #
   3 #set -vx
   4
   5 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
   6 . $LUSTRE/tests/test-framework.sh
   7 init_test_env $@
   8 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
   9
  10 #
  11 # compilbench
  12 #
  13 cbench_DIR=${cbench_DIR:-""}
  14 cbench_IDIRS=${cbench_IDIRS:-4}
  15 cbench_RUNS=${cbench_RUNS:-4}   # FIXME: wiki page requirements is 30, do we really need 30 ?
  16
  17 if [ "$SLOW" = "no" ]; then
  18     cbench_IDIRS=2
  19     cbench_RUNS=2
  20 fi
  21
  22 #
  23 # metabench
  24 #
  25 METABENCH=${METABENCH:-$(which metabench 2> /dev/null || true)}
  26 mbench_NFILES=${mbench_NFILES:-30400}
  27 [ "$SLOW" = "no" ] && mbench_NFILES=10000
  28 MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  29 # threads per client
  30 mbench_THREADS=${mbench_THREADS:-4}
  31
  32 #
  33 # simul
  34 #
  35 SIMUL=${SIMUL:=$(which simul 2> /dev/null || true)}
  36 # threads per client
  37 simul_THREADS=${simul_THREADS:-2}
  38 simul_REP=${simul_REP:-20}
  39 [ "$SLOW" = "no" ] && simul_REP=2
  40
  41 #
  42 # connectathon
  43 #
  44 cnt_DIR=${cnt_DIR:-""}
  45 cnt_NRUN=${cnt_NRUN:-10}
  46 [ "$SLOW" = "no" ] && cnt_NRUN=2
  47
  48 #
  49 # cascading rw
  50 #
  51 CASC_RW=${CASC_RW:-$(which cascading_rw 2> /dev/null || true)}
  52 # threads per client
  53 casc_THREADS=${casc_THREADS:-2}
  54 casc_REP=${casc_REP:-300}
  55 [ "$SLOW" = "no" ] && casc_REP=10
  56
  57 #
  58 # IOR
  59 #
  60 IOR=${IOR:-$(which IOR 2> /dev/null || true)}
  61 # threads per client
  62 ior_THREADS=${ior_THREADS:-2}
  63 ior_blockSize=${ior_blockSize:-6}       # Gb
  64 ior_DURATION=${ior_DURATION:-30}        # minutes
  65 [ "$SLOW" = "no" ] && ior_DURATION=5
  66
  67 #
  68 # write_append_truncate
  69 #
  70 # threads per client
  71 write_THREADS=${write_THREADS:-8}
  72 write_REP=${write_REP:-10000}
  73 [ "$SLOW" = "no" ] && write_REP=100
  74
  75 #
  76 # write_disjoint
  77 #
  78 WRITE_DISJOINT=${WRITE_DISJOINT:-$(which write_disjoint 2> /dev/null || true)}
  79 # threads per client
  80 wdisjoint_THREADS=${wdisjoint_THREADS:-4}
  81 wdisjoint_REP=${wdisjoint_REP:-10000}
  82 [ "$SLOW" = "no" ] && wdisjoint_REP=100
  83
  84 build_test_filter
  85 check_and_setup_lustre
  86
  87 get_mpiuser_id $MPI_USER
  88 MPI_RUNAS=${MPI_RUNAS:-"runas -u $MPI_USER_UID -g $MPI_USER_GID"}
  89 $GSS_KRB5 && refresh_krb5_tgt $MPI_USER_UID $MPI_USER_GID $MPI_RUNAS
  90
  91 print_opts () {
  92     local var
  93
  94     echo OPTIONS:
  95
  96     for i in $@; do
  97         var=$i
  98         echo "${var}=${!var}"
  99     done
 100     [ -e $MACHINEFILE ] && cat $MACHINEFILE
 101 }
 102
 103 # Takes:
 104 # 5 min * cbench_RUNS
 105 #        SLOW=no     10 mins
 106 #        SLOW=yes    50 mins
 107 # Space estimation:
 108 #        compile dir kernel-1 680MB
 109 #        required space       680MB * cbench_IDIRS = ~7 Gb
 110
 111 test_compilebench() {
 112     print_opts cbench_DIR cbench_IDIRS cbench_RUNS
 113
 114     [ x$cbench_DIR = x ] &&
 115         { skip_env "compilebench not found" && return; }
 116
 117     [ -e $cbench_DIR/compilebench ] || \
 118         { skip_env "No compilebench build" && return; }
 119
 120     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 121     if [ $space -le $((680 * 1024 * cbench_IDIRS)) ]; then
 122         cbench_IDIRS=$(( space / 680 / 1024))
 123         [ $cbench_IDIRS = 0 ] && \
 124             skip_env "Need free space atleast 680 Mb, have $space" && return
 125
 126         log free space=$space, reducing initial dirs to $cbench_IDIRS
 127     fi
 128     # FIXME:
 129     # t-f _base needs to be modifyed to set properly tdir
 130     # for new "test_foo" functions names
 131     # local testdir=$DIR/$tdir
 132     local testdir=$DIR/d0.compilebench
 133     mkdir -p $testdir
 134
 135     local savePWD=$PWD
 136     cd $cbench_DIR
 137     local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej"
 138
 139     log "$cmd"
 140
 141     local rc=0
 142     eval $cmd
 143     rc=$?
 144
 145     cd $savePWD
 146     [ $rc = 0 ] || error "compilebench failed: $rc"
 147     rm -rf $testdir
 148 }
 149 run_test compilebench "compilebench"
 150
 151 test_metabench() {
 152     [ x$METABENCH = x ] &&
 153         { skip_env "metabench not found" && return; }
 154
 155     local clients=$CLIENTS
 156     [ -z $clients ] && clients=$(hostname)
 157
 158     num_clients=$(get_node_count ${clients//,/ })
 159
 160     # FIXME
 161     # Need space estimation here.
 162
 163     generate_machine_file $clients $MACHINEFILE || \
 164         error "can not generate machinefile $MACHINEFILE"
 165
 166     print_opts METABENCH clients mbench_NFILES mbench_THREADS
 167
 168     local testdir=$DIR/d0.metabench
 169     mkdir -p $testdir
 170     # mpi_run uses mpiuser
 171     chmod 0777 $testdir
 172
 173     # -C             Run the file creation tests.
 174     # -S             Run the file stat tests.
 175     # -c nfile       Number of files to be used in each test.
 176     # -k             Cleanup.  Remove the test directories.
 177     local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
 178     echo "+ $cmd"
 179     mpi_run -np $((num_clients * $mbench_THREADS)) -machinefile ${MACHINEFILE} $cmd
 180     local rc=$?
 181     if [ $rc != 0 ] ; then
 182         error "metabench failed! $rc"
 183     fi
 184     rm -rf $testdir
 185 }
 186 run_test metabench "metabench"
 187
 188 test_simul() {
 189     [ x$SIMUL = x ] &&
 190         { skip_env "simul not found" && return; }
 191
 192     local clients=$CLIENTS
 193     [ -z $clients ] && clients=$(hostname)
 194
 195     local num_clients=$(get_node_count ${clients//,/ })
 196
 197     # FIXME
 198     # Need space estimation here.
 199
 200     generate_machine_file $clients $MACHINEFILE || \
 201         error "can not generate machinefile $MACHINEFILE"
 202
 203     print_opts SIMUL clients simul_REP simul_THREADS
 204
 205     local testdir=$DIR/d0.simul
 206     mkdir -p $testdir
 207     # mpi_run uses mpiuser
 208     chmod 0777 $testdir
 209
 210     # -n # : repeat each test # times
 211     # -N # : repeat the entire set of tests # times
 212
 213     local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
 214
 215     echo "+ $cmd"
 216     mpi_run -np $((num_clients * $simul_THREADS)) -machinefile ${MACHINEFILE} $cmd
 217
 218     local rc=$?
 219     if [ $rc != 0 ] ; then
 220         error "simul failed! $rc"
 221     fi
 222     rm -rf $testdir
 223 }
 224 run_test simul "simul"
 225
 226 test_connectathon() {
 227     print_opts cnt_DIR cnt_NRUN
 228
 229     [ x$cnt_DIR = x ] &&
 230         { skip_env "connectathon dir not found" && return; }
 231
 232     [ -e $cnt_DIR/runtests ] || \
 233         { skip_env "No connectathon runtests found" && return; }
 234
 235     local testdir=$DIR/d0.connectathon
 236     mkdir -p $testdir
 237
 238     local savePWD=$PWD
 239     cd $cnt_DIR
 240
 241     # -f      a quick functionality test
 242     # -a      run basic, general, special, and lock tests
 243     # -N numpasses - will be passed to the runtests script.  This argument
 244     #         is optional.  It specifies the number of times to run
 245     #         through the tests.
 246
 247     local cmd="./runtests -N $cnt_NRUN -a -f $testdir"
 248
 249     log "$cmd"
 250
 251     local rc=0
 252     eval $cmd
 253     rc=$?
 254
 255     cd $savePWD
 256     [ $rc = 0 ] || error "connectathon failed: $rc"
 257     rm -rf $testdir
 258 }
 259 run_test connectathon "connectathon"
 260
 261 test_ior() {
 262     [ x$IOR = x ] &&
 263         { skip_env "IOR not found" && return; }
 264
 265     local clients=$CLIENTS
 266     [ -z $clients ] && clients=$(hostname)
 267
 268     local num_clients=$(get_node_count ${clients//,/ })
 269
 270     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 271     echo "+ $ior_blockSize * 1024 * 1024 * $num_clients * $ior_THREADS "
 272     if [ $((space / 2)) -le $(( ior_blockSize * 1024 * 1024 * num_clients * ior_THREADS)) ]; then
 273         echo "+ $space * 9/10 / 1024 / 1024 / $num_clients / $ior_THREADS"
 274         ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
 275         [ $ior_blockSize = 0 ] && \
 276             skip_env "Need free space more than ($num_clients * $ior_THREADS )Gb: $((num_clients*ior_THREADS *1024 *1024*2)), have $space" && return
 277
 278         echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)"
 279     fi
 280
 281     generate_machine_file $clients $MACHINEFILE || \
 282         error "can not generate machinefile $MACHINEFILE"
 283
 284     print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
 285
 286     local testdir=$DIR/d0.ior
 287     mkdir -p $testdir
 288     # mpi_run uses mpiuser
 289     chmod 0777 $testdir
 290
 291     #
 292     # -b N  blockSize -- contiguous bytes to write per task  (e.g.: 8, 4k, 2m, 1g)"
 293     # -o S  testFileName
 294     # -t N  transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)"
 295     # -w    writeFile -- write file"
 296     # -r    readFile -- read existing file"
 297     # -T    maxTimeDuration -- max time in minutes to run tests"
 298     # -k    keepFile -- keep testFile(s) on program exit
 299     local cmd="$IOR -a POSIX -b ${ior_blockSize}g -o $testdir/iorData -t 2m -v -w -r -T $ior_DURATION -k"
 300
 301     echo "+ $cmd"
 302     mpi_run -np $((num_clients * $ior_THREADS)) -machinefile ${MACHINEFILE} $cmd
 303
 304     local rc=$?
 305     if [ $rc != 0 ] ; then
 306         error "ior failed! $rc"
 307     fi
 308     rm -rf $testdir
 309 }
 310 run_test ior "ior"
 311
 312 test_cascading_rw() {
 313     [ x$CASC_RW = x ] &&
 314         { skip_env "cascading_rw not found" && return; }
 315
 316     local clients=$CLIENTS
 317     [ -z $clients ] && clients=$(hostname)
 318
 319     num_clients=$(get_node_count ${clients//,/ })
 320
 321     # FIXME
 322     # Need space estimation here.
 323
 324     generate_machine_file $clients $MACHINEFILE || \
 325         error "can not generate machinefile $MACHINEFILE"
 326
 327     print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
 328
 329     local testdir=$DIR/d0.cascading_rw
 330     mkdir -p $testdir
 331     # mpi_run uses mpiuser
 332     chmod 0777 $testdir
 333
 334     # -g: debug mode
 335     # -n: repeat test # times
 336
 337     local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
 338
 339     echo "+ $cmd"
 340     mpi_run -np $((num_clients * $casc_THREADS)) -machinefile ${MACHINEFILE} $cmd
 341
 342     local rc=$?
 343     if [ $rc != 0 ] ; then
 344         error "cascading_rw failed! $rc"
 345     fi
 346     rm -rf $testdir
 347 }
 348 run_test cascading_rw "cascading_rw"
 349
 350 test_write_append_truncate() {
 351     # location is lustre/tests dir
 352     if ! which write_append_truncate > /dev/null 2>&1 ; then
 353         skip_env "write_append_truncate not found"
 354         return
 355     fi
 356
 357     local clients=$CLIENTS
 358     [ -z $clients ] && clients=$(hostname)
 359
 360     local num_clients=$(get_node_count ${clients//,/ })
 361
 362     # FIXME
 363     # Need space estimation here.
 364
 365     generate_machine_file $clients $MACHINEFILE || \
 366         error "can not generate machinefile $MACHINEFILE"
 367
 368     local testdir=$DIR/d0.write_append_truncate
 369     local file=$testdir/f0.wat
 370
 371     print_opts clients write_REP write_THREADS MACHINEFILE
 372
 373     mkdir -p $testdir
 374     # mpi_run uses mpiuser
 375     chmod 0777 $testdir
 376
 377     local cmd="write_append_truncate -n $write_REP $file"
 378
 379     echo "+ $cmd"
 380     mpi_run -np $((num_clients * $write_THREADS)) -machinefile ${MACHINEFILE} $cmd
 381
 382     local rc=$?
 383     if [ $rc != 0 ] ; then
 384         error "write_append_truncate failed! $rc"
 385         return $rc
 386     fi
 387     rm -rf $testdir
 388 }
 389 run_test write_append_truncate "write_append_truncate"
 390
 391 test_write_disjoint() {
 392     [ x$WRITE_DISJOINT = x ] &&
 393         { skip_env "write_disjoint not found" && return; }
 394
 395     local clients=$CLIENTS
 396     [ -z $clients ] && clients=$(hostname)
 397
 398     local num_clients=$(get_node_count ${clients//,/ })
 399
 400     # FIXME
 401     # Need space estimation here.
 402
 403     generate_machine_file $clients $MACHINEFILE || \
 404         error "can not generate machinefile $MACHINEFILE"
 405
 406     print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP MACHINEFILE
 407     local testdir=$DIR/d0.write_disjoint
 408     mkdir -p $testdir
 409     # mpi_run uses mpiuser
 410     chmod 0777 $testdir
 411
 412     local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
 413
 414     echo "+ $cmd"
 415     mpi_run -np $((num_clients * $wdisjoint_THREADS)) -machinefile ${MACHINEFILE} $cmd
 416
 417     local rc=$?
 418     if [ $rc != 0 ] ; then
 419         error "write_disjoint failed! $rc"
 420     fi
 421     rm -rf $testdir
 422 }
 423 run_test write_disjoint "write_disjoint"
 424
 425 equals_msg `basename $0`: test complete, cleaning up
 426 check_and_cleanup_lustre
 427 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true