lustre/tests/parallel-scale.sh

   1 #!/bin/bash
   2 #
   3 #set -vx
   4
   5 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
   6 . $LUSTRE/tests/test-framework.sh
   7 init_test_env $@
   8 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
   9
  10 #              bug 20670           21255
  11 ALWAYS_EXCEPT="parallel_grouplock  statahead $PARALLEL_SCALE_EXCEPT"
  12
  13 #
  14 # compilbench
  15 #
  16 cbench_DIR=${cbench_DIR:-""}
  17 cbench_IDIRS=${cbench_IDIRS:-4}
  18 cbench_RUNS=${cbench_RUNS:-4}   # FIXME: wiki page requirements is 30, do we really need 30 ?
  19
  20 if [ "$SLOW" = "no" ]; then
  21     cbench_IDIRS=2
  22     cbench_RUNS=2
  23 fi
  24
  25 #
  26 # metabench
  27 #
  28 METABENCH=${METABENCH:-$(which metabench 2> /dev/null || true)}
  29 mbench_NFILES=${mbench_NFILES:-30400}
  30 [ "$SLOW" = "no" ] && mbench_NFILES=10000
  31 MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  32 # threads per client
  33 mbench_THREADS=${mbench_THREADS:-4}
  34
  35 #
  36 # simul
  37 #
  38 SIMUL=${SIMUL:=$(which simul 2> /dev/null || true)}
  39 # threads per client
  40 simul_THREADS=${simul_THREADS:-2}
  41 simul_REP=${simul_REP:-20}
  42 [ "$SLOW" = "no" ] && simul_REP=2
  43
  44 #
  45 # connectathon
  46 #
  47 cnt_DIR=${cnt_DIR:-""}
  48 cnt_NRUN=${cnt_NRUN:-10}
  49 [ "$SLOW" = "no" ] && cnt_NRUN=2
  50
  51 #
  52 # cascading rw
  53 #
  54 CASC_RW=${CASC_RW:-$(which cascading_rw 2> /dev/null || true)}
  55 # threads per client
  56 casc_THREADS=${casc_THREADS:-2}
  57 casc_REP=${casc_REP:-300}
  58 [ "$SLOW" = "no" ] && casc_REP=10
  59
  60 #
  61 # IOR
  62 #
  63 IOR=${IOR:-$(which IOR 2> /dev/null || true)}
  64 # threads per client
  65 ior_THREADS=${ior_THREADS:-2}
  66 ior_blockSize=${ior_blockSize:-6}       # Gb
  67 ior_DURATION=${ior_DURATION:-30}        # minutes
  68 [ "$SLOW" = "no" ] && ior_DURATION=5
  69
  70 #
  71 # write_append_truncate
  72 #
  73 # threads per client
  74 write_THREADS=${write_THREADS:-8}
  75 write_REP=${write_REP:-10000}
  76 [ "$SLOW" = "no" ] && write_REP=100
  77
  78 #
  79 # write_disjoint
  80 #
  81 WRITE_DISJOINT=${WRITE_DISJOINT:-$(which write_disjoint 2> /dev/null || true)}
  82 # threads per client
  83 wdisjoint_THREADS=${wdisjoint_THREADS:-4}
  84 wdisjoint_REP=${wdisjoint_REP:-10000}
  85 [ "$SLOW" = "no" ] && wdisjoint_REP=100
  86
  87 #
  88 # parallel_grouplock
  89 #
  90 #
  91 PARALLEL_GROUPLOCK=${PARALLEL_GROUPLOCK:-$(which parallel_grouplock 2> /dev/null || true)}
  92 parallel_grouplock_MINTASKS=${parallel_grouplock_MINTASKS:-5}
  93
  94 build_test_filter
  95 check_and_setup_lustre
  96
  97 get_mpiuser_id $MPI_USER
  98 MPI_RUNAS=${MPI_RUNAS:-"runas -u $MPI_USER_UID -g $MPI_USER_GID"}
  99 $GSS_KRB5 && refresh_krb5_tgt $MPI_USER_UID $MPI_USER_GID $MPI_RUNAS
 100
 101 print_opts () {
 102     local var
 103
 104     echo OPTIONS:
 105
 106     for i in $@; do
 107         var=$i
 108         echo "${var}=${!var}"
 109     done
 110     [ -e $MACHINEFILE ] && cat $MACHINEFILE
 111 }
 112
 113 # Takes:
 114 # 5 min * cbench_RUNS
 115 #        SLOW=no     10 mins
 116 #        SLOW=yes    50 mins
 117 # Space estimation:
 118 #        compile dir kernel-1 680MB
 119 #        required space       680MB * cbench_IDIRS = ~7 Gb
 120
 121 test_compilebench() {
 122     print_opts cbench_DIR cbench_IDIRS cbench_RUNS
 123
 124     [ x$cbench_DIR = x ] &&
 125         { skip_env "compilebench not found" && return; }
 126
 127     [ -e $cbench_DIR/compilebench ] || \
 128         { skip_env "No compilebench build" && return; }
 129
 130     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 131     if [ $space -le $((680 * 1024 * cbench_IDIRS)) ]; then
 132         cbench_IDIRS=$(( space / 680 / 1024))
 133         [ $cbench_IDIRS = 0 ] && \
 134             skip_env "Need free space atleast 680 Mb, have $space" && return
 135
 136         log free space=$space, reducing initial dirs to $cbench_IDIRS
 137     fi
 138     # FIXME:
 139     # t-f _base needs to be modifyed to set properly tdir
 140     # for new "test_foo" functions names
 141     # local testdir=$DIR/$tdir
 142     local testdir=$DIR/d0.compilebench
 143     mkdir -p $testdir
 144
 145     local savePWD=$PWD
 146     cd $cbench_DIR
 147     local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej"
 148
 149     log "$cmd"
 150
 151     local rc=0
 152     eval $cmd
 153     rc=$?
 154
 155     cd $savePWD
 156     [ $rc = 0 ] || error "compilebench failed: $rc"
 157     rm -rf $testdir
 158 }
 159 run_test compilebench "compilebench"
 160
 161 test_metabench() {
 162     [ x$METABENCH = x ] &&
 163         { skip_env "metabench not found" && return; }
 164
 165     local clients=$CLIENTS
 166     [ -z $clients ] && clients=$(hostname)
 167
 168     num_clients=$(get_node_count ${clients//,/ })
 169
 170     # FIXME
 171     # Need space estimation here.
 172
 173     generate_machine_file $clients $MACHINEFILE || \
 174         error "can not generate machinefile $MACHINEFILE"
 175
 176     print_opts METABENCH clients mbench_NFILES mbench_THREADS
 177
 178     local testdir=$DIR/d0.metabench
 179     mkdir -p $testdir
 180     # mpi_run uses mpiuser
 181     chmod 0777 $testdir
 182
 183     # -C             Run the file creation tests.
 184     # -S             Run the file stat tests.
 185     # -c nfile       Number of files to be used in each test.
 186     # -k             Cleanup.  Remove the test directories.
 187     local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
 188     echo "+ $cmd"
 189     mpi_run -np $((num_clients * $mbench_THREADS)) -machinefile ${MACHINEFILE} $cmd
 190     local rc=$?
 191     if [ $rc != 0 ] ; then
 192         error "metabench failed! $rc"
 193     fi
 194     rm -rf $testdir
 195 }
 196 run_test metabench "metabench"
 197
 198 test_simul() {
 199     [ x$SIMUL = x ] &&
 200         { skip_env "simul not found" && return; }
 201
 202     local clients=$CLIENTS
 203     [ -z $clients ] && clients=$(hostname)
 204
 205     local num_clients=$(get_node_count ${clients//,/ })
 206
 207     # FIXME
 208     # Need space estimation here.
 209
 210     generate_machine_file $clients $MACHINEFILE || \
 211         error "can not generate machinefile $MACHINEFILE"
 212
 213     print_opts SIMUL clients simul_REP simul_THREADS
 214
 215     local testdir=$DIR/d0.simul
 216     mkdir -p $testdir
 217     # mpi_run uses mpiuser
 218     chmod 0777 $testdir
 219
 220     # -n # : repeat each test # times
 221     # -N # : repeat the entire set of tests # times
 222
 223     local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
 224
 225     echo "+ $cmd"
 226     mpi_run -np $((num_clients * $simul_THREADS)) -machinefile ${MACHINEFILE} $cmd
 227
 228     local rc=$?
 229     if [ $rc != 0 ] ; then
 230         error "simul failed! $rc"
 231     fi
 232     rm -rf $testdir
 233 }
 234 run_test simul "simul"
 235
 236 test_connectathon() {
 237     print_opts cnt_DIR cnt_NRUN
 238
 239     [ x$cnt_DIR = x ] &&
 240         { skip_env "connectathon dir not found" && return; }
 241
 242     [ -e $cnt_DIR/runtests ] || \
 243         { skip_env "No connectathon runtests found" && return; }
 244
 245     local testdir=$DIR/d0.connectathon
 246     mkdir -p $testdir
 247
 248     local savePWD=$PWD
 249     cd $cnt_DIR
 250
 251     # -f      a quick functionality test
 252     # -a      run basic, general, special, and lock tests
 253     # -N numpasses - will be passed to the runtests script.  This argument
 254     #         is optional.  It specifies the number of times to run
 255     #         through the tests.
 256
 257     local cmd="./runtests -N $cnt_NRUN -a -f $testdir"
 258
 259     log "$cmd"
 260
 261     local rc=0
 262     eval $cmd
 263     rc=$?
 264
 265     cd $savePWD
 266     [ $rc = 0 ] || error "connectathon failed: $rc"
 267     rm -rf $testdir
 268 }
 269 run_test connectathon "connectathon"
 270
 271 test_ior() {
 272     [ x$IOR = x ] &&
 273         { skip_env "IOR not found" && return; }
 274
 275     local clients=$CLIENTS
 276     [ -z $clients ] && clients=$(hostname)
 277
 278     local num_clients=$(get_node_count ${clients//,/ })
 279
 280     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 281     echo "+ $ior_blockSize * 1024 * 1024 * $num_clients * $ior_THREADS "
 282     if [ $((space / 2)) -le $(( ior_blockSize * 1024 * 1024 * num_clients * ior_THREADS)) ]; then
 283         echo "+ $space * 9/10 / 1024 / 1024 / $num_clients / $ior_THREADS"
 284         ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
 285         [ $ior_blockSize = 0 ] && \
 286             skip_env "Need free space more than ($num_clients * $ior_THREADS )Gb: $((num_clients*ior_THREADS *1024 *1024*2)), have $space" && return
 287
 288         echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)"
 289     fi
 290
 291     generate_machine_file $clients $MACHINEFILE || \
 292         error "can not generate machinefile $MACHINEFILE"
 293
 294     print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
 295
 296     local testdir=$DIR/d0.ior
 297     mkdir -p $testdir
 298     # mpi_run uses mpiuser
 299     chmod 0777 $testdir
 300     $LFS setstripe $testdir -c -1
 301
 302     #
 303     # -b N  blockSize -- contiguous bytes to write per task  (e.g.: 8, 4k, 2m, 1g)"
 304     # -o S  testFileName
 305     # -t N  transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)"
 306     # -w    writeFile -- write file"
 307     # -r    readFile -- read existing file"
 308     # -T    maxTimeDuration -- max time in minutes to run tests"
 309     # -k    keepFile -- keep testFile(s) on program exit
 310     local cmd="$IOR -a POSIX -b ${ior_blockSize}g -o $testdir/iorData -t 2m -v -w -r -T $ior_DURATION -k"
 311
 312     echo "+ $cmd"
 313     mpi_run -np $((num_clients * $ior_THREADS)) -machinefile ${MACHINEFILE} $cmd
 314
 315     local rc=$?
 316     if [ $rc != 0 ] ; then
 317         error "ior failed! $rc"
 318     fi
 319     rm -rf $testdir
 320 }
 321 run_test ior "ior"
 322
 323 test_cascading_rw() {
 324     if [ "$NFSCLIENT" ]; then
 325         skip "skipped for NFSCLIENT mode"
 326     fi
 327
 328     [ x$CASC_RW = x ] &&
 329         { skip_env "cascading_rw not found" && return; }
 330
 331     local clients=$CLIENTS
 332     [ -z $clients ] && clients=$(hostname)
 333
 334     num_clients=$(get_node_count ${clients//,/ })
 335
 336     # FIXME
 337     # Need space estimation here.
 338
 339     generate_machine_file $clients $MACHINEFILE || \
 340         error "can not generate machinefile $MACHINEFILE"
 341
 342     print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
 343
 344     local testdir=$DIR/d0.cascading_rw
 345     mkdir -p $testdir
 346     # mpi_run uses mpiuser
 347     chmod 0777 $testdir
 348
 349     # -g: debug mode
 350     # -n: repeat test # times
 351
 352     local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
 353
 354     echo "+ $cmd"
 355     mpi_run -np $((num_clients * $casc_THREADS)) -machinefile ${MACHINEFILE} $cmd
 356
 357     local rc=$?
 358     if [ $rc != 0 ] ; then
 359         error "cascading_rw failed! $rc"
 360     fi
 361     rm -rf $testdir
 362 }
 363 run_test cascading_rw "cascading_rw"
 364
 365 test_write_append_truncate() {
 366     # location is lustre/tests dir
 367     if ! which write_append_truncate > /dev/null 2>&1 ; then
 368         skip_env "write_append_truncate not found"
 369         return
 370     fi
 371
 372     local clients=$CLIENTS
 373     [ -z $clients ] && clients=$(hostname)
 374
 375     local num_clients=$(get_node_count ${clients//,/ })
 376
 377     # FIXME
 378     # Need space estimation here.
 379
 380     generate_machine_file $clients $MACHINEFILE || \
 381         error "can not generate machinefile $MACHINEFILE"
 382
 383     local testdir=$DIR/d0.write_append_truncate
 384     local file=$testdir/f0.wat
 385
 386     print_opts clients write_REP write_THREADS MACHINEFILE
 387
 388     mkdir -p $testdir
 389     # mpi_run uses mpiuser
 390     chmod 0777 $testdir
 391
 392     local cmd="write_append_truncate -n $write_REP $file"
 393
 394     echo "+ $cmd"
 395     mpi_run -np $((num_clients * $write_THREADS)) -machinefile ${MACHINEFILE} $cmd
 396
 397     local rc=$?
 398     if [ $rc != 0 ] ; then
 399         error "write_append_truncate failed! $rc"
 400         return $rc
 401     fi
 402     rm -rf $testdir
 403 }
 404 run_test write_append_truncate "write_append_truncate"
 405
 406 test_write_disjoint() {
 407     [ x$WRITE_DISJOINT = x ] &&
 408         { skip_env "write_disjoint not found" && return; }
 409
 410     local clients=$CLIENTS
 411     [ -z $clients ] && clients=$(hostname)
 412
 413     local num_clients=$(get_node_count ${clients//,/ })
 414
 415     # FIXME
 416     # Need space estimation here.
 417
 418     generate_machine_file $clients $MACHINEFILE || \
 419         error "can not generate machinefile $MACHINEFILE"
 420
 421     print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP MACHINEFILE
 422     local testdir=$DIR/d0.write_disjoint
 423     mkdir -p $testdir
 424     # mpi_run uses mpiuser
 425     chmod 0777 $testdir
 426
 427     local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
 428
 429     echo "+ $cmd"
 430     mpi_run -np $((num_clients * $wdisjoint_THREADS)) -machinefile ${MACHINEFILE} $cmd
 431
 432     local rc=$?
 433     if [ $rc != 0 ] ; then
 434         error "write_disjoint failed! $rc"
 435     fi
 436     rm -rf $testdir
 437 }
 438 run_test write_disjoint "write_disjoint"
 439
 440 test_parallel_grouplock() {
 441     [ x$PARALLEL_GROUPLOCK = x ] &&
 442         { skip "PARALLEL_GROUPLOCK not found" && return; }
 443
 444     local clients=$CLIENTS
 445     [ -z $clients ] && clients=$(hostname)
 446
 447     local num_clients=$(get_node_count ${clients//,/ })
 448
 449     generate_machine_file $clients $MACHINEFILE || \
 450         error "can not generate machinefile $MACHINEFILE"
 451
 452     print_opts clients parallel_grouplock_MINTASKS MACHINEFILE
 453
 454     local testdir=$DIR/d0.parallel_grouplock
 455     mkdir -p $testdir
 456     # mpi_run uses mpiuser
 457     chmod 0777 $testdir
 458
 459     do_nodes $clients "lctl set_param llite.*.max_rw_chunk=0" ||
 460         error "set_param max_rw_chunk=0 failed "
 461
 462     local cmd
 463     local status=0
 464     local subtest
 465     for i in $(seq 12); do
 466         subtest="-t $i"
 467         local cmd="$PARALLEL_GROUPLOCK -g -v -d $testdir $subtest"
 468         echo "+ $cmd"
 469
 470         mpi_run -np $parallel_grouplock_MINTASKS -machinefile ${MACHINEFILE} $cmd
 471         local rc=$?
 472         if [ $rc != 0 ] ; then
 473             error_noexit "parallel_grouplock subtests $subtest failed! $rc"
 474         else
 475             echo "parallel_grouplock subtests $subtest PASS"
 476         fi
 477         let status=$((status + rc))
 478         # clear debug to collect one log per one test
 479         do_nodes $(comma_list $(nodes_list)) lctl clear
 480      done
 481     [ $status -eq 0 ] || error "parallel_grouplock status: $status"
 482     rm -rf $testdir
 483 }
 484 run_test parallel_grouplock "parallel_grouplock"
 485
 486 statahead_NUMMNTPTS=${statahead_NUMMNTPTS:-5}
 487 statahead_NUMFILES=${statahead_NUMFILES:-500000}
 488
 489 cleanup_statahead () {
 490     trap 0
 491
 492     local clients=$1
 493     local mntpt_root=$2
 494     local num_mntpts=$3
 495
 496     for i in $(seq 0 $num_mntpts);do
 497         zconf_umount_clients $clients ${mntpt_root}$i ||
 498             error_exit "Failed to umount lustre on ${mntpt_root}$i"
 499     done
 500 }
 501
 502 test_statahead () {
 503
 504     # create large dir
 505
 506     local dir=d0.statahead
 507     # FIXME has to use DIR
 508     local testdir=$DIR/$dir
 509
 510     mkdir -p $testdir
 511
 512     local num_files=$statahead_NUMFILES
 513
 514     local IFree=$(inodes_available)
 515     if [ $IFree -lt $num_files ]; then
 516       num_files=$IFree
 517     fi
 518
 519     cancel_lru_locks mdc
 520
 521     log "createmany -o $testdir/f-%d $num_files"
 522     createmany -o $testdir/$f-%d $num_files
 523
 524     local rc=$?
 525     if [ $rc != 0 ] ; then
 526         error "createmany failed to create $rc"
 527         return $rc
 528     fi
 529
 530     local num_mntpts=$statahead_NUMMNTPTS
 531     local mntpt_root=$TMP/mntpt/lustre
 532     mntopts=${MNTOPTSTATAHEAD:-$MOUNTOPT}
 533
 534     local clients=$CLIENTS
 535     [ -z $clients ] && clients=$(hostname)
 536
 537     echo "Mounting $num_mntpts lustre clients starts on $clients"
 538     trap "cleanup_statahead $clients $mntpt_root $num_mntpts" EXIT ERR
 539     for i in $(seq 0 $num_mntpts);do
 540         zconf_mount_clients $clients ${mntpt_root}$i $mntopts ||
 541             error_exit "Failed to mount lustre on ${mntpt_root}$i on $clients"
 542     done
 543
 544     do_rpc_nodes $clients cancel_lru_locks mdc
 545
 546     do_rpc_nodes $clients do_ls $mntpt_root $num_mntpts $dir
 547
 548     cleanup_statahead $clients $mntpt_root $num_mntpts
 549 }
 550
 551 run_test statahead "statahead test, multiple clients"
 552
 553 equals_msg `basename $0`: test complete, cleaning up
 554 check_and_cleanup_lustre
 555 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true