lustre/tests/parallel-scale.sh

   1 #!/bin/bash
   2 #
   3 #set -vx
   4
   5 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
   6 . $LUSTRE/tests/test-framework.sh
   7 init_test_env $@
   8 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
   9
  10 #
  11 # compilbench
  12 #
  13 cbench_DIR=${cbench_DIR:-""}
  14 cbench_IDIRS=${cbench_IDIRS:-10}
  15 cbench_RUNS=${cbench_RUNS:-10}  # FIXME: wiki page requirements is 30, do we really need 30 ?
  16
  17 if [ "$SLOW" = "no" ]; then
  18     cbench_IDIRS=2
  19     cbench_RUNS=2
  20 fi
  21
  22 #
  23 # metabench
  24 #
  25 METABENCH=${METABENCH:-$(which metabench 2> /dev/null || true)}
  26 mbench_NFILES=${mbench_NFILES:-30400}
  27 [ "$SLOW" = "no" ] && mbench_NFILES=10000
  28 MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
  29 # threads per client
  30 mbench_THREADS=${mbench_THREADS:-4}
  31
  32 #
  33 # simul
  34 #
  35 SIMUL=${SIMUL:=$(which simul 2> /dev/null || true)}
  36 # threads per client
  37 simul_THREADS=${simul_THREADS:-2}
  38 simul_REP=${simul_REP:-20}
  39 [ "$SLOW" = "no" ] && simul_REP=2
  40
  41 #
  42 # connectathon
  43 #
  44 cnt_DIR=${cnt_DIR:-""}
  45 cnt_NRUN=${cnt_NRUN:-10}
  46 [ "$SLOW" = "no" ] && cnt_NRUN=2
  47
  48 #
  49 # cascading rw
  50 #
  51 CASC_RW=${CASC_RW:-$(which cascading_rw 2> /dev/null || true)}
  52 # threads per client
  53 casc_THREADS=${casc_THREADS:-2}
  54 casc_REP=${casc_REP:-300}
  55 [ "$SLOW" = "no" ] && casc_REP=10
  56
  57 #
  58 # IOR
  59 #
  60 IOR=${IOR:-$(which IOR 2> /dev/null || true)}
  61 # threads per client
  62 ior_THREADS=${ior_THREADS:-2}
  63 ior_blockSize=${ior_blockSize:-6}       # Gb
  64 ior_DURATION=${ior_DURATION:-30}        # minutes
  65 [ "$SLOW" = "no" ] && ior_DURATION=5
  66
  67 #
  68 # write_append_truncate
  69 #
  70 # threads per client
  71 write_THREADS=${write_THREADS:-8}
  72 write_REP=${write_REP:-10000}
  73 [ "$SLOW" = "no" ] && write_REP=100
  74
  75 #
  76 # write_disjoint
  77 #
  78 WRITE_DISJOINT=${WRITE_DISJOINT:-$(which write_disjoint 2> /dev/null || true)}
  79 # threads per client
  80 wdisjoint_THREADS=${wdisjoint_THREADS:-4}
  81 wdisjoint_REP=${wdisjoint_REP:-10000}
  82 [ "$SLOW" = "no" ] && wdisjoint_REP=100
  83
  84 #
  85 # parallel_grouplock
  86 #
  87 #
  88 PARALLEL_GROUPLOCK=${PARALLEL_GROUPLOCK:-$(which parallel_grouplock 2> /dev/null || true)}
  89 parallel_grouplock_MINTASKS=${parallel_grouplock_MINTASKS:-5}
  90
  91 build_test_filter
  92 check_and_setup_lustre
  93
  94 print_opts () {
  95     local var
  96
  97     echo OPTIONS:
  98
  99     for i in $@; do
 100         var=$i
 101         echo "${var}=${!var}"
 102     done
 103     [ -e $MACHINEFILE ] && cat $MACHINEFILE
 104 }
 105
 106 # Takes:
 107 # 5 min * cbench_RUNS
 108 #        SLOW=no     10 mins
 109 #        SLOW=yes    50 mins
 110 # Space estimation:
 111 #        compile dir kernel-1 680MB
 112 #        required space       680MB * cbench_IDIRS = ~7 Gb
 113
 114 test_compilebench() {
 115     print_opts cbench_DIR cbench_IDIRS cbench_RUNS
 116
 117     [ x$cbench_DIR = x ] &&
 118         { skip_env "compilebench not found" && return; }
 119
 120     [ -e $cbench_DIR/compilebench ] || \
 121         { skip_env "No compilebench build" && return; }
 122
 123     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 124     if [ $space -le $((680 * 1024 * cbench_IDIRS)) ]; then
 125         cbench_IDIRS=$(( space / 680 / 1024))
 126         [ $cbench_IDIRS = 0 ] && \
 127             skip_env "Need free space atleast 680 Mb, have $space" && return
 128
 129         log free space=$space, reducing initial dirs to $cbench_IDIRS
 130     fi
 131     # FIXME:
 132     # t-f _base needs to be modifyed to set properly tdir
 133     # for new "test_foo" functions names
 134     # local testdir=$DIR/$tdir
 135     local testdir=$DIR/d0.compilebench
 136     mkdir -p $testdir
 137
 138     local savePWD=$PWD
 139     cd $cbench_DIR
 140     local cmd="./compilebench -D $testdir -i $cbench_IDIRS -r $cbench_RUNS --makej"
 141
 142     log "$cmd"
 143
 144     local rc=0
 145     eval $cmd
 146     rc=$?
 147
 148     cd $savePWD
 149     [ $rc = 0 ] || error "compilebench failed: $rc"
 150     rm -rf $testdir
 151 }
 152 run_test compilebench "compilebench"
 153
 154 test_metabench() {
 155     [ x$METABENCH = x ] &&
 156         { skip_env "metabench not found" && return; }
 157
 158     local clients=$CLIENTS
 159     [ -z $clients ] && clients=$(hostname)
 160
 161     num_clients=$(get_node_count ${clients//,/ })
 162
 163     # FIXME
 164     # Need space estimation here.
 165
 166     generate_machine_file $clients $MACHINEFILE || \
 167         error "can not generate machinefile $MACHINEFILE"
 168
 169     print_opts METABENCH clients mbench_NFILES mbench_THREADS
 170
 171     local testdir=$DIR/d0.metabench
 172     mkdir -p $testdir
 173     # mpi_run uses mpiuser
 174     chmod 0777 $testdir
 175
 176     # -C             Run the file creation tests.
 177     # -S             Run the file stat tests.
 178     # -c nfile       Number of files to be used in each test.
 179     # -k             Cleanup.  Remove the test directories.
 180     local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
 181     echo "+ $cmd"
 182     mpi_run -np $((num_clients * $mbench_THREADS)) -machinefile ${MACHINEFILE} $cmd
 183     local rc=$?
 184     if [ $rc != 0 ] ; then
 185         error "metabench failed! $rc"
 186     fi
 187     rm -rf $testdir
 188 }
 189 run_test metabench "metabench"
 190
 191 test_simul() {
 192     [ x$SIMUL = x ] &&
 193         { skip_env "simul not found" && return; }
 194
 195     local clients=$CLIENTS
 196     [ -z $clients ] && clients=$(hostname)
 197
 198     local num_clients=$(get_node_count ${clients//,/ })
 199
 200     # FIXME
 201     # Need space estimation here.
 202
 203     generate_machine_file $clients $MACHINEFILE || \
 204         error "can not generate machinefile $MACHINEFILE"
 205
 206     print_opts SIMUL clients simul_REP simul_THREADS
 207
 208     local testdir=$DIR/d0.simul
 209     mkdir -p $testdir
 210     # mpi_run uses mpiuser
 211     chmod 0777 $testdir
 212
 213     # -n # : repeat each test # times
 214     # -N # : repeat the entire set of tests # times
 215
 216     local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
 217
 218     echo "+ $cmd"
 219     mpi_run -np $((num_clients * $simul_THREADS)) -machinefile ${MACHINEFILE} $cmd
 220
 221     local rc=$?
 222     if [ $rc != 0 ] ; then
 223         error "simul failed! $rc"
 224     fi
 225     rm -rf $testdir
 226 }
 227 run_test simul "simul"
 228
 229 test_connectathon() {
 230     print_opts cnt_DIR cnt_NRUN
 231
 232     [ x$cnt_DIR = x ] &&
 233         { skip_env "connectathon dir not found" && return; }
 234
 235     [ -e $cnt_DIR/runtests ] || \
 236         { skip_env "No connectathon runtests found" && return; }
 237
 238     local testdir=$DIR/d0.connectathon
 239     mkdir -p $testdir
 240
 241     local savePWD=$PWD
 242     cd $cnt_DIR
 243
 244     # -f      a quick functionality test
 245     # -a      run basic, general, special, and lock tests
 246     # -N numpasses - will be passed to the runtests script.  This argument
 247     #         is optional.  It specifies the number of times to run
 248     #         through the tests.
 249
 250     local cmd="./runtests -N $cnt_NRUN -a -f $testdir"
 251
 252     log "$cmd"
 253
 254     local rc=0
 255     eval $cmd
 256     rc=$?
 257
 258     cd $savePWD
 259     [ $rc = 0 ] || error "connectathon failed: $rc"
 260     rm -rf $testdir
 261 }
 262 run_test connectathon "connectathon"
 263
 264 test_ior() {
 265     [ x$IOR = x ] &&
 266         { skip_env "IOR not found" && return; }
 267
 268     local clients=$CLIENTS
 269     [ -z $clients ] && clients=$(hostname)
 270
 271     local num_clients=$(get_node_count ${clients//,/ })
 272
 273     local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
 274     echo "+ $ior_blockSize * 1024 * 1024 * $num_clients * $ior_THREADS "
 275     if [ $((space / 2)) -le $(( ior_blockSize * 1024 * 1024 * num_clients * ior_THREADS)) ]; then
 276         echo "+ $space * 9/10 / 1024 / 1024 / $num_clients / $ior_THREADS"
 277         ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
 278         [ $ior_blockSize = 0 ] && \
 279             skip_env "Need free space more than ($num_clients * $ior_THREADS )Gb: $((num_clients*ior_THREADS *1024 *1024*2)), have $space" && return
 280
 281         echo "free space=$space, Need: $num_clients x $ior_THREADS x $ior_blockSize Gb (blockSize reduced to $ior_blockSize Gb)"
 282     fi
 283
 284     generate_machine_file $clients $MACHINEFILE || \
 285         error "can not generate machinefile $MACHINEFILE"
 286
 287     print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
 288
 289     local testdir=$DIR/d0.ior
 290     mkdir -p $testdir
 291     # mpi_run uses mpiuser
 292     chmod 0777 $testdir
 293     $LFS setstripe $testdir -c -1
 294
 295     #
 296     # -b N  blockSize -- contiguous bytes to write per task  (e.g.: 8, 4k, 2m, 1g)"
 297     # -o S  testFileName
 298     # -t N  transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)"
 299     # -w    writeFile -- write file"
 300     # -r    readFile -- read existing file"
 301     # -T    maxTimeDuration -- max time in minutes to run tests"
 302     # -k    keepFile -- keep testFile(s) on program exit
 303     local cmd="$IOR -a POSIX -b ${ior_blockSize}g -o $testdir/iorData -t 2m -v -w -r -T $ior_DURATION -k"
 304
 305     echo "+ $cmd"
 306     mpi_run -np $((num_clients * $ior_THREADS)) -machinefile ${MACHINEFILE} $cmd
 307
 308     local rc=$?
 309     if [ $rc != 0 ] ; then
 310         error "ior failed! $rc"
 311     fi
 312     rm -rf $testdir
 313 }
 314 run_test ior "ior"
 315
 316 test_cascading_rw() {
 317     if [ "$NFSCLIENT" ]; then
 318         skip "skipped for NFSCLIENT mode"
 319     fi
 320
 321     [ x$CASC_RW = x ] &&
 322         { skip_env "cascading_rw not found" && return; }
 323
 324     local clients=$CLIENTS
 325     [ -z $clients ] && clients=$(hostname)
 326
 327     num_clients=$(get_node_count ${clients//,/ })
 328
 329     # FIXME
 330     # Need space estimation here.
 331
 332     generate_machine_file $clients $MACHINEFILE || \
 333         error "can not generate machinefile $MACHINEFILE"
 334
 335     print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
 336
 337     local testdir=$DIR/d0.cascading_rw
 338     mkdir -p $testdir
 339     # mpi_run uses mpiuser
 340     chmod 0777 $testdir
 341
 342     # -g: debug mode
 343     # -n: repeat test # times
 344
 345     local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
 346
 347     echo "+ $cmd"
 348     mpi_run -np $((num_clients * $casc_THREADS)) -machinefile ${MACHINEFILE} $cmd
 349
 350     local rc=$?
 351     if [ $rc != 0 ] ; then
 352         error "cascading_rw failed! $rc"
 353     fi
 354     rm -rf $testdir
 355 }
 356 run_test cascading_rw "cascading_rw"
 357
 358 test_write_append_truncate() {
 359     # location is lustre/tests dir
 360     if ! which write_append_truncate > /dev/null 2>&1 ; then
 361         skip_env "write_append_truncate not found"
 362         return
 363     fi
 364
 365     local clients=$CLIENTS
 366     [ -z $clients ] && clients=$(hostname)
 367
 368     local num_clients=$(get_node_count ${clients//,/ })
 369
 370     # FIXME
 371     # Need space estimation here.
 372
 373     generate_machine_file $clients $MACHINEFILE || \
 374         error "can not generate machinefile $MACHINEFILE"
 375
 376     local testdir=$DIR/d0.write_append_truncate
 377     local file=$testdir/f0.wat
 378
 379     print_opts clients write_REP write_THREADS MACHINEFILE
 380
 381     mkdir -p $testdir
 382     # mpi_run uses mpiuser
 383     chmod 0777 $testdir
 384
 385     local cmd="write_append_truncate -n $write_REP $file"
 386
 387     echo "+ $cmd"
 388     mpi_run -np $((num_clients * $write_THREADS)) -machinefile ${MACHINEFILE} $cmd
 389
 390     local rc=$?
 391     if [ $rc != 0 ] ; then
 392         error "write_append_truncate failed! $rc"
 393         return $rc
 394     fi
 395     rm -rf $testdir
 396 }
 397 run_test write_append_truncate "write_append_truncate"
 398
 399 test_write_disjoint() {
 400     [ x$WRITE_DISJOINT = x ] &&
 401         { skip_env "write_disjoint not found" && return; }
 402
 403     local clients=$CLIENTS
 404     [ -z $clients ] && clients=$(hostname)
 405
 406     local num_clients=$(get_node_count ${clients//,/ })
 407
 408     # FIXME
 409     # Need space estimation here.
 410
 411     generate_machine_file $clients $MACHINEFILE || \
 412         error "can not generate machinefile $MACHINEFILE"
 413
 414     print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP MACHINEFILE
 415     local testdir=$DIR/d0.write_disjoint
 416     mkdir -p $testdir
 417     # mpi_run uses mpiuser
 418     chmod 0777 $testdir
 419
 420     local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
 421
 422     echo "+ $cmd"
 423     mpi_run -np $((num_clients * $wdisjoint_THREADS)) -machinefile ${MACHINEFILE} $cmd
 424
 425     local rc=$?
 426     if [ $rc != 0 ] ; then
 427         error "write_disjoint failed! $rc"
 428     fi
 429     rm -rf $testdir
 430 }
 431 run_test write_disjoint "write_disjoint"
 432
 433 test_parallel_grouplock() {
 434     [ x$PARALLEL_GROUPLOCK = x ] &&
 435         { skip "PARALLEL_GROUPLOCK not found" && return; }
 436
 437     local clients=$CLIENTS
 438     [ -z $clients ] && clients=$(hostname)
 439
 440     local num_clients=$(get_node_count ${clients//,/ })
 441
 442     generate_machine_file $clients $MACHINEFILE || \
 443         error "can not generate machinefile $MACHINEFILE"
 444
 445     print_opts clients parallel_grouplock_MINTASKS MACHINEFILE
 446
 447     local testdir=$DIR/d0.parallel_grouplock
 448     mkdir -p $testdir
 449     # mpi_run uses mpiuser
 450     chmod 0777 $testdir
 451
 452     do_nodes $clients "lctl set_param llite.*.max_rw_chunk=0" ||
 453         error "set_param max_rw_chunk=0 failed "
 454
 455     local cmd
 456     local status=0
 457     local subtest
 458     for i in $(seq 12); do
 459         subtest="-t $i"
 460         local cmd="$PARALLEL_GROUPLOCK -g -v -d $testdir $subtest"
 461         echo "+ $cmd"
 462
 463         mpi_run -np $parallel_grouplock_MINTASKS -machinefile ${MACHINEFILE} $cmd
 464         local rc=$?
 465         if [ $rc != 0 ] ; then
 466             error_noexit "parallel_grouplock subtests $subtest failed! $rc"
 467         else
 468             echo "parallel_grouplock subtests $subtest PASS"
 469         fi
 470         let status=$((status + rc))
 471         # clear debug to collect one log per one test
 472         do_nodes $(comma_list $(nodes_list)) lctl clear
 473      done
 474     [ $status -eq 0 ] || error "parallel_grouplock status: $status"
 475     rm -rf $testdir
 476 }
 477 run_test parallel_grouplock "parallel_grouplock"
 478
 479 statahead_NUMMNTPTS=${statahead_NUMMNTPTS:-5}
 480 statahead_NUMFILES=${statahead_NUMFILES:-500000}
 481
 482 cleanup_statahead () {
 483     trap 0
 484
 485     local clients=$1
 486     local mntpt_root=$2
 487     local num_mntpts=$3
 488
 489     for i in $(seq 0 $num_mntpts);do
 490         zconf_umount_clients $clients ${mntpt_root}$i ||
 491             error_exit "Failed to umount lustre on ${mntpt_root}$i"
 492     done
 493 }
 494
 495 test_statahead () {
 496
 497     # create large dir
 498
 499     local dir=d0.statahead
 500     # FIXME has to use DIR
 501     local testdir=$DIR/$dir
 502
 503     mkdir -p $testdir
 504
 505     local num_files=$statahead_NUMFILES
 506
 507     local IFree=$(inodes_available)
 508     if [ $IFree -lt $num_files ]; then
 509       num_files=$IFree
 510     fi
 511
 512     cancel_lru_locks mdc
 513
 514     log "createmany -o $testdir/f-%d $num_files"
 515     createmany -o $testdir/$f-%d $num_files
 516
 517     local rc=$?
 518     if [ $rc != 0 ] ; then
 519         error "createmany failed to create $rc"
 520         return $rc
 521     fi
 522
 523     local num_mntpts=$statahead_NUMMNTPTS
 524     local mntpt_root=$TMP/mntpt/lustre
 525     mntopts=${MNTOPTSTATAHEAD:-$MOUNTOPT}
 526
 527     local clients=$CLIENTS
 528     [ -z $clients ] && clients=$(hostname)
 529
 530     echo "Mounting $num_mntpts lustre clients starts on $clients"
 531     trap "cleanup_statahead $clients $mntpt_root $num_mntpts" EXIT ERR
 532     for i in $(seq 0 $num_mntpts);do
 533         zconf_mount_clients $clients ${mntpt_root}$i $mntopts ||
 534             error_exit "Failed to mount lustre on ${mntpt_root}$i on $clients"
 535     done
 536
 537     do_rpc_nodes $clients cancel_lru_locks mdc
 538
 539     do_rpc_nodes $clients do_ls $mntpt_root $num_mntpts $dir
 540
 541     cleanup_statahead $clients $mntpt_root $num_mntpts
 542 }
 543
 544 run_test statahead "statahead test, multiple clients"
 545
 546 equals_msg `basename $0`: test complete, cleaning up
 547 check_and_cleanup_lustre
 548 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true