2 # -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
3 # vim:shiftwidth=4:softtabstop=4:tabstop=4:
5 # Simple function used by run_*.sh scripts
10 if [ -z "${!name}" ]; then
11 echo "$0: $name must be set"
15 [ $failed ] && exit 1 || true
18 # lrepl - Lustre test Read-Eval-Print Loop.
20 # This function implements a REPL for the Lustre test framework. It
21 # doesn't exec an actual shell because the user may want to inspect
22 # variables and use functions from the test framework.
This is an interactive read-eval-print loop shell
30 simulation that you can use to debug failing tests. You can
31 enter most bash command lines (see notes below).
33 Use this REPL to inspect variables, set them, call test
34 framework shell functions, etcetera.
36 'exit' or EOF to exit this shell.
38 set \$retcode to 0 to cause the assertion failure that
39 triggered this REPL to be ignored.
42 do_facet ost1 lctl get_param ost.*.ost.threads_*
43 do_rpc_nodes \$OSTNODES unload_modules
46 All but the last line of multi-line statements or blocks
47 must end in a backslash.
49 "Here documents" are not supported.
51 History is not supported, but command-line editing is.
55 # Prompt escapes don't work in read -p, sadly.
56 prompt=":test_${testnum:-UNKNOWN}:$(uname -n):$(basename $PWD)% "
58 # We use read -r to get close to a shell experience
59 while read -e -r -p "$prompt" rawline; do
62 # Don't want to exit-exit, just exit the REPL
64 # We need to handle continuations, and read -r doesn't do
65 # that for us. Yet we need read -r.
67 # We also use case/esac to compare lines read to "*\\"
68 # because [ "$line" = *\\ ] and variants of that don't work.
70 while read -e -r -p '> ' rawline
72 line="$line"$'\n'"$rawline"
74 # We could check for here documents by matching
75 # against *<<*, but who cares.
88 # Finally! Time to eval.
92 echo $'\n\tExiting interactive shell...\n'
96 # lassert - Lustre test framework assert
98 # Arguments: failure code, failure message, expression/statement
100 # lassert evaluates the expression given, and, if false, calls
101 # error() to trigger test failure. If REPL_ON_LASSERT is true then
102 # lassert will call lrepl() to give the user an interactive shell.
103 # If the REPL sets retcode=0 then the assertion failure will be
110 echo "checking $* ($(eval echo \""$*"\"))..."
111 eval "$@" && return 0;
113 if ${REPL_ON_LASSERT:-false}; then
114 echo "Assertion $retcode failed: $* (expanded: $(eval echo \""$*"\"))
119 error "Assertion $retcode failed: $* (expanded: $(eval echo \""$*"\"))
# setmodopts - set module options for subsequent calls to load_modules
126 # Usage: setmodopts module_name new_value [var_in_which_to_save_old_value]
127 # setmodopts -a module_name new_value [var_in_which_to_save_old_value]
129 # In the second usage the new value is appended to the old.
133 if [ "$1" = -a ]; then
138 local _var=MODOPTS_$1
143 # Dynamic naming of variables is a pain in bash. In ksh93 we could
144 # write "nameref opts_var=${modname}_MODOPTS" then assign directly
145 # to opts_var. Associative arrays would also help, alternatively.
146 # Alas, we're stuck with eval until all distros move to a more recent
147 # version of bash. Fortunately we don't need to eval unset and export.
149 if [ -z "$_newvalue" ]; then
155 $_append && _newvalue="$_oldvalue $_newvalue"
156 export $_var="$_newvalue"
157 echo setmodopts: ${_var}=${_newvalue}
159 [ -n "$_savevar" ] && eval $_savevar=\""$_oldvalue"\"
# echoerr - echo all arguments to standard error instead of standard out.
# Used for diagnostics so normal output streams stay clean.
echoerr () { echo "$@" 1>&2 ; }
165 echoerr "$(date +'%F %H:%M:%S'): client load was signaled to terminate"
167 local PGID=$(ps -eo "%c %p %r" | awk "/ $PPID / {print \$3}")
174 local mpirun="$MPIRUN $MPIRUN_OPTIONS"
175 local command="$mpirun $@"
176 local mpilog=$TMP/mpi.log
179 if [ -n "$MPI_USER" -a "$MPI_USER" != root -a -n "$mpirun" ]; then
180 echo "+ chmod 0777 $MOUNT"
182 command="su $MPI_USER sh -c \"$command \""
187 eval $command 2>&1 | tee $mpilog || true
190 if [ $rc -eq 0 ] && grep -q "p4_error:" $mpilog ; then
199 for i in ${1//,/ }; do
200 if [ "$list" = "" ]; then
203 list="$list$escape $i@$NETTYPE"
209 # FIXME: all setup/cleanup can be done without rpc.sh
212 [ x$1 = x--verbose ] && verbose=true
214 export LST_SESSION=`$LST show_session 2>/dev/null | awk -F " " '{print $5}'`
215 [ "$LST_SESSION" == "" ] && return
224 lst_session_cleanup_all () {
225 local list=$(comma_list $(nodes_list))
226 do_rpc_nodes $list lst_end_session
230 lsmod | grep -q lnet_selftest && \
231 rmmod lnet_selftest > /dev/null 2>&1 || true
235 local list=$(comma_list $(nodes_list))
237 # lst end_session needs to be executed only locally
238 # i.e. on node where lst new_session was called
239 lst_end_session --verbose
240 do_rpc_nodes $list lst_cleanup
244 load_module lnet_selftest
248 local list=$(comma_list $(nodes_list))
249 do_rpc_nodes $list lst_setup
255 # Passed a single argument, strips everything off following
256 # and includes the first period.
257 # client-20.lab.whamcloud.com becomes client-20
259 echo $(sed 's/\..*//' <<< $1)
265 # Find remote nodename, stripped of any domain, etc.
266 # 'hostname -s' is easy, but not implemented on all systems
268 local rname=$(do_node $1 "uname -n" || echo -1)
269 if [[ "$rname" = "-1" ]]; then
272 echo $(short_hostname $rname)
282 echo "${var}=${!var}"
284 [ -e $MACHINEFILE ] && cat $MACHINEFILE
289 # compile dir kernel-0 ~1GB
290 # required space ~1GB * cbench_IDIRS
292 cbench_DIR=${cbench_DIR:-""}
293 cbench_IDIRS=${cbench_IDIRS:-2}
294 cbench_RUNS=${cbench_RUNS:-2}
296 print_opts cbench_DIR cbench_IDIRS cbench_RUNS
298 [ x$cbench_DIR = x ] &&
299 { skip_env "compilebench not found" && return; }
301 [ -e $cbench_DIR/compilebench ] || \
302 { skip_env "No compilebench build" && return; }
304 local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
305 if [[ $space -le $((1024 * 1024 * cbench_IDIRS)) ]]; then
306 cbench_IDIRS=$((space / 1024 / 1024))
307 [[ $cbench_IDIRS -eq 0 ]] &&
308 skip_env "Need free space at least 1GB, have $space" &&
311 echo "free space=$space, reducing initial dirs to $cbench_IDIRS"
# t-f _base needs to be modified to properly set tdir
316 # for new "test_foo" functions names
317 # local testdir=$DIR/$tdir
318 local testdir=$DIR/d0.compilebench
323 local cmd="./compilebench -D $testdir -i $cbench_IDIRS \
324 -r $cbench_RUNS --makej"
333 [ $rc = 0 ] || error "compilebench failed: $rc"
339 METABENCH=${METABENCH:-$(which metabench 2> /dev/null || true)}
340 mbench_NFILES=${mbench_NFILES:-30400}
342 mbench_THREADS=${mbench_THREADS:-4}
343 mbench_OPTIONS=${mbench_OPTIONS:-}
345 [ x$METABENCH = x ] &&
346 { skip_env "metabench not found" && return; }
349 # Need space estimation here.
351 print_opts METABENCH clients mbench_NFILES mbench_THREADS
353 local testdir=$DIR/d0.metabench
355 # mpi_run uses mpiuser
358 # -C Run the file creation tests.
359 # -S Run the file stat tests.
360 # -c nfile Number of files to be used in each test.
361 # -k Cleanup. Remove the test directories.
362 local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k $mbench_OPTIONS"
365 # find out if we need to use srun by checking $SRUN_PARTITION
366 if [ "$SRUN_PARTITION" ]; then
367 $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
368 -n $((num_clients * mbench_THREADS)) \
369 -p $SRUN_PARTITION -- $cmd
371 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
372 -np $((num_clients * $mbench_THREADS)) $cmd
376 if [ $rc != 0 ] ; then
377 error "metabench failed! $rc"
384 SIMUL=${SIMUL:=$(which simul 2> /dev/null || true)}
386 simul_THREADS=${simul_THREADS:-2}
387 simul_REP=${simul_REP:-20}
389 if [ "$NFSCLIENT" ]; then
390 skip "skipped for NFSCLIENT mode"
395 { skip_env "simul not found" && return; }
398 # Need space estimation here.
400 print_opts SIMUL clients simul_REP simul_THREADS
402 local testdir=$DIR/d0.simul
404 # mpi_run uses mpiuser
407 # -n # : repeat each test # times
408 # -N # : repeat the entire set of tests # times
410 local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
413 # find out if we need to use srun by checking $SRUN_PARTITION
414 if [ "$SRUN_PARTITION" ]; then
415 $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
416 -n $((num_clients * simul_THREADS)) -p $SRUN_PARTITION \
419 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
420 -np $((num_clients * simul_THREADS)) $cmd
424 if [ $rc != 0 ] ; then
425 error "simul failed! $rc"
432 MDTEST=${MDTEST:=$(which mdtest 2> /dev/null || true)}
434 mdtest_THREADS=${mdtest_THREADS:-2}
435 mdtest_nFiles=${mdtest_nFiles:-"100000"}
# We divide the files by the number of cores
437 mdtest_nFiles=$((mdtest_nFiles/mdtest_THREADS/num_clients))
438 mdtest_iteration=${mdtest_iteration:-1}
440 local type=${1:-"ssf"}
442 if [ "$NFSCLIENT" ]; then
443 skip "skipped for NFSCLIENT mode"
448 { skip_env "mdtest not found" && return; }
451 # Need space estimation here.
453 print_opts MDTEST mdtest_iteration mdtest_THREADS mdtest_nFiles
455 local testdir=$DIR/d0.mdtest
457 # mpi_run uses mpiuser
460 # -i # : repeat each test # times
462 # -n # : number of file/dir to create/stat/remove
463 # -u : each process create/stat/remove individually
465 local cmd="$MDTEST -d $testdir -i $mdtest_iteration -n $mdtest_nFiles"
466 [ $type = "fpp" ] && cmd="$cmd -u"
469 # find out if we need to use srun by checking $SRUN_PARTITION
470 if [ "$SRUN_PARTITION" ]; then
471 $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
472 -n $((num_clients * mdtest_THREADS)) \
473 -p $SRUN_PARTITION -- $cmd
475 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
476 -np $((num_clients * mdtest_THREADS)) $cmd
480 if [ $rc != 0 ] ; then
481 error "mdtest failed! $rc"
488 cnt_DIR=${cnt_DIR:-""}
489 cnt_NRUN=${cnt_NRUN:-10}
491 print_opts cnt_DIR cnt_NRUN
494 { skip_env "connectathon dir not found" && return; }
496 [ -e $cnt_DIR/runtests ] || \
497 { skip_env "No connectathon runtests found" && return; }
499 local testdir=$DIR/d0.connectathon
506 # cthon options (must be in this order)
508 # -N numpasses - will be passed to the runtests script. This argument
509 # is optional. It specifies the number of times to run
512 # One of these test types
517 # -a all of the above
519 # -f a quick functionality test
523 # Include lock tests unless we're running on nfsv4
524 local fstype=$(df -TP $testdir | awk 'NR==2 {print $2}')
525 echo "$testdir: $fstype"
526 if [[ $fstype != "nfs4" ]]; then
530 for test in $tests; do
531 local cmd="./runtests -N $cnt_NRUN $test -f $testdir"
537 [ $rc = 0 ] || error "connectathon failed: $rc"
545 local type=${1:="ssf"}
547 IOR=${IOR:-$(which IOR 2> /dev/null || true)}
549 ior_THREADS=${ior_THREADS:-2}
550 ior_iteration=${ior_iteration:-1}
551 ior_blockSize=${ior_blockSize:-6} # GB
552 ior_xferSize=${ior_xferSize:-2m}
553 ior_type=${ior_type:-POSIX}
554 ior_DURATION=${ior_DURATION:-30} # minutes
557 { skip_env "IOR not found" && return; }
559 local space=$(df -P $DIR | tail -n 1 | awk '{ print $4 }')
560 local total_threads=$(( num_clients * ior_THREADS ))
561 echo "+ $ior_blockSize * 1024 * 1024 * $total_threads "
562 if [ $((space / 2)) -le \
563 $(( ior_blockSize * 1024 * 1024 * total_threads)) ]; then
564 echo "+ $space * 9/10 / 1024 / 1024 / $num_clients / $ior_THREADS"
565 ior_blockSize=$(( space /2 /1024 /1024 / num_clients / ior_THREADS ))
566 [ $ior_blockSize = 0 ] && \
567 skip_env "Need free space more than $((2 * total_threads))GB: \
568 $((total_threads *1024 *1024*2)), have $space" && return
570 local reduced_size="$num_clients x $ior_THREADS x $ior_blockSize"
571 echo "free space=$space, Need: $reduced_size GB"
572 echo "(blockSize reduced to $ior_blockSize Gb)"
575 print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
577 local testdir=$DIR/d0.ior.$type
579 # mpi_run uses mpiuser
581 if [ "$NFSCLIENT" ]; then
582 setstripe_nfsserver $testdir -c -1 ||
583 { error "setstripe on nfsserver failed" && return 1; }
585 $LFS setstripe $testdir -c -1 ||
586 { error "setstripe failed" && return 2; }
590 # contiguous bytes to write per task (e.g.: 8, 4k, 2m, 1g)"
592 # -t N transferSize -- size of transfer in bytes (e.g.: 8, 4k, 2m, 1g)"
593 # -w writeFile -- write file"
594 # -r readFile -- read existing file"
595 # -W checkWrite -- check read after write"
596 # -C reorderTasks -- changes task ordering to n+1 ordering for readback
597 # -T maxTimeDuration -- max time in minutes to run tests"
598 # -k keepFile -- keep testFile(s) on program exit
600 local cmd="$IOR -a $ior_type -b ${ior_blockSize}g -o $testdir/iorData \
601 -t $ior_xferSize -v -C -w -r -W -i $ior_iteration -T $ior_DURATION -k"
602 [ $type = "fpp" ] && cmd="$cmd -F"
605 # find out if we need to use srun by checking $SRUN_PARTITION
606 if [ "$SRUN_PARTITION" ]; then
607 $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
608 -n $((num_clients * ior_THREADS)) -p $SRUN_PARTITION \
611 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
612 -np $((num_clients * $ior_THREADS)) $cmd
616 if [ $rc != 0 ] ; then
617 error "ior failed! $rc"
624 MIB=${MIB:=$(which mib 2> /dev/null || true)}
626 mib_THREADS=${mib_THREADS:-2}
627 mib_xferSize=${mib_xferSize:-1m}
628 mib_xferLimit=${mib_xferLimit:-5000}
629 mib_timeLimit=${mib_timeLimit:-300}
631 if [ "$NFSCLIENT" ]; then
632 skip "skipped for NFSCLIENT mode"
637 { skip_env "MIB not found" && return; }
639 print_opts MIB mib_THREADS mib_xferSize mib_xferLimit mib_timeLimit \
642 local testdir=$DIR/d0.mib
644 # mpi_run uses mpiuser
646 $LFS setstripe $testdir -c -1 ||
647 { error "setstripe failed" && return 2; }
649 # -I Show intermediate values in output
650 # -H Show headers in output
651 # -L Do not issue new system calls after this many seconds
652 # -s Use system calls of this size
654 # -l Issue no more than this many system calls
655 local cmd="$MIB -t $testdir -s $mib_xferSize -l $mib_xferLimit \
656 -L $mib_timeLimit -HI -p mib.$(date +%Y%m%d%H%M%S)"
659 # find out if we need to use srun by checking $SRUN_PARTITION
660 if [ "$SRUN_PARTITION" ]; then
661 $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
662 -n $((num_clients * mib_THREADS)) -p $SRUN_PARTITION \
665 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
666 -np $((num_clients * mib_THREADS)) $cmd
670 if [ $rc != 0 ] ; then
671 error "mib failed! $rc"
678 CASC_RW=${CASC_RW:-$(which cascading_rw 2> /dev/null || true)}
680 casc_THREADS=${casc_THREADS:-2}
681 casc_REP=${casc_REP:-300}
683 if [ "$NFSCLIENT" ]; then
684 skip "skipped for NFSCLIENT mode"
689 { skip_env "cascading_rw not found" && return; }
692 # Need space estimation here.
694 print_opts CASC_RW clients casc_THREADS casc_REP MACHINEFILE
696 local testdir=$DIR/d0.cascading_rw
698 # mpi_run uses mpiuser
702 # -n: repeat test # times
704 local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
707 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
708 -np $((num_clients * $casc_THREADS)) $cmd
711 if [ $rc != 0 ] ; then
712 error "cascading_rw failed! $rc"
717 run_write_append_truncate() {
720 write_THREADS=${write_THREADS:-8}
721 write_REP=${write_REP:-10000}
723 if [ "$NFSCLIENT" ]; then
724 skip "skipped for NFSCLIENT mode"
728 # location is lustre/tests dir
729 if ! which write_append_truncate > /dev/null 2>&1 ; then
730 skip_env "write_append_truncate not found"
735 # Need space estimation here.
737 local testdir=$DIR/d0.write_append_truncate
738 local file=$testdir/f0.wat
740 print_opts clients write_REP write_THREADS MACHINEFILE
743 # mpi_run uses mpiuser
746 local cmd="write_append_truncate -n $write_REP $file"
749 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
750 -np $((num_clients * $write_THREADS)) $cmd
753 if [ $rc != 0 ] ; then
754 error "write_append_truncate failed! $rc"
760 run_write_disjoint() {
762 WRITE_DISJOINT=${WRITE_DISJOINT:-$(which write_disjoint \
763 2> /dev/null || true)}
765 wdisjoint_THREADS=${wdisjoint_THREADS:-4}
766 wdisjoint_REP=${wdisjoint_REP:-10000}
768 if [ "$NFSCLIENT" ]; then
769 skip "skipped for NFSCLIENT mode"
773 [ x$WRITE_DISJOINT = x ] &&
774 { skip_env "write_disjoint not found" && return; }
777 # Need space estimation here.
779 print_opts WRITE_DISJOINT clients wdisjoint_THREADS wdisjoint_REP \
781 local testdir=$DIR/d0.write_disjoint
783 # mpi_run uses mpiuser
786 local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
789 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
790 -np $((num_clients * $wdisjoint_THREADS)) $cmd
793 if [ $rc != 0 ] ; then
794 error "write_disjoint failed! $rc"
799 run_parallel_grouplock() {
801 PARALLEL_GROUPLOCK=${PARALLEL_GROUPLOCK:-$(which parallel_grouplock \
802 2> /dev/null || true)}
803 parallel_grouplock_MINTASKS=${parallel_grouplock_MINTASKS:-5}
805 if [ "$NFSCLIENT" ]; then
806 skip "skipped for NFSCLIENT mode"
810 [ x$PARALLEL_GROUPLOCK = x ] &&
811 { skip "PARALLEL_GROUPLOCK not found" && return; }
813 print_opts clients parallel_grouplock_MINTASKS MACHINEFILE
815 local testdir=$DIR/d0.parallel_grouplock
817 # mpi_run uses mpiuser
823 for i in $(seq 12); do
825 local cmd="$PARALLEL_GROUPLOCK -g -v -d $testdir $subtest"
828 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
829 -np $parallel_grouplock_MINTASKS $cmd
831 if [ $rc != 0 ] ; then
832 error_noexit "parallel_grouplock subtests $subtest " \
835 echo "parallel_grouplock subtests $subtest PASS"
837 let status=$((status + rc))
838 # clear debug to collect one log per one test
839 do_nodes $(comma_list $(nodes_list)) lctl clear
841 [ $status -eq 0 ] || error "parallel_grouplock status: $status"
845 cleanup_statahead () {
852 for i in $(seq 0 $num_mntpts);do
853 zconf_umount_clients $clients ${mntpt_root}$i ||
854 error_exit "Failed to umount lustre on ${mntpt_root}$i"
860 statahead_NUMMNTPTS=${statahead_NUMMNTPTS:-5}
861 statahead_NUMFILES=${statahead_NUMFILES:-500000}
863 if [[ -n $NFSCLIENT ]]; then
864 skip "Statahead testing is not supported on NFS clients."
869 { skip_env "mdsrate not found" && return; }
871 print_opts MDSRATE clients statahead_NUMMNTPTS statahead_NUMFILES
875 # do not use default "d[0-9]*" dir name
876 # to avoid of rm $statahead_NUMFILES (500k) files in t-f cleanup
878 local testdir=$DIR/$dir
880 # cleanup only if dir exists
881 # cleanup only $statahead_NUMFILES number of files
882 # ignore the other files created by someone else
884 mdsrate_cleanup $((num_clients * 32)) $MACHINEFILE \
885 $statahead_NUMFILES $testdir 'f%%d' --ignore
888 # mpi_run uses mpiuser
891 local num_files=$statahead_NUMFILES
893 local IFree=$(inodes_available)
894 if [ $IFree -lt $num_files ]; then
900 local cmd1="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir $testdir"
901 local cmd2="--nfiles $num_files --filefmt 'f%%d'"
902 local cmd="$cmd1 $cmd2"
905 mpi_run ${MACHINEFILE_OPTION} ${MACHINEFILE} \
906 -np $((num_clients * 32)) $cmd
909 if [ $rc != 0 ] ; then
910 error "mdsrate failed to create $rc"
914 local num_mntpts=$statahead_NUMMNTPTS
915 local mntpt_root=$TMP/mntpt/lustre
916 local mntopts=$MNTOPTSTATAHEAD
918 echo "Mounting $num_mntpts lustre clients starts on $clients"
919 trap "cleanup_statahead $clients $mntpt_root $num_mntpts" EXIT ERR
920 for i in $(seq 0 $num_mntpts); do
921 zconf_mount_clients $clients ${mntpt_root}$i "$mntopts" ||
922 error_exit "Failed to mount lustre on ${mntpt_root}$i on $clients"
925 do_rpc_nodes $clients cancel_lru_locks mdc
927 do_rpc_nodes $clients do_ls $mntpt_root $num_mntpts $dir
929 mdsrate_cleanup $((num_clients * 32)) $MACHINEFILE \
930 $num_files $testdir 'f%%d' --ignore
932 # use rm instead of rmdir because of
933 # testdir could contain the files created by someone else,
934 # or by previous run where is num_files prev > num_files current
936 cleanup_statahead $clients $mntpt_root $num_mntpts