From de677fa6395b2646fbeb9822c59f3e4547aefba6 Mon Sep 17 00:00:00 2001
From: "Jay J. Lan"
Date: Tue, 18 Oct 2011 12:04:07 -0700
Subject: [PATCH] LU-676 tests: machinefile option for mpirun via a variable

Not all MPI implementations pass the host file to mpirun via the same
option. Common options are -machinefile and -hostfile. This problem
can be resolved by using the variable MACHINEFILE_OPTION instead; a
default value is assigned if the variable is not defined.

Signed-off-by: Jay J Lan
Signed-off-by: Bobi Jam
Change-Id: I3362a6e62a27318cff733aea2f99b1356b3ff02e
Reviewed-on: http://review.whamcloud.com/3873
Tested-by: Hudson
Tested-by: Maloo
Reviewed-by: Jian Yu
Reviewed-by: Oleg Drokin
---
 lustre/tests/Makefile.am              |   2 +-
 lustre/tests/cfg/local.sh             |   1 +
 lustre/tests/functions.sh             | 153 ++++++++++++++++++----------
 lustre/tests/ha.sh                    |  42 +++++-----
 lustre/tests/large-scale.sh           |  76 +++++++++--------
 lustre/tests/mdsrate-create-large.sh  |  33 ++++----
 lustre/tests/mdsrate-create-small.sh  |  26 +++---
 lustre/tests/mdsrate-lookup-10dirs.sh |  26 +++---
 lustre/tests/mdsrate-lookup-1dir.sh   |  19 +++--
 lustre/tests/mdsrate-stat-large.sh    |  13 +--
 lustre/tests/mdsrate-stat-small.sh    |  14 ++--
 lustre/tests/metadata-updates.sh      |  11 +--
 lustre/tests/run_IOR.sh               |   5 +-
 lustre/tests/test-framework.sh        |   9 +-
 14 files changed, 232 insertions(+), 198 deletions(-)

diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am
index d72683f..1d09245 100644
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -31,7 +31,7 @@ noinst_SCRIPTS += lnet-selftest.sh obdfilter-survey.sh mmp.sh mmp_mark.sh
 noinst_SCRIPTS += sgpdd-survey.sh maloo_upload.sh auster setup-nfs.sh
 noinst_SCRIPTS += mds-survey.sh parallel-scale-nfs.sh large-lun.sh
 noinst_SCRIPTS += parallel-scale-nfsv3.sh parallel-scale-nfsv4.sh
-noinst_SCRIPTS += posix.sh sanity-scrub.sh scrub-performance.sh
+noinst_SCRIPTS += posix.sh sanity-scrub.sh scrub-performance.sh ha.sh
 noinst_SCRIPTS += sanity-quota-old.sh
 noinst_SCRIPTS += resolveip
 nobase_noinst_SCRIPTS = cfg/local.sh
diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh
index 53135ae..9d63868 100644
--- a/lustre/tests/cfg/local.sh
+++ b/lustre/tests/cfg/local.sh
@@ -125,6 +125,7 @@ FAIL_ON_ERROR=${FAIL_ON_ERROR:-true}
 MPIRUN=$(which mpirun 2>/dev/null) || true
 MPI_USER=${MPI_USER:-mpiuser}
 SHARED_DIR_LOGS=${SHARED_DIR_LOGS:-""}
+MACHINEFILE_OPTION=${MACHINEFILE_OPTION:-"-machinefile"}
 
 # This is used by a small number of tests to share state between the client
 # running the tests, or in some cases between the servers (e.g. lfsck.sh).
diff --git a/lustre/tests/functions.sh b/lustre/tests/functions.sh
index b176b7b..3543946 100644
--- a/lustre/tests/functions.sh
+++ b/lustre/tests/functions.sh
@@ -346,14 +346,15 @@ run_metabench() {
 
     local cmd="$METABENCH -w $testdir -c $mbench_NFILES -C -S -k"
     echo "+ $cmd"
-    # find out if we need to use srun by checking $SRUN_PARTITION
-    if [ "$SRUN_PARTITION" ]; then
-        $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
-            -n $((num_clients * mbench_THREADS)) -p $SRUN_PARTITION -- $cmd
-    else
-        mpi_run -np $((num_clients * $mbench_THREADS)) \
-            -machinefile ${MACHINEFILE} $cmd
-    fi
+        # find out if we need to use srun by checking $SRUN_PARTITION
+        if [ "$SRUN_PARTITION" ]; then
+                $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
+                        -n $((num_clients * mbench_THREADS)) \
+                        -p $SRUN_PARTITION -- $cmd
+        else
+                mpi_run -np $((num_clients * $mbench_THREADS)) \
+                        ${MACHINEFILE_OPTION} ${MACHINEFILE} $cmd
+        fi
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -392,15 +393,16 @@ run_simul() {
 
     local cmd="$SIMUL -d $testdir -n $simul_REP -N $simul_REP"
-    echo "+ $cmd"
-    # find out if we need to use srun by checking $SRUN_PARTITION
-    if [ "$SRUN_PARTITION" ]; then
-        $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
-            -n $((num_clients * simul_THREADS)) -p $SRUN_PARTITION -- $cmd
-    else
-        mpi_run -np $((num_clients * simul_THREADS)) \
-            -machinefile ${MACHINEFILE} $cmd
-    fi
+        echo "+ $cmd"
+        # find out if we need to use srun by checking $SRUN_PARTITION
+        if [ "$SRUN_PARTITION" ]; then
+                $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
+                        -n $((num_clients * simul_THREADS)) -p $SRUN_PARTITION \
+                        -- $cmd
+        else
+                mpi_run -np $((num_clients * simul_THREADS)) \
+                        ${MACHINEFILE_OPTION} ${MACHINEFILE} $cmd
+        fi
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -447,15 +449,16 @@ run_mdtest() {
 
     local cmd="$MDTEST -d $testdir -i $mdtest_iteration -n $mdtest_nFiles"
     [ $type = "fpp" ] && cmd="$cmd -u"
-    echo "+ $cmd"
-    # find out if we need to use srun by checking $SRUN_PARTITION
-    if [ "$SRUN_PARTITION" ]; then
-        $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
-            -n $((num_clients * mdtest_THREADS)) -p $SRUN_PARTITION -- $cmd
-    else
-        mpi_run -np $((num_clients * mdtest_THREADS)) \
-            -machinefile ${MACHINEFILE} $cmd
-    fi
+        echo "+ $cmd"
+        # find out if we need to use srun by checking $SRUN_PARTITION
+        if [ "$SRUN_PARTITION" ]; then
+                $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
+                        -n $((num_clients * mdtest_THREADS)) \
+                        -p $SRUN_PARTITION -- $cmd
+        else
+                mpi_run -np $((num_clients * mdtest_THREADS)) \
+                        ${MACHINEFILE_OPTION} ${MACHINEFILE} $cmd
+        fi
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -580,15 +583,16 @@ run_ior() {
         -t $ior_xferSize -v -w -r -i $ior_iteration -T $ior_DURATION -k"
     [ $type = "fpp" ] && cmd="$cmd -F"
-    echo "+ $cmd"
-    # find out if we need to use srun by checking $SRUN_PARTITION
-    if [ "$SRUN_PARTITION" ]; then
-        $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
-            -n $((num_clients * ior_THREADS)) -p $SRUN_PARTITION -- $cmd
-    else
-        mpi_run -np $((num_clients * $ior_THREADS)) \
-            -machinefile ${MACHINEFILE} $cmd
-    fi
+        echo "+ $cmd"
+        # find out if we need to use srun by checking $SRUN_PARTITION
+        if [ "$SRUN_PARTITION" ]; then
+                $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
+                        -n $((num_clients * ior_THREADS)) -p $SRUN_PARTITION \
+                        -- $cmd
+        else
+                mpi_run -np $((num_clients * $ior_THREADS)) \
+                        ${MACHINEFILE_OPTION} ${MACHINEFILE} $cmd
+        fi
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -633,15 +637,16 @@ run_mib() {
 
     local cmd="$MIB -t $testdir -s $mib_xferSize -l $mib_xferLimit \
         -L $mib_timeLimit -HI -p mib.$(date +%Y%m%d%H%M%S)"
-    echo "+ $cmd"
-    # find out if we need to use srun by checking $SRUN_PARTITION
-    if [ "$SRUN_PARTITION" ]; then
-        $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
-            -n $((num_clients * mib_THREADS)) -p $SRUN_PARTITION -- $cmd
-    else
-        mpi_run -np $((num_clients * mib_THREADS)) \
-            -machinefile ${MACHINEFILE} $cmd
-    fi
+        echo "+ $cmd"
+        # find out if we need to use srun by checking $SRUN_PARTITION
+        if [ "$SRUN_PARTITION" ]; then
+                $SRUN $SRUN_OPTIONS -D $testdir -w $clients -N $num_clients \
+                        -n $((num_clients * mib_THREADS)) -p $SRUN_PARTITION \
+                        -- $cmd
+        else
+                mpi_run -np $((num_clients * mib_THREADS)) \
+                        ${MACHINEFILE_OPTION} ${MACHINEFILE} $cmd
+        fi
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -680,9 +685,9 @@ run_cascading_rw() {
 
     local cmd="$CASC_RW -g -d $testdir -n $casc_REP"
-    echo "+ $cmd"
-    mpi_run -np $((num_clients * $casc_THREADS)) \
-        -machinefile ${MACHINEFILE} $cmd
+        echo "+ $cmd"
+        mpi_run -np $((num_clients * $casc_THREADS)) ${MACHINEFILE_OPTION} \
+                ${MACHINEFILE} $cmd
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -722,9 +727,9 @@ run_write_append_truncate() {
 
     local cmd="write_append_truncate -n $write_REP $file"
-    echo "+ $cmd"
-    mpi_run -np $((num_clients * $write_THREADS)) \
-        -machinefile ${MACHINEFILE} $cmd
+        echo "+ $cmd"
+        mpi_run -np $((num_clients * $write_THREADS)) ${MACHINEFILE_OPTION} \
+                ${MACHINEFILE} $cmd
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -762,9 +767,9 @@ run_write_disjoint() {
 
     local cmd="$WRITE_DISJOINT -f $testdir/file -n $wdisjoint_REP"
-    echo "+ $cmd"
-    mpi_run -np $((num_clients * $wdisjoint_THREADS)) \
-        -machinefile ${MACHINEFILE} $cmd
+        echo "+ $cmd"
+        mpi_run -np $((num_clients * $wdisjoint_THREADS)) \
+                ${MACHINEFILE_OPTION} ${MACHINEFILE} $cmd
 
     local rc=$?
     if [ $rc != 0 ] ; then
@@ -800,25 +805,26 @@ run_parallel_grouplock() {
     local cmd
     local status=0
     local subtest
-    for i in $(seq 12); do
-        subtest="-t $i"
-        local cmd="$PARALLEL_GROUPLOCK -g -v -d $testdir $subtest"
-        echo "+ $cmd"
-
-        mpi_run -np $parallel_grouplock_MINTASKS \
-            -machinefile ${MACHINEFILE} $cmd
-        local rc=$?
-        if [ $rc != 0 ] ; then
-            error_noexit "parallel_grouplock subtests $subtest failed! $rc"
-        else
-            echo "parallel_grouplock subtests $subtest PASS"
-        fi
-        let status=$((status + rc))
-        # clear debug to collect one log per one test
-        do_nodes $(comma_list $(nodes_list)) lctl clear
-    done
-    [ $status -eq 0 ] || error "parallel_grouplock status: $status"
-    rm -rf $testdir
+        for i in $(seq 12); do
+                subtest="-t $i"
+                local cmd="$PARALLEL_GROUPLOCK -g -v -d $testdir $subtest"
+                echo "+ $cmd"
+
+                mpi_run -np $parallel_grouplock_MINTASKS ${MACHINEFILE_OPTION} \
+                        ${MACHINEFILE} $cmd
+                local rc=$?
+                if [ $rc != 0 ] ; then
+                        error_noexit "parallel_grouplock subtests $subtest " \
+                                "failed! $rc"
+                else
+                        echo "parallel_grouplock subtests $subtest PASS"
+                fi
+                let status=$((status + rc))
+                # clear debug to collect one log per one test
+                do_nodes $(comma_list $(nodes_list)) lctl clear
+        done
+        [ $status -eq 0 ] || error "parallel_grouplock status: $status"
+        rm -rf $testdir
 }
 
 cleanup_statahead () {
@@ -881,7 +887,8 @@ run_statahead () {
 
     local cmd="$cmd1 $cmd2"
     echo "+ $cmd"
-    mpi_run -np $((num_clients * 32)) -machinefile ${MACHINEFILE} $cmd
+        mpi_run -np $((num_clients * 32)) ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                $cmd
 
     local rc=$?
     if [ $rc != 0 ] ; then
diff --git a/lustre/tests/ha.sh b/lustre/tests/ha.sh
index 1c36fd9..e4f623d 100755
--- a/lustre/tests/ha.sh
+++ b/lustre/tests/ha.sh
@@ -277,27 +277,27 @@ ha_repeat_mpi_load()
 
     ha_info "Starting $tag"
 
-    while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
-        {
-            ha_on ${ha_clients[0]} mkdir -p "$dir" && \
-            mpirun -np ${#ha_clients[@]} -machinefile "$ha_machine_file" \
-                $cmd && \
-            ha_on ${ha_clients[0]} rm -rf "$dir"
-        } >>"$log" 2>&1 || rc=$?
-
-        if ((rc != 0)); then
-            ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
-            touch "$ha_fail_file"
-            touch "$ha_stop_file"
-        fi
-        echo $rc >"$status"
-
-        nr_loops=$((nr_loops + 1))
-    done
-
-    avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
-
-    ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
+        while [ ! -e "$ha_stop_file" ] && ((rc == 0)); do
+                {
+                        ha_on ${ha_clients[0]} mkdir -p "$dir" && \
+                        mpirun -np ${#ha_clients[@]} ${MACHINEFILE_OPTION} \
+                                "$ha_machine_file" $cmd && \
+                        ha_on ${ha_clients[0]} rm -rf "$dir"
+                } >>"$log" 2>&1 || rc=$?
+
+                if ((rc != 0)); then
+                        ha_dump_logs "${ha_clients[*]} ${ha_servers[*]}"
+                        touch "$ha_fail_file"
+                        touch "$ha_stop_file"
+                fi
+                echo $rc >"$status"
+
+                nr_loops=$((nr_loops + 1))
+        done
+
+        avg_loop_time=$((($(date +%s) - start_time) / nr_loops))
+
+        ha_info "$tag stopped: rc $rc avg loop time $avg_loop_time"
 }
 
 ha_start_mpi_loads()
diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh
index 8da9783..195f93b 100644
--- a/lustre/tests/large-scale.sh
+++ b/lustre/tests/large-scale.sh
@@ -71,42 +71,46 @@ test_3a() {
 
     local num=$increment
 
-    while [ $num -le $CLIENTCOUNT ]; do
-        list=$(comma_list ${nodes[@]:0:$num})
-
-        generate_machine_file $list $machinefile ||
-            { error "can not generate machinefile"; exit 1; }
-
-        for i in $(seq $iters); do
-            mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore
-
-            COMMAND="${MDSRATE} --create --nfiles $nfiles --dir $dir --filefmt 'f%%d'"
-            mpi_run -np $((num * nthreads)) -machinefile $machinefile ${COMMAND} | tee ${LOG} &
-
-            pid=$!
-            echo "pid=$pid"
-
-            # 2 threads 100000 creates 117 secs
-            sleep 20
-
-            log "$i : Starting failover on $SINGLEMDS"
-            facet_failover $SINGLEMDS
-            if ! wait_recovery_complete $SINGLEMDS $((TIMEOUT * 10)); then
-                echo "$SINGLEMDS recovery is not completed!"
-                kill -9 $pid
-                exit 7
-            fi
-
-            duration=$(do_facet $SINGLEMDS lctl get_param -n $procfile | grep recovery_duration)
-
-            res=( "${res[@]}" "$num" )
-            res=( "${res[@]}" "$duration" )
-            echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration"
-            wait $pid
-
-        done
-        num=$((num + increment))
-    done
+        while [ $num -le $CLIENTCOUNT ]; do
+                list=$(comma_list ${nodes[@]:0:$num})
+
+                generate_machine_file $list $machinefile ||
+                        { error "can not generate machinefile"; exit 1; }
+
+                for i in $(seq $iters); do
+                        mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' \
+                                --ignore
+
+                        COMMAND="${MDSRATE} --create --nfiles $nfiles --dir
+                                $dir --filefmt 'f%%d'"
+                        mpi_run -np $((num * nthreads)) ${MACHINEFILE_OPTION} \
+                                $machinefile ${COMMAND} | tee ${LOG} &
+
+                        pid=$!
+                        echo "pid=$pid"
+
+                        # 2 threads 100000 creates 117 secs
+                        sleep 20
+
+                        log "$i : Starting failover on $SINGLEMDS"
+                        facet_failover $SINGLEMDS
+                        if ! wait_recovery_complete $SINGLEMDS \
+                             $((TIMEOUT * 10)); then
+                                echo "$SINGLEMDS recovery is not completed!"
+                                kill -9 $pid
+                                exit 7
+                        fi
+
+                        duration=$(do_facet $SINGLEMDS lctl get_param -n \
+                                   $procfile | grep recovery_duration)
+
+                        res=( "${res[@]}" "$num" )
+                        res=( "${res[@]}" "$duration" )
+                        echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration"
+                        wait $pid
+                done
+                num=$((num + increment))
+        done
 
     mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore
 
diff --git a/lustre/tests/mdsrate-create-large.sh b/lustre/tests/mdsrate-create-large.sh
index 398eb1a..5b7414a 100644
--- a/lustre/tests/mdsrate-create-large.sh
+++ b/lustre/tests/mdsrate-create-large.sh
@@ -55,23 +55,24 @@ else
 
     log "===== $0 ### 1 NODE CREATE ###"
 
-    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-        --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
-    echo "+ ${COMMAND}"
-    mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} |
+                tee ${LOG}
 
     if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
        error "mdsrate creates for a single client failed, aborting"
    fi
-   
+
    log "===== $0 ### 1 NODE UNLINK ###"
 
    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
        --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
    echo "+ ${COMMAND}"
-   mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
-   
+   mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
        error "mdsrate unlink on a single client failed, aborting"
@@ -93,10 +94,11 @@ else
 
    log "===== $0 ### $NUM_CLIENTS NODES CREATE ###"
 
-   COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-       --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
-   echo "+ ${COMMAND}"
-   mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np ${NUM_CLIENTS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} | tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
@@ -105,10 +107,11 @@ else
 
    log "===== $0 ### $NUM_CLIENTS NODES UNLINK ###"
 
-   COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
-       --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
-   echo "+ ${COMMAND}"
-   mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink
+                --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'"
+        echo "+ ${COMMAND}"
+        mpi_run -np ${NUM_CLIENTS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} | tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
diff --git a/lustre/tests/mdsrate-create-small.sh b/lustre/tests/mdsrate-create-small.sh
index 8cad213..dcb8d56 100644
--- a/lustre/tests/mdsrate-create-small.sh
+++ b/lustre/tests/mdsrate-create-small.sh
@@ -63,10 +63,11 @@ else
 
    log "===== $0 ### 1 NODE CREATE ###"
 
-   COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
-       --nfiles $NUM_FILES --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
-   echo "+ ${COMMAND}"
-   mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD}
+                --nfiles $NUM_FILES --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'"
+ echo "+ ${COMMAND}" + mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} | + tee ${LOG} if [ ${PIPESTATUS[0]} != 0 ]; then [ -f $LOG ] && sed -e "s/^/log: /" $LOG @@ -82,7 +83,8 @@ else COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --nfiles ${NUM_FILES} --dir ${TESTDIR_SINGLE} --filefmt 'f%%d'" echo "+ ${COMMAND}" - mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG} + mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} | + tee ${LOG} if [ ${PIPESTATUS[0]} != 0 ]; then [ -f $LOG ] && sed -e "s/^/log: /" $LOG @@ -108,11 +110,11 @@ else log "===== $0 ### $NUM_CLIENTS NODES CREATE with $THREADS_PER_CLIENT threads per client ###" - COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD} - --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" - echo "+ ${COMMAND}" - mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \ - ${COMMAND} | tee ${LOG} + COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --create --time ${TIME_PERIOD} + --nfiles $NUM_FILES --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" + echo "+ ${COMMAND}" + mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) \ + ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} | tee ${LOG} if [ ${PIPESTATUS[0]} != 0 ]; then [ -f $LOG ] && sed -e "s/^/log: /" $LOG error "mdsrate create on multiple nodes failed, aborting" @@ -127,8 +129,8 @@ else COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --unlink --nfiles ${NUM_FILES} --dir ${TESTDIR_MULTI} --filefmt 'f%%d'" echo "+ ${COMMAND}" - mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \ - ${COMMAND} | tee ${LOG} + mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) \ + ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} | tee ${LOG} if [ ${PIPESTATUS[0]} != 0 ]; then [ -f $LOG ] && sed -e "s/^/log: /" $LOG error "mdsrate unlinks multiple nodes failed, aborting" diff --git a/lustre/tests/mdsrate-lookup-10dirs.sh b/lustre/tests/mdsrate-lookup-10dirs.sh index 22ad632..6dd5b11 100644 --- a/lustre/tests/mdsrate-lookup-10dirs.sh +++ b/lustre/tests/mdsrate-lookup-10dirs.sh @@ -66,11 +66,13 @@ else --ndirs ${NUM_DIRS} --dirfmt '${DIRfmt}' --nfiles ${NUM_FILES} --filefmt 'f%%d'" - echo "+" ${COMMAND} - # For files creation we can use -np equal to NUM_DIRS - # This is just a test preparation, does not matter how many threads we use for files creation; - # we just should be aware that NUM_DIRS is less than or equal to the number of threads np - mpi_run -np ${NUM_DIRS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1 + echo "+" ${COMMAND} + # For files creation we can use -np equal to NUM_DIRS + # This is just a test preparation, does not matter how many threads we + # use for files creation; we just should be aware that NUM_DIRS is less + # than or equal to the number of threads np + mpi_run -np ${NUM_DIRS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \ + ${COMMAND} 2>&1 # No lookup if error occurs on file creation, abort. [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting" @@ -84,9 +86,10 @@ COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --lookup --time ${TIME_PERIOD} ${SEED_OPTIO if [ -n "$NOSINGLE" ]; then echo "NO Test for lookups on a single client." 
 else
-   log "===== $0 ### 1 NODE LOOKUPS ###"
-   echo "+" ${COMMAND}
-   mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        log "===== $0 ### 1 NODE LOOKUPS ###"
+        echo "+" ${COMMAND}
+        mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} |
+                tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
@@ -99,9 +102,10 @@ fi
 if [ -n "$NOMULTI" ]; then
    echo "NO test for lookups on multiple nodes."
 else
-   log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
-   echo "+" ${COMMAND}
-   mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
+        echo "+" ${COMMAND}
+        mpi_run -np ${NUM_CLIENTS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} | tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
diff --git a/lustre/tests/mdsrate-lookup-1dir.sh b/lustre/tests/mdsrate-lookup-1dir.sh
index 29ea5f3..ac25e72 100644
--- a/lustre/tests/mdsrate-lookup-1dir.sh
+++ b/lustre/tests/mdsrate-lookup-1dir.sh
@@ -63,8 +63,9 @@ else
    fi
    COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --mknod --dir ${TESTDIR}
        --nfiles ${NUM_FILES} --filefmt 'f%%d'"
-   echo "+" ${COMMAND}
-   mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
+        echo "+" ${COMMAND}
+        mpi_run -np ${NUM_THREADS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} 2>&1
 
    # No lockup if error occurs on file creation, abort.
    [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
@@ -77,9 +78,10 @@ COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --lookup --time ${TIME_PERIOD} ${SEED_OPTIO
 if [ -n "$NOSINGLE" ]; then
    echo "NO Test for lookups on a single client."
 else
-   log "===== $0 ### 1 NODE LOOKUPS ###"
-   echo "+" ${COMMAND}
-   mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        log "===== $0 ### 1 NODE LOOKUPS ###"
+        echo "+" ${COMMAND}
+        mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} |
+                tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
@@ -92,9 +94,10 @@ fi
 if [ -n "$NOMULTI" ]; then
    echo "NO test for lookups on multiple nodes."
 else
-   log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
-   echo "+" ${COMMAND}
-   mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        log "===== $0 ### ${NUM_CLIENTS} NODES LOOKUPS ###"
+        echo "+" ${COMMAND}
+        mpi_run -np ${NUM_CLIENTS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} | tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
diff --git a/lustre/tests/mdsrate-stat-large.sh b/lustre/tests/mdsrate-stat-large.sh
index 9fd2c08..19c4375 100644
--- a/lustre/tests/mdsrate-stat-large.sh
+++ b/lustre/tests/mdsrate-stat-large.sh
@@ -68,9 +68,10 @@ else
        NUM_THREADS=$NUM_CLIENTS
    fi
 
-   mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
-   [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
-   
+        mpi_run -np ${NUM_THREADS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} 2>&1
+        [ ${PIPESTATUS[0]} != 0 ] &&
+                error "mdsrate file creation failed, aborting"
 fi
 
 COMMAND="${MDSRATE} ${MDSRATE_DEBUG} --stat --time ${TIME_PERIOD}
@@ -83,7 +84,8 @@ else
 
    log "===== $0 ### 1 NODE STAT ###"
    echo "+" ${COMMAND}
-   mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} |
+                tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
@@ -99,7 +101,8 @@ else
 
    log "===== $0 ### ${NUM_CLIENTS} NODES STAT ###"
    echo "+" ${COMMAND}
-   mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        mpi_run -np ${NUM_CLIENTS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} | tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
diff --git a/lustre/tests/mdsrate-stat-small.sh b/lustre/tests/mdsrate-stat-small.sh
index f7b84c1..0d75135 100644
--- a/lustre/tests/mdsrate-stat-small.sh
+++ b/lustre/tests/mdsrate-stat-small.sh
@@ -68,8 +68,10 @@ else
        NUM_THREADS=$NUM_CLIENTS
    fi
 
-   mpi_run -np ${NUM_THREADS} -machinefile ${MACHINEFILE} ${COMMAND} 2>&1
-   [ ${PIPESTATUS[0]} != 0 ] && error "mdsrate file creation failed, aborting"
+        mpi_run -np ${NUM_THREADS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} 2>&1
+        [ ${PIPESTATUS[0]} != 0 ] &&
+                error "mdsrate file creation failed, aborting"
 
 fi
 
@@ -83,8 +85,9 @@ else
 
    log "===== $0 ### 1 NODE STAT ###"
    echo "+" ${COMMAND}
-   mpi_run -np 1 -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
-   
+        mpi_run -np 1 ${MACHINEFILE_OPTION} ${MACHINEFILE} ${COMMAND} |
+                tee ${LOG}
+
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
        error "mdsrate on a single client failed, aborting"
@@ -99,7 +102,8 @@ else
 
    log "===== $0 ### ${NUM_CLIENTS} NODES STAT ###"
    echo "+" ${COMMAND}
-   mpi_run -np ${NUM_CLIENTS} -machinefile ${MACHINEFILE} ${COMMAND} | tee ${LOG}
+        mpi_run -np ${NUM_CLIENTS} ${MACHINEFILE_OPTION} ${MACHINEFILE} \
+                ${COMMAND} | tee ${LOG}
 
    if [ ${PIPESTATUS[0]} != 0 ]; then
        [ -f $LOG ] && sed -e "s/^/log: /" $LOG
diff --git a/lustre/tests/metadata-updates.sh b/lustre/tests/metadata-updates.sh
index 66d9d8e..41a1849 100755
--- a/lustre/tests/metadata-updates.sh
+++ b/lustre/tests/metadata-updates.sh
@@ -251,11 +251,12 @@ check_dir_contents $(($NUM_FILES / 2 + 1)) ||
 # "write_disjoint" test
 echo "Part 5. write_disjoint test: see lustre/tests/mpi/write_disjoint.c for details"
 if [ -f "$WRITE_DISJOINT" ]; then
-   set $TRACE
-   MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
-   generate_machine_file $NODES_TO_USE $MACHINEFILE
-   mpi_run -np $(get_node_count ${NODES_TO_USE//,/ }) -machinefile $MACHINEFILE \
-       $WRITE_DISJOINT -f $WRITE_DISJOINT_FILE -n $NUMLOOPS || STATUS=1
+        set $TRACE
+        MACHINEFILE=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
+        generate_machine_file $NODES_TO_USE $MACHINEFILE
+        mpi_run -np $(get_node_count ${NODES_TO_USE//,/ }) \
+                ${MACHINEFILE_OPTION} $MACHINEFILE $WRITE_DISJOINT \
+                -f $WRITE_DISJOINT_FILE -n $NUMLOOPS || STATUS=1
 else
    skip_env "$0 : write_disjoint not found "
 fi
diff --git a/lustre/tests/run_IOR.sh b/lustre/tests/run_IOR.sh
index 9f8f816..dd0c831 100755
--- a/lustre/tests/run_IOR.sh
+++ b/lustre/tests/run_IOR.sh
@@ -44,8 +44,9 @@ while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
    # need this only if TESTDIR is not default
    chmod -R 777 $TESTDIR
 
-   mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) -machinefile ${MACHINEFILE} \
-       $IOR -a POSIX -b 1g -o $TESTDIR/IOR-file -s 1 -t 1m -v -w -r 1>$LOG &
+        mpi_run -np $((NUM_CLIENTS * THREADS_PER_CLIENT)) \
+                ${MACHINEFILE_OPTION} ${MACHINEFILE} $IOR -a POSIX -b 1g \
+                -o $TESTDIR/IOR-file -s 1 -t 1m -v -w -r 1>$LOG &
    load_pid=$!
    wait $load_pid
    if [ ${PIPESTATUS[0]} -eq 0 ]; then
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 510109d..f47f3ab 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -4855,10 +4855,11 @@ get_mds_dir () {
 }
 
 mdsrate_cleanup () {
-   if [ -d $4 ]; then
-       mpi_run -np $1 -machinefile $2 ${MDSRATE} --unlink --nfiles $3 --dir $4 --filefmt $5 $6
-       rmdir $4
-   fi
+        if [ -d $4 ]; then
+                mpi_run -np $1 ${MACHINEFILE_OPTION} $2 ${MDSRATE} --unlink \
+                        --nfiles $3 --dir $4 --filefmt $5 $6
+                rmdir $4
+        fi
 }
 
 delayed_recovery_enabled () {
-- 
1.8.3.1
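
Usage note: with this change applied, the host-file flag is taken from
MACHINEFILE_OPTION (default "-machinefile", set in cfg/local.sh), so a site
whose mpirun expects a different spelling only needs to export the variable
before running the tests. A minimal sketch of such a wrapper follows; it
assumes that `mpirun --version` identifies the MPI stack (true for common
Open MPI and MPICH builds, but verify locally) and that the local Open MPI
accepts the "-hostfile" spelling; the script name at the end is just one
example of a test that calls mpi_run:

	#!/bin/sh
	# Pick the machinefile option matching the installed MPI stack,
	# then run one of the Lustre test scripts that call mpi_run.
	if mpirun --version 2>/dev/null | grep -qi 'open mpi'; then
		# Open MPI spells the option -hostfile (assumption; check
		# `mpirun --help` on your system)
		export MACHINEFILE_OPTION="-hostfile"
	else
		# MPICH/MVAPICH use -machinefile, which matches the default
		export MACHINEFILE_OPTION="-machinefile"
	fi
	sh lustre/tests/mdsrate-create-small.sh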