X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Flarge-scale.sh;h=7973edb1e2778dc2b7ed1a6f74168cd550c14474;hb=72c1f7095203cc1badadf581c66f9546476438ab;hp=6792612efe2573bf25114b9583a5fc9fb5ecca2f;hpb=5d37670e8507563db556879041c7992936aefa56;p=fs%2Flustre-release.git diff --git a/lustre/tests/large-scale.sh b/lustre/tests/large-scale.sh index 6792612..7973edb 100644 --- a/lustre/tests/large-scale.sh +++ b/lustre/tests/large-scale.sh @@ -1,35 +1,27 @@ #!/bin/bash -# -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*- -# vim:shiftwidth=4:softtabstop=4:tabstop=4: set -e -# bug number: -ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT" - -SAVE_PWD=$PWD PTLDEBUG=${PTLDEBUG:--1} -LUSTRE=${LUSTRE:-`dirname $0`/..} SETUP=${SETUP:-""} CLEANUP=${CLEANUP:-""} -. $LUSTRE/tests/test-framework.sh +LUSTRE=${LUSTRE:-$(dirname $0)/..} +. $LUSTRE/tests/test-framework.sh init_test_env $@ - -. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} init_logging -remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 +ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT " -[ -n "$CLIENTS" ] || { skip_env "$0: Need two or more clients" && exit 0; } -[ $CLIENTCOUNT -ge 2 ] || \ - { skip_env "$0: Need two or more remote clients, have $CLIENTCOUNT" && exit 0; } +build_test_filter + +remote_mds_nodsh && skip "remote MDS with nodsh" -# -[ "$SLOW" = "no" ] && EXCEPT_SLOW="" +[ -z "$CLIENTS" ] && skip_env "$TESTSUITE: Need two or more clients" +[ $CLIENTCOUNT -lt 2 ] && + skip_env "$TESTSUITE: Need 2+ clients, have only $CLIENTCOUNT" MOUNT_2="" -build_test_filter check_and_setup_lustre rm -rf $DIR/[df][0-9]* @@ -41,84 +33,81 @@ $GSS_KRB5 && refresh_krb5_tgt $MPI_USER_UID $MPI_USER_GID $MPI_RUNAS [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE test_3a() { - assert_env CLIENTS MDSRATE MPIRUN - - local -a nodes=(${CLIENTS//,/ }) - - # INCREMENT is a number of clients - # a half of clients by default - increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))} - - machinefile=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines} - local LOG=$TMP/${TESTSUITE}_$tfile - - local var=${SINGLEMDS}_svc - local procfile="*.${!var}.recovery_status" - local iters=${ITERS:-3} - local nfiles=${NFILES:-50000} - local nthreads=${THREADS_PER_CLIENT:-3} - - local IFree=$(inodes_available) - [ $IFree -gt $nfiles ] || nfiles=$IFree - - local dir=$DIR/d0.$TESTNAME - mkdir -p $dir - chmod 0777 $dir - - local pid - local list - local -a res - - local num=$increment - - while [ $num -le $CLIENTCOUNT ]; do - list=$(comma_list ${nodes[@]:0:$num}) - - generate_machine_file $list $machinefile || - { error "can not generate machinefile"; exit 1; } - - for i in $(seq $iters); do - mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore - - COMMAND="${MDSRATE} --create --nfiles $nfiles --dir $dir --filefmt 'f%%d'" - mpi_run -np $((num * nthreads)) -machinefile $machinefile ${COMMAND} | tee ${LOG} & - - pid=$! - echo "pid=$pid" - - # 2 threads 100000 creates 117 secs - sleep 20 - - log "$i : Starting failover on $SINGLEMDS" - facet_failover $SINGLEMDS - if ! wait_recovery_complete $SINGLEMDS $((TIMEOUT * 10)); then - echo "$SINGLEMDS recovery is not completed!" - kill -9 $pid - exit 7 - fi - - duration=$(do_facet $SINGLEMDS lctl get_param -n $procfile | grep recovery_duration) - - res=( "${res[@]}" "$num" ) - res=( "${res[@]}" "$duration" ) - echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration" - wait $pid - - done - num=$((num + increment)) - done - - mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore - - i=0 - while [ $i -lt ${#res[@]} ]; do - echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]} ${res[i+1]}" - i=$((i+2)) - done + assert_env CLIENTS MDSRATE MPIRUN + + local -a nodes=(${CLIENTS//,/ }) + # INCREMENT is a number of clients a half of clients by default + local increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))} + local num=$increment + local LOG=$TMP/${TESTSUITE}_$tfile + local var=${SINGLEMDS}_svc + local procfile="*.${!var}.recovery_status" + local iters=${ITERS:-3} + local nfiles=${NFILES:-50000} + local nthreads=${THREADS_PER_CLIENT:-3} + local IFree=$(inodes_available) + local pid + local list + local -a res + local dir=$DIR/d0.$TESTNAME + + [ $IFree -gt $nfiles ] || nfiles=$IFree + + mkdir -p $dir + chmod 0777 $dir + + while [ $num -le $CLIENTCOUNT ]; do + list=$(comma_list ${nodes[@]:0:$num}) + + generate_machine_file $list $MACHINEFILE || + error "can not generate machinefile" + + for i in $(seq $iters); do + mdsrate_cleanup $num $MACHINEFILE $nfiles $dir 'f%%d' \ + --ignore + + COMMAND="${MDSRATE} --create --nfiles $nfiles --dir + $dir --filefmt 'f%%d'" + mpi_run ${MACHINEFILE_OPTION} $MACHINEFILE \ + -np $((num * nthreads)) ${COMMAND} | tee ${LOG}& + + pid=$! + echo "pid=$pid" + + # 2 threads 100000 creates 117 secs + sleep 20 + + log "$i : Starting failover on $SINGLEMDS" + facet_failover $SINGLEMDS + if ! wait_recovery_complete $SINGLEMDS \ + $((TIMEOUT * 10)); then + echo "$SINGLEMDS recovery is not completed!" + kill -9 $pid + exit 7 + fi + + duration=$(do_facet $SINGLEMDS lctl get_param -n \ + $procfile | grep recovery_duration) + + res=( "${res[@]}" "$num" ) + res=( "${res[@]}" "$duration" ) + echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration" + wait $pid + done + num=$((num + increment)) + done + + mdsrate_cleanup $num $MACHINEFILE $nfiles $dir 'f%%d' --ignore + + i=0 + while [ $i -lt ${#res[@]} ]; do + echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]} ${res[i+1]}" + i=$((i+2)) + done } run_test 3a "recovery time, $CLIENTCOUNT clients" -complete $(basename $0) $SECONDS +complete $SECONDS check_and_cleanup_lustre exit_status