Whamcloud - gitweb
LU-12043 llite: make sure readahead cover current read
[fs/lustre-release.git] / lustre / tests / large-scale.sh
#!/bin/bash
# -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
# vim:shiftwidth=4:softtabstop=4:tabstop=4:
#
# Large-scale test suite setup: loads the test framework and configuration,
# skips the suite unless at least two clients are available, prepares the
# filesystem and resolves the MPI user before the tests are defined.

set -e

# bug number:
ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT"

SAVE_PWD=$PWD
PTLDEBUG=${PTLDEBUG:--1}
LUSTRE=${LUSTRE:-$(dirname "$0")/..}
SETUP=${SETUP:-""}
CLEANUP=${CLEANUP:-""}
. "$LUSTRE/tests/test-framework.sh"

# Quote "$@" so arguments containing whitespace reach the framework intact.
init_test_env "$@"

# CONFIG defaults to the per-configuration file selected by $NAME.
. "${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}"
init_logging

remote_mds_nodsh && skip "remote MDS with nodsh"

[ -z "$CLIENTS" ] && skip_env "$TESTSUITE: Need two or more clients"
[ "$CLIENTCOUNT" -lt 2 ] &&
	skip_env "$TESTSUITE: Need 2+ clients, have only $CLIENTCOUNT"

[ "$SLOW" = "no" ] && EXCEPT_SLOW=""

MOUNT_2=""
build_test_filter

check_and_setup_lustre
# ${DIR:?} aborts instead of letting the glob expand under "/" if DIR is
# unset or empty.
rm -rf "${DIR:?}"/[df][0-9]*

get_mpiuser_id $MPI_USER
MPI_RUNAS=${MPI_RUNAS:-"runas -u $MPI_USER_UID -g $MPI_USER_GID"}
# $MPI_RUNAS is left unquoted on purpose: it is a command line that must be
# split into words.
$GSS_KRB5 && refresh_krb5_tgt $MPI_USER_UID $MPI_USER_GID $MPI_RUNAS

[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
# test_3a: record MDS recovery time for a growing number of clients.
#
# Starting with INCREMENT clients and adding INCREMENT more each round (while
# the count does not exceed CLIENTCOUNT), every round runs ITERS iterations of:
#   - start an "mdsrate --create" job across the selected clients,
#   - fail over $SINGLEMDS while that job is running,
#   - wait for recovery to complete and record the recovery_duration line
#     read from the MDS recovery_status parameter.
# Every recorded duration is printed once when measured and again in a
# summary at the end.
#
# Environment:
#   CLIENTS, MDSRATE, MPIRUN  required (checked with assert_env)
#   INCREMENT           clients added per round (default: CLIENTCOUNT / 2)
#   MACHINEFILE         machine file path (default: $TMP/$TESTSUITE.machines)
#   ITERS               iterations per round (default: 3)
#   NFILES              files per create job (default: 50000, lowered to
#                       the value of inodes_available when that is smaller)
#   THREADS_PER_CLIENT  MPI processes per client (default: 3)
#
# Exits with 1 on an invalid INCREMENT or machine file generation failure,
# and with 7 if recovery does not complete in TIMEOUT * 10.
test_3a() {
	assert_env CLIENTS MDSRATE MPIRUN

	local -a nodes=(${CLIENTS//,/ })

	# INCREMENT is the number of clients added per round,
	# a half of the clients by default.
	local increment=${INCREMENT:-$((CLIENTCOUNT / 2))}

	# A step below 1 would never advance $num, so the round loop below
	# would never terminate.
	if ! [ "$increment" -ge 1 ] 2>/dev/null; then
		error "INCREMENT must be a positive integer, got '$increment'"
		exit 1
	fi

	local machinefile=${MACHINEFILE:-$TMP/$TESTSUITE.machines}
	local LOG=$TMP/${TESTSUITE}_$tfile

	# ${SINGLEMDS}_svc names the variable holding the MDS service name.
	local var=${SINGLEMDS}_svc
	local procfile="*.${!var}.recovery_status"
	local iters=${ITERS:-3}
	local nfiles=${NFILES:-50000}
	local nthreads=${THREADS_PER_CLIENT:-3}

	# Do not ask for more files than inodes_available reports.
	local IFree=$(inodes_available)
	[ "$IFree" -gt "$nfiles" ] || nfiles=$IFree

	local dir=$DIR/d0.$TESTNAME
	mkdir -p "$dir"
	chmod 0777 "$dir"

	local pid
	local list
	local duration
	local i
	local COMMAND
	local -a res=()

	local num=$increment
	# Client count of the most recent round that actually ran (0 = none).
	local last_num=0

	while [ $num -le $CLIENTCOUNT ]; do
		list=$(comma_list ${nodes[@]:0:$num})

		generate_machine_file $list $machinefile ||
			{ error "can not generate machinefile"; exit 1; }
		last_num=$num

		for ((i = 1; i <= iters; i++)); do
			# Remove files left over from the previous iteration.
			mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' \
				--ignore

			# Kept as one string that is word-split on use.
			COMMAND="${MDSRATE} --create --nfiles $nfiles --dir $dir"
			COMMAND+=" --filefmt 'f%%d'"
			# NOTE(review): $! after a backgrounded pipeline is the
			# PID of its last command (tee), so the kill and wait
			# below act on tee rather than on the MPI job itself.
			mpi_run ${MACHINEFILE_OPTION} $machinefile \
				-np $((num * nthreads)) ${COMMAND} | tee ${LOG}&

			pid=$!
			echo "pid=$pid"

			# Give the create job a head start before failing over
			# (original note: 2 threads 100000 creates 117 secs).
			sleep 20

			log "$i : Starting failover on $SINGLEMDS"
			facet_failover $SINGLEMDS
			if ! wait_recovery_complete $SINGLEMDS \
			     $((TIMEOUT * 10)); then
				echo "$SINGLEMDS recovery is not completed!"
				kill -9 $pid
				exit 7
			fi

			duration=$(do_facet $SINGLEMDS lctl get_param -n \
				$procfile | grep recovery_duration)

			# res holds (client count, duration) pairs.
			res+=("$num" "$duration")
			echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration"
			wait $pid
		done
		num=$((num + increment))
	done

	# $num has already been stepped past the last round here, so clean up
	# with the client count that matches the machine file generated last.
	# Nothing was created if no round ran.
	if [ $last_num -gt 0 ]; then
		mdsrate_cleanup $last_num $machinefile $nfiles $dir 'f%%d' \
			--ignore
	fi

	# Summary: replay every (client count, duration) pair.
	for ((i = 0; i < ${#res[@]}; i += 2)); do
		echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]}  ${res[i+1]}"
	done
}
# Register test 3a with the framework; the description includes the number
# of clients in use.
run_test 3a "recovery time, $CLIENTCOUNT clients"

# Hand the elapsed run time to the framework, tear the filesystem down and
# finish via exit_status (presumably exits with the suite result; see
# test-framework.sh).
complete "$SECONDS"
check_and_cleanup_lustre
exit_status