Whamcloud - gitweb
LU-13773 tests: subscript failure propagation
[fs/lustre-release.git] / lustre / tests / large-scale.sh
#!/bin/bash
#
# Suite prelude: load the test framework, skip the suite when the
# environment is unsuitable (MDS not reachable via dsh, fewer than two
# clients), run check_and_setup_lustre and prepare the MPI user settings
# used by the tests below.

set -e

PTLDEBUG=${PTLDEBUG:--1}
SETUP=${SETUP:-""}
CLEANUP=${CLEANUP:-""}

# "$0", "$LUSTRE" and "$@" are quoted so that a script path or an argument
# containing whitespace is passed through intact instead of being re-split.
LUSTRE=${LUSTRE:-$(dirname "$0")/..}
. "$LUSTRE/tests/test-framework.sh"
init_test_env "$@"
init_logging

ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT "

build_test_filter

remote_mds_nodsh && skip "remote MDS with nodsh"

[ -z "$CLIENTS" ] && skip_env "$TESTSUITE: Need two or more clients"
[ "$CLIENTCOUNT" -lt 2 ] &&
        skip_env "$TESTSUITE: Need 2+ clients, have only $CLIENTCOUNT"

MOUNT_2=""

check_and_setup_lustre
# ${DIR:?} aborts the script if DIR is unset or empty, so the glob can never
# expand to /[df][0-9]* at the filesystem root. The glob itself must stay
# unquoted.
rm -rf "${DIR:?}"/[df][0-9]*

get_mpiuser_id $MPI_USER
MPI_RUNAS=${MPI_RUNAS:-"runas -u $MPI_USER_UID -g $MPI_USER_GID"}
# GSS_KRB5 is executed as a command; MPI_RUNAS is intentionally left unquoted
# so that it is split into separate words.
$GSS_KRB5 && refresh_krb5_tgt $MPI_USER_UID $MPI_USER_GID $MPI_RUNAS

[ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
34
# Measure MDS recovery time while clients are creating files with mdsrate.
#
# Starting with $INCREMENT clients (default: half of $CLIENTCOUNT) and adding
# $INCREMENT clients per round up to $CLIENTCOUNT, repeat $ITERS times
# (default 3): start an mdsrate create load in the background, fail over
# $SINGLEMDS, wait for recovery to complete, and record the
# "recovery_duration" line read from the MDS recovery_status parameter.
# Every recorded result is printed again once all rounds are done.
#
# Environment:
#   CLIENTS, MDSRATE, MPIRUN          required (checked with assert_env)
#   INCREMENT, ITERS, NFILES,
#   THREADS_PER_CLIENT                optional tunables
# Exits with status 7 when wait_recovery_complete reports failure.
test_3a() {
        assert_env CLIENTS MDSRATE MPIRUN

        local -a nodes=(${CLIENTS//,/ })
        # INCREMENT is the number of clients added per round; half of the
        # clients by default
        local increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))}
        local num=$increment
        local LOG=$TMP/${TESTSUITE}_$tfile
        local var=${SINGLEMDS}_svc
        local procfile="*.${!var}.recovery_status"
        local iters=${ITERS:-3}
        local nfiles=${NFILES:-50000}
        local nthreads=${THREADS_PER_CLIENT:-3}
        local IFree=$(inodes_available)
        local pid
        local list
        local -a res=()
        local dir=$DIR/d0.$TESTNAME
        # Keep the loop counter and the per-iteration result out of the
        # caller's scope.
        local i
        local duration

        # A zero, negative or non-numeric increment would keep
        # num <= CLIENTCOUNT forever in the loop below.
        [ "$increment" -gt 0 ] 2>/dev/null ||
                error "INCREMENT must be a positive integer, got '$increment'"

        # Never try to create more files than there are inodes available.
        [ $IFree -gt $nfiles ] || nfiles=$IFree

        mkdir -p $dir
        chmod 0777 $dir

        while [ $num -le $CLIENTCOUNT ]; do
                # Use the first $num clients for this round.
                list=$(comma_list ${nodes[@]:0:$num})

                generate_machine_file $list $MACHINEFILE ||
                        error "can not generate machinefile"

                for i in $(seq $iters); do
                        mdsrate_cleanup $num $MACHINEFILE $nfiles $dir 'f%%d' \
                                --ignore

                        COMMAND="${MDSRATE} --create --nfiles $nfiles --dir
                                 $dir --filefmt 'f%%d'"
                        mpi_run ${MACHINEFILE_OPTION} $MACHINEFILE \
                                -np $((num * nthreads)) ${COMMAND} | tee ${LOG}&

                        # NOTE(review): for a background pipeline $! is the
                        # PID of the last stage (tee), so the kill and wait
                        # below act on tee rather than on mpi_run itself.
                        pid=$!
                        echo "pid=$pid"

                        # 2 threads 100000 creates 117 secs
                        sleep 20

                        log "$i : Starting failover on $SINGLEMDS"
                        facet_failover $SINGLEMDS
                        if ! wait_recovery_complete $SINGLEMDS \
                             $((TIMEOUT * 10)); then
                                echo "$SINGLEMDS recovery is not completed!"
                                kill -9 $pid
                                exit 7
                        fi

                        duration=$(do_facet $SINGLEMDS lctl get_param -n \
                                $procfile | grep recovery_duration)

                        # Results are stored as (client count, duration) pairs.
                        res+=( "$num" "$duration" )
                        echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration"
                        wait $pid
                done
                num=$((num + increment))
        done

        mdsrate_cleanup $num $MACHINEFILE $nfiles $dir 'f%%d' --ignore

        # Replay every recorded (client count, duration) pair as a summary.
        for (( i = 0; i < ${#res[@]}; i += 2 )); do
                echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]}  ${res[i+1]}"
        done
}
108
# Hand test 3a and its description to the framework's run_test.
run_test 3a "recovery time, ${CLIENTCOUNT} clients"

# Pass the shell's elapsed-seconds counter to complete, then run the
# framework's cleanup check and exit through exit_status.
complete "${SECONDS}"
check_and_cleanup_lustre
exit_status