# Suite preamble: set defaults, source the test framework and configuration,
# and exit early when the environment cannot run these tests.
# ALWAYS_EXCEPT is taken from LARGE_SCALE_EXCEPT (presumably the list of tests
# the framework always skips -- confirm in test-framework.sh).
6 ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT"
# PTLDEBUG defaults to -1 when unset or empty.
9 PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE defaults to the parent of the directory containing this script.
10 LUSTRE=${LUSTRE:-`dirname $0`/..}
12 CLEANUP=${CLEANUP:-""}
13 . $LUSTRE/tests/test-framework.sh
# ':=' both sources the config and assigns CONFIG its default when it is
# unset or empty.
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Exit successfully (logged as a skip) when remote_mds_nodsh succeeds.
19 remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
# The suite requires a non-empty CLIENTS list and CLIENTCOUNT of at least 2.
21 [ -n "$CLIENTS" ] || { skip "$0: Need two or more clients" && exit 0; }
22 [ $CLIENTCOUNT -ge 2 ] || \
23 { skip "$0: Need two or more clients, have $CLIENTCOUNT" && exit 0; }
# With SLOW=no, EXCEPT_SLOW is cleared.
26 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
31 check_and_setup_lustre
# Remove leftover entries under $DIR whose names match [df][0-9]*.
32 rm -rf $DIR/[df][0-9]*
# Start the lctl debug daemon only when DAEMONFILE is set.
34 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
# Look for version_recovery in the clients' mdc connect_flags.
# NOTE(review): the lines joining this check to the skip group below are not
# visible in this view; presumably the suite is skipped when the flag is absent.
38 do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery"
42 { skip "$0: no version_recovery" && exit 0; }
# Upper bound for the fake-export counts used by the VBR tests: defaults to
# 1000, and is forced to 100 when SLOW=no (overriding any preset value).
44 FAKE_NUM_MAX=${FAKE_NUM_MAX:-1000}
45 [ "$SLOW" = "no" ] && FAKE_NUM_MAX=100
# Timing fragment; the enclosing function header and the timed command are not
# visible in this view (presumably the body of do_and_time -- confirm).
# Timestamps are seconds since the epoch (date +%s).
50 local start_ts=`date +%s`
54 local current_ts=`date +%s`
# ELAPSED is not declared local here, so callers can read it afterwards.
55 ELAPSED=`expr $current_ts - $start_ts`
56 echo "===== START $start_ts CURRENT $current_ts"
# delete_fake_exports: expire every stale export on the MDS, flush them, and
# restore the original stale_export_age.
# Uses mds_svc and NEW_AGE, neither of which is assigned in the visible lines.
# Calls error when the number of EXPIRED entries differs from the number of
# stale exports counted at entry.
# NOTE(review): the closing lines of this function are not visible in this view.
59 delete_fake_exports () {
# Number of lines currently reported by stale_exports.
60 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
# Save the current age limit so it can be restored at the end.
62 OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
64 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
# Wait 3 seconds longer than the new age limit before checking.
65 sleep $((NEW_AGE + 3))
# Count the stale_exports lines that contain EXPIRED.
66 EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
67 [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
69 do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
70 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
# Test 1b body (the function header, loop end and closing brace are not
# visible in this view): for each fake-export count, remount only CLIENT1,
# create the fake exports on the MDS, then time how long mounting all CLIENTS
# takes and report it.
# Returns 0 after a skip when delayed recovery is not supported.
74 delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
79 for FAKE_NUM in 10 $FAKE_NUM_MAX; do
80 zconf_umount_clients $CLIENTS $DIR
81 zconf_mount $CLIENT1 $DIR
# Stale-export count before creating the fake ones (only used in the log).
83 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
85 log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
86 create_fake_exports mds $FAKE_NUM
87 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
# Fail only when fewer exports exist than requested; the message names the
# comparison that is actually performed (-lt).
88 [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
89 echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
# do_and_time leaves the measured duration in ELAPSED.
90 do_and_time "zconf_mount_clients $CLIENTS $DIR"
91 echo "==== $TESTNAME ===== CONNECTION TIME $ELAPSED: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
93 # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
99 run_test 1b "VBR: connect $CLIENTCOUNT clients with delayed exports"
101 # One more helper function is needed for MDS failover:
102 # the fail function does not do df on all clients
# Test 1c body (the function header, the failover step, loop end and closing
# brace are not visible in this view): with all CLIENTS mounted, create fake
# exports on the MDS, create files from every client, and report the time
# elapsed since RECOVERY_START_TIME (set on a line not visible here).
# Returns 0 after a skip when delayed recovery is not supported.
109 delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
111 zconf_mount_clients $CLIENTS $DIR
113 # sanity mds fail (to exclude the results on fresh formatted fs)
121 for FAKE_NUM in 10 $FAKE_NUM_MAX; do
# Stale-export count before creating the fake ones (only used in the log).
123 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
125 log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
126 create_fake_exports mds $FAKE_NUM
127 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
# Fail only when fewer exports exist than requested; the message names the
# comparison that is actually performed (-lt).
128 [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
129 echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
# The escaped \$(hostname) is expanded on each client, so every client works
# on its own file-name prefix.
132 do_nodes $CLIENTS "createmany -o $DIR/$tfile-\\\$(hostname)" 25
133 # XXX For FAILURE_MODE=HARD it is better to exclude
134 # shutdown_facet and reboot_facet time
137 local current_ts=`date +%s`
138 local elapsed=`expr $current_ts - $RECOVERY_START_TIME`
# Remove the files created above.
140 do_nodes $CLIENTS "unlinkmany $DIR/$tfile-\\\$(hostname) 25"
141 echo "==== $TESTNAME ===== RECOVERY TIME $elapsed: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
143 # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
149 run_test 1c "VBR: recovery $CLIENTCOUNT clients with delayed exports"
# Test 1d body (the function header, loop end and closing brace are not
# visible in this view): for each fake-export count, create the fake exports,
# let them expire by lowering stale_export_age, then time how long mounting
# all CLIENTS takes. NEW_AGE is assigned on a line not visible here.
# Returns 0 after a skip when delayed recovery is not supported.
152 delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
157 for FAKE_NUM in 10 $FAKE_NUM_MAX; do
158 zconf_umount_clients $CLIENTS $DIR
159 zconf_mount $CLIENT1 $DIR
# Stale-export count before creating the fake ones (only used in the log).
161 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
163 log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
164 create_fake_exports mds $FAKE_NUM
165 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
166 [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
167 echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
# Save the current age limit; it is restored after the timed mount.
169 OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
170 echo OLD_AGE=$OLD_AGE
172 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
# Wait 3 seconds longer than the new age limit, then require every stale
# export line to contain EXPIRED.
173 sleep $((NEW_AGE + 3))
174 EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
175 [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
# do_and_time leaves the measured duration in ELAPSED.
177 do_and_time "zconf_mount_clients $CLIENTS $DIR"
# NOTE(review): unlike the other tests' report lines, there is no space
# between $TESTNAME and '=====' here -- confirm whether that is intended.
178 echo "==== $TESTNAME===== CONNECTION TIME $ELAPSED: expired FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
180 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
185 run_test 1d "VBR: expire exports, connect $CLIENTCOUNT clients"
186 # VBR scale tests end
# Test 3a body (the function header, several setup lines, the failover step
# and the closing lines are not visible in this view): for a growing number of
# clients, run an mdsrate create workload in the background, wait for MDS
# recovery to complete, and collect the reported recovery_duration.
# Requires CLIENTS, MDSRATE and MPIRUN (checked by assert_env).
189 assert_env CLIENTS MDSRATE MPIRUN
# Split the comma-separated CLIENTS list into an array of node names.
191 local -a nodes=(${CLIENTS//,/ })
193 # INCREMENT is the number of clients added at each step;
194 # it defaults to half of the clients
195 increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))}
197 machinefile=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
198 local LOG=$TMP/${TESTSUITE}_$tfile
# ${!var} is an indirect expansion; 'var' is assigned on a line not visible here.
201 local procfile="*.${!var}.recovery_status"
202 local iters=${ITERS:-3}
203 local nfiles=${NFILES:-50000}
204 local nthreads=${THREADS_PER_CLIENT:-3}
# Cap nfiles at the value reported by inodes_available.
206 local IFree=$(inodes_available)
207 [ $IFree -gt $nfiles ] || nfiles=$IFree
219 while [ $num -le $CLIENTCOUNT ]; do
# Use the first $num nodes for this step.
220 list=$(comma_list ${nodes[@]:0:$num})
222 generate_machine_file $list $machinefile ||
223 { error "can not generate machinefile"; exit 1; }
225 for i in $(seq $iters); do
# Clean up files from a previous iteration before creating new ones.
226 mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore
228 COMMAND="${MDSRATE} --create --nfiles $nfiles --dir $dir --filefmt 'f%%d'"
# Run the create workload in the background with nthreads processes per
# client, copying its output to $LOG.
229 mpi_run -np $((num * nthreads)) -machinefile $machinefile ${COMMAND} | tee ${LOG} &
234 # 2 threads 100000 creates 117 secs
237 log "$i : Starting failover on mds"
# Allow up to ten times TIMEOUT for recovery to complete.
239 if ! wait_recovery_complete mds $((TIMEOUT * 10)); then
240 echo "mds recovery is not completed!"
# Extract the recovery_duration line from the MDS recovery_status.
245 duration=$(do_facet mds lctl get_param -n $procfile | grep recovery_duration)
# res accumulates (client count, duration) pairs for the final summary.
247 res=( "${res[@]}" "$num" )
248 res=( "${res[@]}" "$duration" )
249 echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration"
253 num=$((num + increment))
256 mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore
# Print the summary: res[i] is a client count, res[i+1] its duration.
259 while [ $i -lt ${#res[@]} ]; do
260 echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]} ${res[i+1]}"
265 run_test 3a "recovery time, $CLIENTCOUNT clients"
# Suite teardown: announce completion, clean up the Lustre setup, and print
# the suite log when it exists.
267 equals_msg `basename $0`: test complete, cleaning up
268 check_and_cleanup_lustre
# The trailing '|| true' keeps the exit status 0 when the log file is absent.
269 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true