# Suite bootstrap: set defaults, load the test framework and configuration,
# and exit early (status 0) when the environment cannot run these tests.
6 ALWAYS_EXCEPT="$LARGE_SCALE_EXCEPT"
# PTLDEBUG falls back to -1 when unset or empty.
9 PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE falls back to the parent of the directory holding this script.
10 LUSTRE=${LUSTRE:-`dirname $0`/..}
12 CLEANUP=${CLEANUP:-""}
13 . $LUSTRE/tests/test-framework.sh
# Source the config file; CONFIG is assigned the per-NAME default path when
# it is unset or empty (":=" expansion).
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, log the skip and exit with status 0.
19 remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
# Require a non-empty CLIENTS list.
21 [ -n "$CLIENTS" ] || { skip_env "$0: Need two or more clients" && exit 0; }
# Require CLIENTCOUNT to be at least 2.
22 [ $CLIENTCOUNT -ge 2 ] || \
23 { skip_env "$0: Need two or more remote clients, have $CLIENTCOUNT" && exit 0; }
# With SLOW=no the EXCEPT_SLOW list is cleared.
26 [ "$SLOW" = "no" ] && EXCEPT_SLOW=""
31 check_and_setup_lustre
# Remove entries under $DIR whose names start with d or f followed by a digit.
32 rm -rf $DIR/[df][0-9]*
# Start the lctl debug daemon only when DAEMONFILE is set.
34 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
# Look for version_recovery in the mdc connect flags on every client.
38 do_nodes $CLIENTS "$LCTL get_param mdc.*.connect_flags | grep version_recovery"
# NOTE(review): the condition that guards this skip is not visible in this
# excerpt; presumably it is the result of the version_recovery check above.
42 { skip_env "$0: no version_recovery" && exit 0; }
# Upper bound on fake exports: 1000 by default, forced to 100 when SLOW=no.
44 FAKE_NUM_MAX=${FAKE_NUM_MAX:-1000}
45 [ "$SLOW" = "no" ] && FAKE_NUM_MAX=100
# delete_fake_exports: expire every stale export on the MDS and flush them,
# then restore the original stale_export_age.
# NOTE(review): the NEW_AGE assignment and the closing brace of this function
# are outside the visible lines.
47 delete_fake_exports () {
# Number of lines currently reported by stale_exports.
48 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
# Remember the current age limit so it can be restored at the end.
50 OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
52 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
# Wait NEW_AGE plus 3 seconds before checking for expiry.
53 sleep $((NEW_AGE + 3))
# Count the stale_exports lines containing EXPIRED; it must equal NUM.
54 EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
55 [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
# Write 1 to flush_stale_exports, then put the saved age limit back.
57 do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
58 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
# Body of the test registered below as 1b (its function header is outside
# this excerpt): time how long it takes all clients to mount while the MDS
# holds FAKE_NUM fake stale exports. Skipped without delayed recovery support.
62 delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
# Two rounds: 10 fake exports first, then FAKE_NUM_MAX.
67 for FAKE_NUM in 10 $FAKE_NUM_MAX; do
# Unmount every client, then mount only CLIENT1 before creating the exports.
68 zconf_umount_clients $CLIENTS $DIR
69 zconf_mount $CLIENT1 $DIR
# Line count of stale_exports before the fake exports are added.
71 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
73 log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
74 create_fake_exports mds $FAKE_NUM
# Line count of stale_exports after the fake exports are added.
75 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
# Fail only when fewer than FAKE_NUM stale exports are listed; the message
# reports the -lt comparison that is actually performed.
76 [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
77 echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
# Mount all clients through do_and_time and capture its output as elapsed.
78 local elapsed=$(do_and_time "zconf_mount_clients $CLIENTS $DIR")
79 echo "==== $TESTNAME ===== CONNECTION TIME $elapsed: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
81 # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
87 run_test 1b "VBR: connect $CLIENTCOUNT clients with delayed exports"
89 # Sigh. One more function for mds failover
90 # fail fn does not do df on all clients
# Body of the test registered below as 1c (its function header is outside
# this excerpt): report the recovery time seen with FAKE_NUM fake stale
# exports on the MDS. Skipped without delayed recovery support.
97 delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
99 zconf_mount_clients $CLIENTS $DIR
101 # sanity mds fail (to exclude the results on fresh formatted fs)
# Two rounds: 10 fake exports first, then FAKE_NUM_MAX.
109 for FAKE_NUM in 10 $FAKE_NUM_MAX; do
# Line count of stale_exports before the fake exports are added.
111 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
113 log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
114 create_fake_exports mds $FAKE_NUM
# Line count of stale_exports after the fake exports are added.
115 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
# Fail only when fewer than FAKE_NUM stale exports are listed; the message
# reports the -lt comparison that is actually performed.
116 [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
117 echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
# Run createmany on every client with a per-host file prefix; the escaping
# keeps $(hostname) from being expanded by this shell.
120 do_nodes $CLIENTS "createmany -o $DIR/$tfile-\\\$(hostname)" 25
121 # XXX For FAILURE_MODE=HARD it is better to exclude
122 # shutdown_facet and reboot_facet time
# Elapsed seconds since RECOVERY_START_TIME (set outside the visible lines).
125 local current_ts=`date +%s`
126 local elapsed=`expr $current_ts - $RECOVERY_START_TIME`
# Remove the per-host files created above.
128 do_nodes $CLIENTS "unlinkmany $DIR/$tfile-\\\$(hostname) 25"
129 echo "==== $TESTNAME ===== RECOVERY TIME $elapsed: FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
131 # do_facet mds "lctl set_param mds.${mds_svc}.flush_stale_exports=1"
137 run_test 1c "VBR: recovery $CLIENTCOUNT clients with delayed exports"
# Body of the test registered below as 1d (its function header is outside
# this excerpt): let the fake stale exports expire, then time how long it
# takes all clients to mount. Skipped without delayed recovery support.
140 delayed_recovery_enabled || { skip "No delayed recovery support"; return 0; }
# Two rounds: 10 fake exports first, then FAKE_NUM_MAX.
145 for FAKE_NUM in 10 $FAKE_NUM_MAX; do
# Unmount every client, then mount only CLIENT1 before creating the exports.
146 zconf_umount_clients $CLIENTS $DIR
147 zconf_mount $CLIENT1 $DIR
# Line count of stale_exports before the fake exports are added.
149 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
151 log "===== CREATE FAKE EXPORTS: $FAKE_NUM ( were $NUM )"
152 create_fake_exports mds $FAKE_NUM
# Line count after creation; fewer than FAKE_NUM lines is an error.
153 NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|wc -l")
154 [ $NUM -lt $FAKE_NUM ] && error "fake exports $NUM -lt $FAKE_NUM"
155 echo "===== STALE EXPORTS: FAKE_NUM=$FAKE_NUM NUM=$NUM"
# Save the current age limit so it can be restored after the measurement.
157 OLD_AGE=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_export_age")
158 echo OLD_AGE=$OLD_AGE
# NEW_AGE is assigned outside the visible lines.
160 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$NEW_AGE"
# Wait NEW_AGE plus 3 seconds, then require every stale export to be EXPIRED.
161 sleep $((NEW_AGE + 3))
162 EX_NUM=$(do_facet mds "lctl get_param -n mds.${mds_svc}.stale_exports|grep -c EXPIRED")
163 [ "$EX_NUM" -eq "$NUM" ] || error "not all exports are expired $EX_NUM != $NUM"
# Mount all clients through do_and_time and capture its output as elapsed.
165 local elapsed=$(do_and_time "zconf_mount_clients $CLIENTS $DIR")
# NOTE(review): unlike the 1b log line, there is no space after $TESTNAME
# here; confirm whether any log parser depends on the exact format.
166 echo "==== $TESTNAME===== CONNECTION TIME $elapsed: expired FAKE_NUM=$FAKE_NUM CLIENTCOUNT=$CLIENTCOUNT"
# Restore the saved age limit.
168 do_facet mds "lctl set_param mds.${mds_svc}.stale_export_age=$OLD_AGE"
173 run_test 1d "VBR: expire exports, connect $CLIENTCOUNT clients"
174 # VBR scale tests end
# Body of the test registered below as 3a (its function header is outside
# this excerpt): run mdsrate file creation from a growing number of clients
# and record the MDS recovery_duration for each client count.
# NOTE(review): the initialisation of num, i, var and res, plus the loop
# terminators, are outside the visible lines.
177 assert_env CLIENTS MDSRATE MPIRUN
# Split the comma-separated CLIENTS list into an array.
179 local -a nodes=(${CLIENTS//,/ })
181 # INCREMENT is a number of clients
182 # a half of clients by default
183 increment=${INCREMENT:-$(( CLIENTCOUNT / 2 ))}
# Machine file path: MACHINEFILE, or a name under $TMP derived from the script.
185 machinefile=${MACHINEFILE:-$TMP/$(basename $0 .sh).machines}
186 local LOG=$TMP/${TESTSUITE}_$tfile
# ${!var} is an indirect expansion: the value of the variable named by var.
189 local procfile="*.${!var}.recovery_status"
# Tunables with defaults: iterations, files to create, threads per client.
190 local iters=${ITERS:-3}
191 local nfiles=${NFILES:-50000}
192 local nthreads=${THREADS_PER_CLIENT:-3}
# Cap nfiles at the inodes_available result when that is not larger.
194 local IFree=$(inodes_available)
195 [ $IFree -gt $nfiles ] || nfiles=$IFree
197 local dir=$DIR/d0.$TESTNAME
# Outer loop: grow the client count by increment until it exceeds CLIENTCOUNT.
207 while [ $num -le $CLIENTCOUNT ]; do
# Comma-separated list of the first num nodes.
208 list=$(comma_list ${nodes[@]:0:$num})
210 generate_machine_file $list $machinefile ||
211 { error "can not generate machinefile"; exit 1; }
# Repeat the measurement iters times for this client count.
213 for i in $(seq $iters); do
214 mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore
# Start the create workload in the background with num * nthreads processes;
# its output is copied to LOG through tee.
216 COMMAND="${MDSRATE} --create --nfiles $nfiles --dir $dir --filefmt 'f%%d'"
217 mpi_run -np $((num * nthreads)) -machinefile $machinefile ${COMMAND} | tee ${LOG} &
222 # 2 threads 100000 creates 117 secs
225 log "$i : Starting failover on mds"
# Wait for MDS recovery to complete, passing TIMEOUT * 10 as the limit.
227 if ! wait_recovery_complete mds $((TIMEOUT * 10)); then
228 echo "mds recovery is not completed!"
# Pull the recovery_duration line from the recovery_status parameter.
233 duration=$(do_facet mds lctl get_param -n $procfile | grep recovery_duration)
# Append the client count and its duration to res as consecutive elements.
235 res=( "${res[@]}" "$num" )
236 res=( "${res[@]}" "$duration" )
237 echo "RECOVERY TIME: NFILES=$nfiles number of clients: $num $duration"
241 num=$((num + increment))
244 mdsrate_cleanup $num $machinefile $nfiles $dir 'f%%d' --ignore
# Summary: print each stored client count with the duration that follows it.
247 while [ $i -lt ${#res[@]} ]; do
248 echo "RECOVERY TIME: NFILES=$nfiles number of clients: ${res[i]} ${res[i+1]}"
253 run_test 3a "recovery time, $CLIENTCOUNT clients"
# Suite epilogue: call complete with the script name and the SECONDS counter,
# then run the framework cleanup.
255 complete $(basename $0) $SECONDS
256 check_and_cleanup_lustre