# Root of the Lustre tree; unless already set in the environment, defaults to
# the parent of the directory containing this script.
5 LUSTRE=${LUSTRE:-`dirname $0`/..}
# Source the shared test framework from the Lustre tests directory.
6 . $LUSTRE/tests/test-framework.sh
# Source the test configuration. ":=" also assigns the default path
# ($LUSTRE/tests/cfg/lmv.sh) to CONFIG when it is unset or empty.
10 . ${CONFIG:=$LUSTRE/tests/cfg/lmv.sh}
# Names used for the setup and cleanup steps; both can be overridden from the
# environment and default to "setup" and "cleanup" respectively.
12 SETUP=${SETUP:-"setup"}
13 CLEANUP=${CLEANUP:-"cleanup"}
17 if [ "$MDSCOUNT" -gt 1 ]; then
19 for mds in `mds_list`; do
20 MDSDEV=$TMP/${mds}-`hostname`
21 add_mds $mds --dev $MDSDEV --size $MDSSIZE --lmv lmv1_svc
23 add_lov_to_lmv lov1 lmv1_svc --stripe_sz $STRIPE_BYTES \
24 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
27 add_mds mds1 --dev $MDSDEV --size $MDSSIZE
28 add_lov lov1 mds1 --stripe_sz $STRIPE_BYTES \
29 --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
33 add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover
34 add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE --failover
35 add_client client ${MDS} --lov lov1 --path $MOUNT
40 SETUP=${SETUP:-"setup"}
41 CLEANUP=${CLEANUP:-"cleanup"}
44 # make sure we are using the primary MDS, so the config log will
45 # be able to clean up properly.
46 activemds=`facet_active mds1`
47 if [ $activemds != "mds1" ]; then
51 umount $MOUNT2 || true
56 # During MDS recovery, the MDS clears orphans on the OSTs via
57 # mds_lov_clear_orphan, which sends a request to the OST and waits for
58 # the reply. If we stop the MDS at that point, we get obd_refcount > 1
59 # errors, because mds_lov_clear_orphan holds an export of the MDS,
60 # so the obd_refcount of the MDS will not be zero. So, wait a while before
61 # stopping the MDS. This bug needs further work.
62 for mds in `mds_list`; do
64 stop $mds ${FORCE} $MDSLCONFARGS
69 stop ost ${FORCE} --dump cleanup-dual.log
72 if [ "$ONLY" == "cleanup" ]; then
73 sysctl -w portals.debug=0
81 start_krb5_kdc || exit 1
82 start ost --reformat $OSTLCONFARGS
83 PINGER=`cat /proc/fs/lustre/pinger`
85 if [ "$PINGER" != "on" ]; then
86 echo "ERROR: Lustre must be built with --enable-pinger for replay-dual"
91 start ost2 --reformat $OSTLCONFARGS
92 start_lsvcgssd || exit 2
94 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
95 for mds in `mds_list`; do
96 start $mds --reformat $MDSLCONFARGS
98 grep " $MOUNT " /proc/mounts || zconf_mount `hostname` $MOUNT
99 grep " $MOUNT2 " /proc/mounts || zconf_mount `hostname` $MOUNT2
101 echo $TIMEOUT > /proc/sys/lustre/timeout
102 echo $UPCALL > /proc/sys/lustre/upcall
106 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
115 checkstat $MOUNT2/a || return 1
116 checkstat $MOUNT1/b || return 2
117 rm $MOUNT2/a $MOUNT1/b
118 checkstat $MOUNT1/a && return 3
119 checkstat $MOUNT2/b && return 4
123 run_test 1 "|X| simple create"
131 checkstat $MOUNT2/adir || return 1
133 checkstat $MOUNT2/adir && return 2
137 run_test 2 "|X| mkdir adir"
142 mkdir $MOUNT2/adir/bdir
145 checkstat $MOUNT2/adir || return 1
146 checkstat $MOUNT1/adir/bdir || return 2
147 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
148 checkstat $MOUNT1/adir && return 3
149 checkstat $MOUNT2/adir/bdir && return 4
153 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
158 mkdir $MOUNT1/adir && return 1
159 mkdir $MOUNT2/adir/bdir
162 checkstat $MOUNT2/adir || return 2
163 checkstat $MOUNT1/adir/bdir || return 3
165 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
166 checkstat $MOUNT1/adir && return 4
167 checkstat $MOUNT2/adir/bdir && return 5
171 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
175 # multiclient version of replay_single.sh/test_8
177 multiop $MOUNT2/a o_tSc &
179 # give multiop a chance to open
184 wait $pid || return 1
187 [ -e $MOUNT2/a ] && return 2
190 run_test 5 "open, unlink |X| close"
195 multiop $MOUNT2/a o_c &
197 multiop $MOUNT1/a o_c &
199 # give multiop a chance to open
204 wait $pid1 || return 1
208 wait $pid2 || return 1
209 [ -e $MOUNT2/a ] && return 2
212 run_test 6 "open1, open2, unlink |X| close1 [fail mds] close2"
216 multiop $MOUNT2/a o_c &
218 multiop $MOUNT1/a o_c &
220 # give multiop a chance to open
225 wait $pid2 || return 1
229 wait $pid1 || return 1
230 [ -e $MOUNT2/a ] && return 2
233 run_test 6b "open1, open2, unlink |X| close2 [fail mds] close1"
237 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
239 checkstat $MOUNT2/$tfile || return 2
240 rm $MOUNT1/$tfile || return 3
244 run_test 8 "replay of resent request"
248 mcreate $MOUNT1/$tfile-1
249 mcreate $MOUNT2/$tfile-2
250 # drop first reint reply
251 sysctl -w lustre.fail_loc=0x80000119
253 sysctl -w lustre.fail_loc=0
255 rm $MOUNT1/$tfile-[1,2] || return 1
259 run_test 9 "resending a replayed create"
262 mcreate $MOUNT1/$tfile-1
264 munlink $MOUNT1/$tfile-1
265 mcreate $MOUNT2/$tfile-2
266 # drop first reint reply
267 sysctl -w lustre.fail_loc=0x80000119
269 sysctl -w lustre.fail_loc=0
271 checkstat $MOUNT1/$tfile-1 && return 1
272 checkstat $MOUNT1/$tfile-2 || return 2
277 run_test 10 "resending a replayed unlink"
281 mcreate $MOUNT1/$tfile-1
282 mcreate $MOUNT2/$tfile-2
283 mcreate $MOUNT1/$tfile-3
284 mcreate $MOUNT2/$tfile-4
285 mcreate $MOUNT1/$tfile-5
286 # drop all reint replies for a while
287 sysctl -w lustre.fail_loc=0x0119
289 # sleep for a while to let both clients reconnect and time out
290 sleep $((TIMEOUT * 2))
291 sysctl -w lustre.fail_loc=0
293 rm $MOUNT1/$tfile-[1-5] || return 1
297 run_test 11 "both clients timeout during replay"
302 multiop $DIR/$tfile mo_c &
307 sysctl -w lustre.fail_loc=0x80000302
309 df $MOUNT || return 1
310 sysctl -w lustre.fail_loc=0
313 $CHECKSTAT -t file $DIR/$tfile || return 2
314 kill -USR1 $MULTIPID || return 3
315 wait $MULTIPID || return 4
320 run_test 12 "open resend timeout"
323 multiop $DIR/$tfile mo_c &
329 kill -USR1 $MULTIPID || return 3
330 wait $MULTIPID || return 4
333 sysctl -w lustre.fail_loc=0x80000115
335 df $MOUNT || return 1
336 sysctl -w lustre.fail_loc=0
339 $CHECKSTAT -t file $DIR/$tfile || return 2
344 run_test 13 "close resend timeout"
349 createmany -o $MOUNT1/$tfile- 25
350 createmany -o $MOUNT2/$tfile-2- 1
351 createmany -o $MOUNT1/$tfile-3- 25
355 # expect failover to fail
356 df $MOUNT && return 1
359 # first 25 files should have been
362 unlinkmany $MOUNT1/$tfile- 25 || return 2
364 zconf_mount `hostname` $MOUNT2
367 run_test 14 "timeouts waiting for lost client during replay"
371 createmany -o $MOUNT1/$tfile- 25
372 createmany -o $MOUNT2/$tfile-2- 1
376 df $MOUNT || return 1
379 unlinkmany $MOUNT1/$tfile- 25 || return 2
381 zconf_mount `hostname` $MOUNT2
384 run_test 15 "timeout waiting for lost client during replay, 1 client completes"
387 createmany -o $MOUNT1/$tfile- 25
388 createmany -o $MOUNT2/$tfile-2- 1
394 df $MOUNT || return 1
397 unlinkmany $MOUNT1/$tfile- 25 || return 2
399 zconf_mount `hostname` $MOUNT2
403 #run_test 16 "fail MDS during recovery (3571)"
406 createmany -o $MOUNT1/$tfile- 25
407 createmany -o $MOUNT2/$tfile-2- 1
409 # Make sure the disconnect is lost
413 echo -1 > /proc/sys/portals/debug
417 df $MOUNT || return 1
420 unlinkmany $MOUNT1/$tfile- 25 || return 2
422 zconf_mount `hostname` $MOUNT2
426 # OST failover is still not supported
427 #run_test 17 "fail OST during recovery (3571)"
431 multiop $MOUNT2/$tfile O_c &
433 multiop $MOUNT1/$tfile O_c &
435 # give multiop a chance to open
443 zconf_mount `hostname` $MOUNT2
445 run_test 18 "replay open, Abort recovery, don't assert (3892)"
447 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for
450 test_20() { # bug 3822 - evicting client with enqueued lock
451 mkdir -p $MOUNT1/$tdir
452 touch $MOUNT1/$tdir/f0
453 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
454 statmany -s $MOUNT1/$tdir/f 500 &
457 do_facet mds1 sysctl -w lustre.fail_loc=0x8000030b # hold enqueue
459 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
460 do_facet client sysctl -w lustre.fail_loc=0x80000305 # drop cb, evict
462 usleep 500 # wait to ensure first client is one that will be evicted
463 openfile -f O_RDONLY $MOUNT2/$tdir/f0
465 dmesg | grep "entering recovery in server" && \
466 error "client not evicted" || true
468 run_test 20 "ldlm_handle_enqueue succeeds on evicted export (3822)"
470 if [ "$ONLY" != "setup" ]; then
471 equals_msg test complete, cleaning up
473 SLEEP=$((`date +%s` - $NOW))
474 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP