5 # bug number: 10124 19884
6 ALWAYS_EXCEPT="15c 14b $REPLAY_DUAL_EXCEPT"  # 15c and 14b plus anything in $REPLAY_DUAL_EXCEPT; presumably the sub-tests that are always excluded - confirm in test-framework.sh
9 PTLDEBUG=${PTLDEBUG:--1}  # defaults to -1 when unset or empty
10 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}  # default: parent of the directory that contains this script
12 CLEANUP=${CLEANUP:-""}  # defaults to the empty string
13 MOUNT_2=${MOUNT_2:-"yes"}  # defaults to "yes"
14 . $LUSTRE/tests/test-framework.sh  # helpers used below are not defined in this excerpt; presumably they come from here
18 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}  # sources the config file; := also assigns this default path to CONFIG when it is unset or empty
20 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0  # exits with status 0 only if remote_mds_nodsh and skip both succeed
22 [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 5 14"  # set only when SLOW is exactly "no"
26 check_and_setup_lustre
27 MOUNTED=$(mounted_lustre_filesystems)  # captured output is searched for $MOUNT2 right below
28 if ! $(echo $MOUNTED | grep -w -q $MOUNT2); then
29 zconf_mount $HOSTNAME $MOUNT2
34 rm -rf $DIR/[df][0-9]*  # removes entries under $DIR whose names start with d or f followed by a digit
36 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE  # runs only when DAEMONFILE is non-empty
44 checkstat $MOUNT2/a || return 1
45 checkstat $MOUNT1/b || return 2
46 rm $MOUNT2/a $MOUNT1/b
47 checkstat $MOUNT1/a && return 3
48 checkstat $MOUNT2/b && return 4
52 run_test 1 "|X| simple create"
60 checkstat $MOUNT2/adir || return 1
62 checkstat $MOUNT2/adir && return 2
65 run_test 2 "|X| mkdir adir"
70 mkdir $MOUNT2/adir/bdir
73 checkstat $MOUNT2/adir || return 1
74 checkstat $MOUNT1/adir/bdir || return 2
75 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
76 checkstat $MOUNT1/adir && return 3
77 checkstat $MOUNT2/adir/bdir && return 4
80 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
85 mkdir $MOUNT1/adir && return 1
86 mkdir $MOUNT2/adir/bdir
89 checkstat $MOUNT2/adir || return 2
90 checkstat $MOUNT1/adir/bdir || return 3
92 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
93 checkstat $MOUNT1/adir && return 4
94 checkstat $MOUNT2/adir/bdir && return 5
97 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
101 # multiclient version of replay_single.sh/test_8
103 multiop_bg_pause $MOUNT2/a o_tSc || return 1
108 wait $pid || return 1
111 [ -e $MOUNT2/a ] && return 2
114 run_test 5 "open, unlink |X| close"
119 multiop_bg_pause $MOUNT2/a o_c || return 1
121 multiop_bg_pause $MOUNT1/a o_c || return 1
126 wait $pid1 || return 1
130 wait $pid2 || return 1
131 [ -e $MOUNT2/a ] && return 2
134 run_test 6 "open1, open2, unlink |X| close1 [fail mds] close2"
138 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
140 checkstat $MOUNT2/$tfile || return 2
141 rm $MOUNT1/$tfile || return 3
145 run_test 8 "replay of resent request"
149 mcreate $MOUNT1/$tfile-1
150 mcreate $MOUNT2/$tfile-2
151 # drop first reint reply
152 do_facet mds lctl set_param fail_loc=0x80000119
154 do_facet mds lctl set_param fail_loc=0
156 rm $MOUNT1/$tfile-[1,2] || return 1
160 run_test 9 "resending a replayed create"
163 mcreate $MOUNT1/$tfile-1
165 munlink $MOUNT1/$tfile-1
166 mcreate $MOUNT2/$tfile-2
167 # drop first reint reply
168 do_facet mds lctl set_param fail_loc=0x80000119
170 do_facet mds lctl set_param fail_loc=0
172 checkstat $MOUNT1/$tfile-1 && return 1
173 checkstat $MOUNT1/$tfile-2 || return 2
178 run_test 10 "resending a replayed unlink"
182 mcreate $MOUNT1/$tfile-1
183 mcreate $MOUNT2/$tfile-2
184 mcreate $MOUNT1/$tfile-3
185 mcreate $MOUNT2/$tfile-4
186 mcreate $MOUNT1/$tfile-5
187 # drop all reint replies for a while
188 do_facet mds lctl set_param fail_loc=0x0119
189 # note that with this fail_loc set, facet_failover df will fail
191 # sleep for a while, let both clients reconnect and time out
192 sleep $((TIMEOUT * 2))
193 do_facet mds lctl set_param fail_loc=0
195 while [ -z "$(ls $MOUNT1/$tfile-[1-5] 2>/dev/null)" ]; do
199 ls $MOUNT1/$tfile-[1-5]
200 rm $MOUNT1/$tfile-[1-5] || return 1
204 run_test 11 "both clients timeout during replay"
209 multiop_bg_pause $DIR/$tfile mo_c || return 1
212 #define OBD_FAIL_LDLM_ENQUEUE 0x302
213 do_facet mds lctl set_param fail_loc=0x80000302
215 do_facet mds lctl set_param fail_loc=0
216 clients_up || { kill -USR1 $MULTIPID && return 1; }
219 kill -USR1 $MULTIPID || return 3
220 wait $MULTIPID || return 4
221 $CHECKSTAT -t file $DIR/$tfile || return 2
226 run_test 12 "open resend timeout"
229 multiop_bg_pause $DIR/$tfile mo_c || return 1
234 kill -USR1 $MULTIPID || return 3
235 wait $MULTIPID || return 4
238 do_facet mds lctl set_param fail_loc=0x80000115
240 do_facet mds lctl set_param fail_loc=0
241 clients_up || return 1
244 $CHECKSTAT -t file $DIR/$tfile || return 2
249 run_test 13 "close resend timeout"
253 local lustre_version=$(get_lustre_version mds)
254 if [[ $lustre_version != 1.8* ]]; then
255 skip "mds is running $lustre_version, test is obsoleted"
259 createmany -o $MOUNT1/$tfile- 25
260 createmany -o $MOUNT2/$tfile-2- 1
261 createmany -o $MOUNT1/$tfile-3- 25
265 # expect recovery to fail due to missing client 2
266 client_evicted || return 1
269 # first 25 files should have been replayed
270 unlinkmany $MOUNT1/$tfile- 25 || return 2
272 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
275 run_test 14a "timeouts waiting for lost client during replay"
278 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
279 #lfs setstripe --index=0 --count=1 $MOUNT1
280 mkdir -p $MOUNT1/$tdir
281 #lfs setstripe --index=0 --count=1 $MOUNT1/$tdir
283 createmany -o $MOUNT1/$tfile- 5
284 echo "data" > $MOUNT2/$tdir/$tfile-2
285 createmany -o $MOUNT1/$tfile-3- 5
289 wait_recovery_complete mds || error "MDS recovery isn't done"
291 # first 5 files should have been replayed
292 unlinkmany $MOUNT1/$tfile- 5 || return 2
293 unlinkmany $MOUNT1/$tfile-3- 5 || return 3
295 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
297 wait_mds_ost_sync || return 5
298 wait_destroy_complete || return 6
300 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
301 log "before $BEFOREUSED, after $AFTERUSED"
302 [ $AFTERUSED -ne $BEFOREUSED ] && \
303 error "after $AFTERUSED > before $BEFOREUSED" && return 4
306 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
308 test_15a() { # was test_15
310 createmany -o $MOUNT1/$tfile- 25
311 createmany -o $MOUNT2/$tfile-2- 1
316 unlinkmany $MOUNT1/$tfile- 25 || return 2
317 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
319 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
322 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
326 for ((i = 0; i < 2000; i++)); do
327 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
333 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
336 run_test 15c "remove multiple OST orphans"
340 createmany -o $MOUNT1/$tfile- 25
341 createmany -o $MOUNT2/$tfile-2- 1
348 unlinkmany $MOUNT1/$tfile- 25 || return 2
350 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
354 run_test 16 "fail MDS during recovery (3571)"
357 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
359 createmany -o $MOUNT1/$tfile- 25
360 createmany -o $MOUNT2/$tfile-2- 1
362 # Make sure the disconnect is lost
370 unlinkmany $MOUNT1/$tfile- 25 || return 2
372 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
376 run_test 17 "fail OST during recovery (3571)"
378 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
381 test_18() { # bug 3822 - evicting client with enqueued lock
383 mkdir -p $MOUNT1/$tdir
384 touch $MOUNT1/$tdir/f0
385 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
386 statmany -s $MOUNT1/$tdir/f 1 500 &
389 do_facet mds lctl set_param fail_loc=0x8000030b # hold enqueue
391 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
392 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
394 sleep 0.500s # wait to ensure first client is one that will be evicted
395 openfile -f O_RDONLY $MOUNT2/$tdir/f0
397 dmesg | grep "entering recovery in server" && \
398 error "client not evicted" || true
400 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
402 test_19() { # Bug 10991 - resend of open request does not fail assertion.
404 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
406 checkstat $DIR2/${tfile}0 || return 2
407 rm $DIR/${tfile}0 || return 3
411 run_test 19 "resend of open request"
421 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
422 TIER1=$((`date +%s` - BEFORE))
430 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
431 TIER2=$((`date +%s` - BEFORE))
432 [ $TIER2 -ge $((TIER1 * 2)) ] && \
433 error "recovery time is growing $TIER2 > $TIER1"
436 run_test 20 "recovery time is not increasing"
438 test_22() { #bug 18927
439 multiop_bg_pause $MOUNT1/$tfile O_c || return 1
441 multiop_bg_pause $MOUNT2/$tfile O_c || return 2
447 wait $pid1 || return 3
449 wait $pid2 || return 4
450 [ -e $MOUNT1/$tfile ] && return 5
453 run_test 22 "double open|creat in replay with open orphan from two mntp"
455 equals_msg `basename $0`: test complete, cleaning up  # passes the script's basename and this message to equals_msg
456 SLEEP=$((`date +%s` - $NOW))  # current epoch seconds minus $NOW (NOW is set outside this excerpt)
457 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP  # sleeps for SLEEP seconds, but only when SLEEP is below TIMEOUT
458 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true  # unmounts MOUNT2 only when MOUNTED2 is "yes"; || true keeps the status zero either way
459 check_and_cleanup_lustre
460 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true  # prints the log if the file exists and exits 1 when it contains FAIL; otherwise the status is zero