2 # -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
3 # vim:shiftwidth=4:softtabstop=4:tabstop=4:
# Suite setup: establish defaults, load the test framework and the cluster
# configuration, bail out early when the MDS cannot be driven remotely, then
# bring the filesystem up and record what is currently mounted.

# Exception list: test 15c plus whatever the caller supplied in
# REPLAY_DUAL_EXCEPT (presumably consumed by the framework's test filter,
# which is not visible here).
8 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# Default PTLDEBUG to -1 unless the caller already set it.
11 PTLDEBUG=${PTLDEBUG:--1}
# Default LUSTRE to the parent of the directory containing this script.
12 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# CLEANUP defaults to empty; MOUNT_2 defaults to "yes".
14 CLEANUP=${CLEANUP:-""}
15 MOUNT_2=${MOUNT_2:-"yes"}
# Export MULTIOP, defaulting to the bare command name "multiop".
16 export MULTIOP=${MULTIOP:-multiop}
# Source the shared test framework from the Lustre tree.
17 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; ":=" also assigns the default path to CONFIG
# when it is unset or empty. NAME is not set in the visible lines.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, report a skip and exit the whole script
# with status 0.
23 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# With SLOW=no, put test 21b on the slow-exception list.
25 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
# Framework helper; judging by its name it verifies/sets up the filesystem.
29 check_and_setup_lustre
# Capture the output of mounted_lustre_filesystems for the mount check below.
30 MOUNTED=$(mounted_lustre_filesystems)
31 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
32 zconf_mount $HOSTNAME $MOUNT2
# Remove leftover entries directly under $DIR whose names start with "d" or
# "f" followed by a digit.
37 rm -rf $DIR/[df][0-9]*
# When DAEMONFILE is non-empty, start the lctl debug daemon with that file and
# DAEMONSIZE as arguments.
39 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
41 # LU-482: avert the LVM and VM inability to flush caches in pre-2.6.33 kernels
42 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
44 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
47 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
49 echo "Check file is LU482_FAILED=$LU482_FAILED"
50 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
51 replay_barrier $SINGLEMDS
52 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
54 createmany -o $MOUNT1/$tfile- 50
55 $LCTL set_param fail_loc=0x80000514
56 facet_failover $SINGLEMDS
57 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
61 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
62 unlinkmany $MOUNT1/$tfile- 50 || return 2
63 rm $MOUNT2/$tfile || return 3
64 rm $MOUNT2/$tfile-A || return 4
66 run_test 0a "expired recovery with lost client"
68 if [ -f "$LU482_FAILED" ]; then
69 log "Found check file $LU482_FAILED, aborting test script"
70 rm -vf "$LU482_FAILED"
71 complete $(basename $0) $SECONDS
72 do_nodes $CLIENTS umount -f $MOUNT2 || true
73 do_nodes $CLIENTS umount -f $MOUNT || true
74 # copied from stopall, but avoid the MDS recovery
75 for num in `seq $OSTCOUNT`; do
77 rm -f $TMP/ost${num}active
79 if ! combined_mgs_mds ; then
87 replay_barrier $SINGLEMDS
89 touch $MOUNT1/$tfile-2
91 facet_failover $SINGLEMDS
93 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
94 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
95 checkstat $MOUNT1/$tfile-2 && return 1
96 checkstat $MOUNT2/$tfile && return 2
99 run_test 0b "lost client during waiting for next transno"
103 replay_barrier $SINGLEMDS
107 checkstat $MOUNT2/a || return 1
108 checkstat $MOUNT1/b || return 2
109 rm $MOUNT2/a $MOUNT1/b
110 checkstat $MOUNT1/a && return 3
111 checkstat $MOUNT2/b && return 4
115 run_test 1 "|X| simple create"
119 replay_barrier $SINGLEMDS
123 checkstat $MOUNT2/adir || return 1
125 checkstat $MOUNT2/adir && return 2
128 run_test 2 "|X| mkdir adir"
131 replay_barrier $SINGLEMDS
133 mkdir $MOUNT2/adir/bdir
136 checkstat $MOUNT2/adir || return 1
137 checkstat $MOUNT1/adir/bdir || return 2
138 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
139 checkstat $MOUNT1/adir && return 3
140 checkstat $MOUNT2/adir/bdir && return 4
143 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
147 replay_barrier $SINGLEMDS
148 mkdir $MOUNT1/adir && return 1
149 mkdir $MOUNT2/adir/bdir
152 checkstat $MOUNT2/adir || return 2
153 checkstat $MOUNT1/adir/bdir || return 3
155 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
156 checkstat $MOUNT1/adir && return 4
157 checkstat $MOUNT2/adir/bdir && return 5
160 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
164 # multiclient version of replay_single.sh/test_8
166 multiop_bg_pause $MOUNT2/a o_tSc || return 1
169 replay_barrier $SINGLEMDS
171 wait $pid || return 1
174 [ -e $MOUNT2/a ] && return 2
177 run_test 5 "open, unlink |X| close"
182 multiop_bg_pause $MOUNT2/a o_c || return 1
184 multiop_bg_pause $MOUNT1/a o_c || return 1
187 replay_barrier $SINGLEMDS
189 wait $pid1 || return 1
193 wait $pid2 || return 1
194 [ -e $MOUNT2/a ] && return 2
197 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
200 replay_barrier $SINGLEMDS
201 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
203 checkstat $MOUNT2/$tfile || return 2
204 rm $MOUNT1/$tfile || return 3
208 run_test 8 "replay of resent request"
211 replay_barrier $SINGLEMDS
212 mcreate $MOUNT1/$tfile-1
213 mcreate $MOUNT2/$tfile-2
214 # drop first reint reply
215 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
217 do_facet $SINGLEMDS lctl set_param fail_loc=0
219 rm $MOUNT1/$tfile-[1,2] || return 1
223 run_test 9 "resending a replayed create"
226 mcreate $MOUNT1/$tfile-1
227 replay_barrier $SINGLEMDS
228 munlink $MOUNT1/$tfile-1
229 mcreate $MOUNT2/$tfile-2
230 # drop first reint reply
231 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
233 do_facet $SINGLEMDS lctl set_param fail_loc=0
235 checkstat $MOUNT1/$tfile-1 && return 1
236 checkstat $MOUNT1/$tfile-2 || return 2
241 run_test 10 "resending a replayed unlink"
244 replay_barrier $SINGLEMDS
245 mcreate $MOUNT1/$tfile-1
246 mcreate $MOUNT2/$tfile-2
247 mcreate $MOUNT1/$tfile-3
248 mcreate $MOUNT2/$tfile-4
249 mcreate $MOUNT1/$tfile-5
250 # drop all reint replies for a while
251 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
252 # note that with this fail_loc set, facet_failover df will fail
253 facet_failover $SINGLEMDS
254 # sleep for a while to let both clients reconnect and time out
255 sleep $((TIMEOUT * 2))
256 do_facet $SINGLEMDS lctl set_param fail_loc=0
258 rm $MOUNT1/$tfile-[1-5] || return 1
262 run_test 11 "both clients timeout during replay"
265 replay_barrier $SINGLEMDS
267 multiop_bg_pause $DIR/$tfile mo_c || return 1
270 #define OBD_FAIL_LDLM_ENQUEUE 0x302
271 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
272 facet_failover $SINGLEMDS
273 do_facet $SINGLEMDS lctl set_param fail_loc=0
274 clients_up || return 1
277 kill -USR1 $MULTIPID || return 3
278 wait $MULTIPID || return 4
279 $CHECKSTAT -t file $DIR/$tfile || return 2
284 run_test 12 "open resend timeout"
287 multiop_bg_pause $DIR/$tfile mo_c || return 1
290 replay_barrier $SINGLEMDS
292 kill -USR1 $MULTIPID || return 3
293 wait $MULTIPID || return 4
296 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
297 facet_failover $SINGLEMDS
298 do_facet $SINGLEMDS lctl set_param fail_loc=0
299 clients_up || return 1
302 $CHECKSTAT -t file $DIR/$tfile || return 2
307 run_test 13 "close resend timeout"
309 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
314 wait_destroy_complete
315 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
316 mkdir -p $MOUNT1/$tdir
317 $SETSTRIPE -i 0 $MOUNT1/$tdir
318 replay_barrier $SINGLEMDS
319 createmany -o $MOUNT1/$tdir/$tfile- 5
321 $SETSTRIPE -i 0 $MOUNT2/f14b-3
322 echo "data" > $MOUNT2/f14b-3
323 createmany -o $MOUNT1/$tdir/$tfile-3- 5
327 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
329 # both batches of 5 files created on $MOUNT1 should have been replayed
330 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
331 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
333 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
335 wait_mds_ost_sync || return 4
336 wait_destroy_complete || return 5
338 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
339 log "before $BEFOREUSED, after $AFTERUSED"
340 [ $AFTERUSED -ne $BEFOREUSED ] && \
341 error "after $AFTERUSED > before $BEFOREUSED" && return 4
344 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
346 test_15a() { # was test_15
347 replay_barrier $SINGLEMDS
348 createmany -o $MOUNT1/$tfile- 25
349 createmany -o $MOUNT2/$tfile-2- 1
354 unlinkmany $MOUNT1/$tfile- 25 || return 2
355 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
357 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
360 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
363 replay_barrier $SINGLEMDS
364 for ((i = 0; i < 2000; i++)); do
365 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
371 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
374 run_test 15c "remove multiple OST orphans"
377 replay_barrier $SINGLEMDS
378 createmany -o $MOUNT1/$tfile- 25
379 createmany -o $MOUNT2/$tfile-2- 1
382 facet_failover $SINGLEMDS
386 unlinkmany $MOUNT1/$tfile- 25 || return 2
388 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
392 run_test 16 "fail MDS during recovery (3571)"
395 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
397 createmany -o $MOUNT1/$tfile- 25
398 createmany -o $MOUNT2/$tfile-2- 1
400 # Make sure the disconnect is lost
408 unlinkmany $MOUNT1/$tfile- 25 || return 2
410 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
414 run_test 17 "fail OST during recovery (3571)"
416 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
419 test_18() { # bug 3822 - evicting client with enqueued lock
421 mkdir -p $MOUNT1/$tdir
422 touch $MOUNT1/$tdir/f0
423 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
424 statmany -s $MOUNT1/$tdir/f 1 500 &
427 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
429 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
430 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
432 usleep 500 # wait to ensure first client is one that will be evicted
433 openfile -f O_RDONLY $MOUNT2/$tdir/f0
435 dmesg | grep "entering recovery in server" && \
436 error "client not evicted" || true
437 do_facet client "lctl set_param fail_loc=0"
438 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
440 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
442 test_19() { # Bug 10991 - resend of open request does not fail assertion.
443 replay_barrier $SINGLEMDS
444 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
446 checkstat $DIR2/${tfile}0 || return 2
447 rm $DIR/${tfile}0 || return 3
451 run_test 19 "resend of open request"
455 replay_barrier $SINGLEMDS
461 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
462 TIER1=$((`date +%s` - BEFORE))
464 replay_barrier $SINGLEMDS
470 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
471 TIER2=$((`date +%s` - BEFORE))
472 [ $TIER2 -ge $((TIER1 * 2)) ] && \
473 error "recovery time is growing $TIER2 > $TIER1"
476 run_test 20 "recovery time is not increasing"
478 # commit on sharing tests
480 local param_file=$TMP/$tfile-params
482 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
483 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
484 touch $MOUNT1/$tfile-1
485 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
486 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
487 replay_barrier_nosync $SINGLEMDS
490 facet_failover $SINGLEMDS
492 # all renames are replayed
493 unlink $MOUNT1/$tfile-3 || return 2
495 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
497 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
498 rm -rf $MOUNT1/$tfile-*
499 restore_lustre_params < $param_file
503 run_test 21a "commit on sharing"
507 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
510 do_node $CLIENT1 touch $MOUNT1/$tfile-1
511 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
512 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
514 replay_barrier_nosync $mds
515 shutdown_client $CLIENT2 $MOUNT1
519 # were renames replayed?
521 echo UNLINK $MOUNT1/$tfile-3
522 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
523 { echo "unlink $tfile-3 fail!" && rc=1; }
526 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
532 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
533 [ $CLIENTCOUNT -lt 2 ] && \
534 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
536 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
537 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
542 zconf_umount_clients $CLIENTS $MOUNT2
543 zconf_mount_clients $CLIENTS $MOUNT1
545 local param_file=$TMP/$tfile-params
547 local num=$(get_mds_dir $MOUNT1)
549 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
553 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
555 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
557 # COS disabled (should fail)
559 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
561 # there is still a window when transactions may be written to disk before
562 # the mds device is set R/O. To avoid such a rare test failure, the check
563 # is repeated several times.
566 test_21b_sub mds$num || break;
567 let n_attempts=n_attempts+1
568 [ $n_attempts -gt 3 ] &&
569 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
571 restore_lustre_params < $param_file
575 run_test 21b "commit on sharing, two clients"
577 # end commit on sharing tests
# Suite teardown.
# Call the `complete` helper (defined outside this listing) with this
# script's basename and bash's SECONDS counter.
579 complete $(basename $0) $SECONDS
# Seconds elapsed since NOW; NOW is assigned outside the visible lines.
580 SLEEP=$((`date +%s` - $NOW))
# If that interval is shorter than TIMEOUT, sleep for the same interval again.
# NOTE(review): sleeping for the elapsed time rather than the remainder
# (TIMEOUT - SLEEP) looks odd — confirm intent.
581 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount $MOUNT2 on this host only when MOUNTED2 is "yes" (set outside the
# visible lines, presumably when this script performed the mount itself).
# "|| true" forces a zero status for this line either way.
582 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
# Framework helper; judging by its name it verifies and tears down the
# filesystem.
583 check_and_cleanup_lustre