2 # -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
3 # vim:shiftwidth=4:softtabstop=4:tabstop=4:
# Suite setup: build the exception list, source the test framework and the
# configuration file, make sure the filesystem is set up and that $MOUNT2 is
# mounted, then clear leftover test entries and optionally start the debug
# daemon.
# NOTE(review): the `if` opened below has no matching `fi` in this view;
# lines appear to be missing from this excerpt.
8 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# Defaults applied only when the caller has not already set the variable:
# PTLDEBUG=-1, LUSTRE=parent of this script's directory, CLEANUP="",
# MOUNT_2="yes".
11 PTLDEBUG=${PTLDEBUG:--1}
12 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
14 CLEANUP=${CLEANUP:-""}
15 MOUNT_2=${MOUNT_2:-"yes"}
16 . $LUSTRE/tests/test-framework.sh
# Source the config file; CONFIG defaults to $LUSTRE/tests/cfg/$NAME.sh.
19 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Skip the whole script (exit 0) when remote_mds_nodsh succeeds.
22 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# When SLOW is "no", add test 21b to the slow-test exception list.
24 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
28 check_and_setup_lustre
29 MOUNTED=$(mounted_lustre_filesystems)
# Mount $MOUNT2 on this host if it is not in the list of mounted filesystems.
# A trailing space is appended to both sides so grep -w matches a whole entry.
30 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
31 zconf_mount $HOSTNAME $MOUNT2
# Remove entries under $DIR whose names start with 'd' or 'f' plus a digit.
36 rm -rf $DIR/[df][0-9]*
# Start the lctl debug daemon only when DAEMONFILE is non-empty.
38 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
40 # LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
# On kernels older than 2.6.33, run three sync calls separated by 10-second
# sleeps on the $SINGLEMDS facet.
# NOTE(review): the closing `fi` for this `if` is not visible in this view.
41 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
43 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
# Generate a unique path for the LU-482 marker file; `mktemp -u` only prints
# the name and does not create the file.
46 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
# Body of test 0a (the function header and closing lines are not visible in
# this view). Creates 50 files via $MOUNT1 after calling replay_barrier, arms
# fail_loc 0x80000514, fails over $SINGLEMDS, then remounts $MOUNT2 and
# removes the files. Distinct return codes (2, 3, 4) identify which cleanup
# step failed.
48 echo "Check file is LU482_FAILED=$LU482_FAILED"
49 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
50 replay_barrier $SINGLEMDS
51 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
53 createmany -o $MOUNT1/$tfile- 50
54 $LCTL set_param fail_loc=0x80000514
55 facet_failover $SINGLEMDS
# If the LU-482 marker file exists, skip the test and return success.
56 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
60 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
61 unlinkmany $MOUNT1/$tfile- 50 || return 2
62 rm $MOUNT2/$tfile || return 3
63 rm $MOUNT2/$tfile-A || return 4
65 run_test 0a "expired recovery with lost client"
# If the LU-482 marker file exists after test 0a, abort the rest of the
# script: log the reason, delete the marker, report completion, force-unmount
# both client mount points on all $CLIENTS nodes and clean up per-OST state
# files.
# NOTE(review): the `if`, `for` and inner `if` opened here have no closing
# `fi`/`done` in this view; lines appear to be missing from this excerpt.
67 if [ -f "$LU482_FAILED" ]; then
68 log "Found check file $LU482_FAILED, aborting test script"
69 rm -vf "$LU482_FAILED"
70 complete $(basename $0) $SECONDS
# Unmount failures are deliberately ignored (|| true).
71 do_nodes $CLIENTS umount -f $MOUNT2 || true
72 do_nodes $CLIENTS umount -f $MOUNT || true
73 # copied from stopall, but avoid the MDS recovery
# Remove $TMP/ost<N>active for N = 1..$OSTCOUNT.
74 for num in `seq $OSTCOUNT`; do
76 rm -f $TMP/ost${num}active
78 if ! combined_mgs_mds ; then
# Body of test 0b (function header and some lines are not visible in this
# view). Touches a file via $MOUNT1 after calling replay_barrier, fails over
# $SINGLEMDS and remounts both mount points.
86 replay_barrier $SINGLEMDS
88 touch $MOUNT1/$tfile-2
90 facet_failover $SINGLEMDS
92 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
93 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
# Both files are expected to be absent: a successful checkstat fails the test.
94 checkstat $MOUNT1/$tfile-2 && return 1
95 checkstat $MOUNT2/$tfile && return 2
98 run_test 0b "lost client during waiting for next transno"
# Body of test 1 (function header, the create steps and the failover step are
# not visible in this view).
102 replay_barrier $SINGLEMDS
# 'a' must be visible through $MOUNT2 and 'b' through $MOUNT1.
106 checkstat $MOUNT2/a || return 1
107 checkstat $MOUNT1/b || return 2
108 rm $MOUNT2/a $MOUNT1/b
# After removal, neither name may be visible through the other mount point.
109 checkstat $MOUNT1/a && return 3
110 checkstat $MOUNT2/b && return 4
114 run_test 1 "|X| simple create"
# Body of test 2 (function header and intermediate steps are not visible in
# this view).
118 replay_barrier $SINGLEMDS
# adir must exist when seen through $MOUNT2.
122 checkstat $MOUNT2/adir || return 1
# Here adir is expected to be gone; the step that removes it between the two
# checks is not visible in this excerpt.
124 checkstat $MOUNT2/adir && return 2
127 run_test 2 "|X| mkdir adir"
# Body of test 3 (function header, the creation of adir and the failover step
# are not visible in this view).
130 replay_barrier $SINGLEMDS
132 mkdir $MOUNT2/adir/bdir
# Each directory is checked through a mount point: adir via $MOUNT2, bdir
# via $MOUNT1.
135 checkstat $MOUNT2/adir || return 1
136 checkstat $MOUNT1/adir/bdir || return 2
# Remove the child first, then the parent, through alternating mount points.
137 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
# Both directories must be gone afterwards.
138 checkstat $MOUNT1/adir && return 3
139 checkstat $MOUNT2/adir/bdir && return 4
142 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
# Body of test 4 (function header, the initial creation of adir and the
# failover step are not visible in this view).
146 replay_barrier $SINGLEMDS
# This mkdir is expected to fail (the test title says -EEXIST); success is an
# error.
147 mkdir $MOUNT1/adir && return 1
148 mkdir $MOUNT2/adir/bdir
# Both directories must exist when checked through the other mount point.
151 checkstat $MOUNT2/adir || return 2
152 checkstat $MOUNT1/adir/bdir || return 3
154 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
# Both directories must be gone after the rmdir.
155 checkstat $MOUNT1/adir && return 4
156 checkstat $MOUNT2/adir/bdir && return 5
159 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
163 # multiclient version of replay_single.sh/test_8
# Body of test 5 (function header, the pid capture, the unlink, the signal to
# the background process and the failover step are not visible in this view).
# Start a paused background multiop on $MOUNT2/a with op string "o_tSc".
165 multiop_bg_pause $MOUNT2/a o_tSc || return 1
168 replay_barrier $SINGLEMDS
# The background multiop must exit successfully.
170 wait $pid || return 1
# The file must no longer exist at the end of the test.
173 [ -e $MOUNT2/a ] && return 2
176 run_test 5 "open, unlink |X| close"
# Body of test 6 (function header, pid captures, the unlink, the signals to
# the background processes and the failover step are not visible in this
# view). Two paused background multiops hold the same file open, one through
# each mount point.
181 multiop_bg_pause $MOUNT2/a o_c || return 1
183 multiop_bg_pause $MOUNT1/a o_c || return 1
186 replay_barrier $SINGLEMDS
# Both background multiops must exit successfully.
188 wait $pid1 || return 1
192 wait $pid2 || return 1
# The file must no longer exist at the end of the test.
193 [ -e $MOUNT2/a ] && return 2
196 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
# Body of test 8 (function header and the failover step are not visible in
# this view).
199 replay_barrier $SINGLEMDS
# Run the mcreate through drop_reint_reply; a failure aborts the test.
200 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
# The file must be visible through $MOUNT2 and removable through $MOUNT1.
202 checkstat $MOUNT2/$tfile || return 2
203 rm $MOUNT1/$tfile || return 3
207 run_test 8 "replay of resent request"
# Body of test 9 (function header and the failover step between the two
# fail_loc settings are not visible in this view). Creates one file through
# each mount point after calling replay_barrier.
210 replay_barrier $SINGLEMDS
211 mcreate $MOUNT1/$tfile-1
212 mcreate $MOUNT2/$tfile-2
213 # drop first reint reply
214 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
# Clear the fail_loc again.
216 do_facet $SINGLEMDS lctl set_param fail_loc=0
# NOTE(review): the bracket glob [1,2] matches '1', ',' or '2'; [12] was
# presumably intended. It still matches the two files created above.
218 rm $MOUNT1/$tfile-[1,2] || return 1
222 run_test 9 "resending a replayed create"
# Body of test 10 (function header and the failover step between the two
# fail_loc settings are not visible in this view). $tfile-1 is created before
# the replay_barrier call and unlinked after it; $tfile-2 is created after it
# through the other mount point.
225 mcreate $MOUNT1/$tfile-1
226 replay_barrier $SINGLEMDS
227 munlink $MOUNT1/$tfile-1
228 mcreate $MOUNT2/$tfile-2
229 # drop first reint reply
230 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
# Clear the fail_loc again.
232 do_facet $SINGLEMDS lctl set_param fail_loc=0
# The unlinked file must be gone and the created file must exist.
234 checkstat $MOUNT1/$tfile-1 && return 1
235 checkstat $MOUNT1/$tfile-2 || return 2
240 run_test 10 "resending a replayed unlink"
# Body of test 11 (function header is not visible in this view). Creates five
# files, alternating between the two mount points, after calling
# replay_barrier, then fails over $SINGLEMDS while fail_loc 0x0119 is set.
243 replay_barrier $SINGLEMDS
244 mcreate $MOUNT1/$tfile-1
245 mcreate $MOUNT2/$tfile-2
246 mcreate $MOUNT1/$tfile-3
247 mcreate $MOUNT2/$tfile-4
248 mcreate $MOUNT1/$tfile-5
249 # drop all reint replies for a while
# Unlike tests 9 and 10, this value has no 0x80000000 prefix.
250 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
251 # note that with this fail_loc set, facet_failover df will fail
252 facet_failover $SINGLEMDS
253 # sleep for a while so that both clients reconnect and time out
254 sleep $((TIMEOUT * 2))
255 do_facet $SINGLEMDS lctl set_param fail_loc=0
# All five files must be removable through $MOUNT1.
257 rm $MOUNT1/$tfile-[1-5] || return 1
261 run_test 11 "both clients timeout during replay"
# Body of test 12 (function header and the assignment of MULTIPID are not
# visible in this view).
264 replay_barrier $SINGLEMDS
# Start a paused background multiop on $DIR/$tfile with op string "mo_c".
266 multiop_bg_pause $DIR/$tfile mo_c || return 1
269 #define OBD_FAIL_LDLM_ENQUEUE 0x302
# Fail over $SINGLEMDS with the fail_loc armed, then clear it.
270 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
271 facet_failover $SINGLEMDS
272 do_facet $SINGLEMDS lctl set_param fail_loc=0
273 clients_up || return 1
# Signal the paused multiop with USR1 and require it to exit successfully.
276 kill -USR1 $MULTIPID || return 3
277 wait $MULTIPID || return 4
# The file must exist and be a regular file.
278 $CHECKSTAT -t file $DIR/$tfile || return 2
283 run_test 12 "open resend timeout"
# Body of test 13 (function header and the assignment of MULTIPID are not
# visible in this view). Here the background multiop is signalled and reaped
# after the replay_barrier call but before the failover.
286 multiop_bg_pause $DIR/$tfile mo_c || return 1
289 replay_barrier $SINGLEMDS
# Signal the paused multiop with USR1 and require it to exit successfully.
291 kill -USR1 $MULTIPID || return 3
292 wait $MULTIPID || return 4
# Fail over $SINGLEMDS with fail_loc 0x80000115 armed, then clear it.
295 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
296 facet_failover $SINGLEMDS
297 do_facet $SINGLEMDS lctl set_param fail_loc=0
298 clients_up || return 1
# The file must exist and be a regular file.
301 $CHECKSTAT -t file $DIR/$tfile || return 2
306 run_test 13 "close resend timeout"
308 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
# Body of test 14b (function header, the unmount of $MOUNT2 and the MDS
# failure step are not visible in this view). Compares the "used" column of
# `df -P $DIR` before and after creating and deleting files.
313 wait_destroy_complete
# Used blocks before the test: third field of the last line of df -P output.
314 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
315 mkdir -p $MOUNT1/$tdir
316 $SETSTRIPE -i 0 $MOUNT1/$tdir
317 replay_barrier $SINGLEMDS
318 createmany -o $MOUNT1/$tdir/$tfile- 5
# Write data to a file through $MOUNT2 between the two createmany batches.
320 $SETSTRIPE -i 0 $MOUNT2/f14b-3
321 echo "data" > $MOUNT2/f14b-3
322 createmany -o $MOUNT1/$tdir/$tfile-3- 5
326 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
# NOTE(review): the comment below says 25 files, but the createmany calls
# visible here create 5 files each - confirm which is intended.
328 # first 25 files should have been replayed
329 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
330 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
332 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
334 wait_mds_ost_sync || return 4
335 wait_destroy_complete || return 5
# Used blocks after the test, measured the same way as BEFOREUSED.
337 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
338 log "before $BEFOREUSED, after $AFTERUSED"
# NOTE(review): the check is "not equal" while the error message says ">";
# a decrease in usage would also be reported as a failure.
339 [ $AFTERUSED -ne $BEFOREUSED ] && \
340 error "after $AFTERUSED > before $BEFOREUSED" && return 4
343 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
# test_15a: after calling replay_barrier, create 25 files through $MOUNT1 and
# one through $MOUNT2, then verify that the $MOUNT1 files can be unlinked and
# that the $MOUNT2 file does not exist.
# NOTE(review): the unmount/failover steps and the closing brace are not
# visible in this view.
345 test_15a() { # was test_15
346 replay_barrier $SINGLEMDS
347 createmany -o $MOUNT1/$tfile- 25
348 createmany -o $MOUNT2/$tfile-2- 1
353 unlinkmany $MOUNT1/$tfile- 25 || return 2
# The single file created through $MOUNT2 must not exist at this point.
354 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
356 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
359 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
# Body of test 15c (function header, the loop's `done`, and the
# unmount/failover steps are not visible in this view). Note that 15c is
# listed in ALWAYS_EXCEPT at the top of the script.
362 replay_barrier $SINGLEMDS
# Write the string "data" into 2000 files through $MOUNT2, aborting on the
# first failure.
363 for ((i = 0; i < 2000; i++)); do
364 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
370 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
373 run_test 15c "remove multiple OST orphans"
# Body of test 16 (function header and several steps around the failover are
# not visible in this view). Creates 25 files through $MOUNT1 and one through
# $MOUNT2 after calling replay_barrier, then fails over $SINGLEMDS.
376 replay_barrier $SINGLEMDS
377 createmany -o $MOUNT1/$tfile- 25
378 createmany -o $MOUNT2/$tfile-2- 1
381 facet_failover $SINGLEMDS
# The 25 files created through $MOUNT1 must be removable afterwards.
385 unlinkmany $MOUNT1/$tfile- 25 || return 2
387 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
391 run_test 16 "fail MDS during recovery (3571)"
# Body of test 17 (function header and the OST failure steps are not visible
# in this view).
# Skip the test (return 0) when remote_ost_nodsh succeeds.
394 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
396 createmany -o $MOUNT1/$tfile- 25
397 createmany -o $MOUNT2/$tfile-2- 1
399 # Make sure the disconnect is lost
# The 25 files created through $MOUNT1 must be removable afterwards.
407 unlinkmany $MOUNT1/$tfile- 25 || return 2
409 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
413 run_test 17 "fail OST during recovery (3571)"
415 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
# test_18: run statmany in the background against $MOUNT1/$tdir while
# fail_locs are armed on the MDS facet and on the client facet, then open the
# same file through $MOUNT2 and inspect dmesg.
# NOTE(review): several lines, including the closing brace, are not visible
# in this view.
418 test_18() { # bug 3822 - evicting client with enqueued lock
420 mkdir -p $MOUNT1/$tdir
421 touch $MOUNT1/$tdir/f0
422 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
423 statmany -s $MOUNT1/$tdir/f 1 500 &
426 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
428 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
429 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
431 usleep 500 # wait to ensure first client is one that will be evicted
432 openfile -f O_RDONLY $MOUNT2/$tdir/f0
# Report an error if dmesg contains the "entering recovery in server"
# message; the trailing `|| true` makes the statement succeed when grep finds
# nothing.
434 dmesg | grep "entering recovery in server" && \
435 error "client not evicted" || true
# Clear both fail_locs.
436 do_facet client "lctl set_param fail_loc=0"
437 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
439 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
# test_19: after calling replay_barrier, create one file through
# drop_ldlm_reply, then verify it is visible under $DIR2 and removable under
# $DIR.
# NOTE(review): the failover step and the closing brace are not visible in
# this view.
441 test_19() { # Bug 10991 - resend of open request does not fail assertion.
442 replay_barrier $SINGLEMDS
443 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
# createmany with count 1 produces ${tfile}0, which is the name checked here.
445 checkstat $DIR2/${tfile}0 || return 2
446 rm $DIR/${tfile}0 || return 3
450 run_test 19 "resend of open request"
# Body of test 20 (function header, the assignments of BEFORE and the
# unmount/failover steps are not visible in this view). Runs the same
# sequence twice and compares the elapsed wall-clock seconds of each round.
454 replay_barrier $SINGLEMDS
460 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
# Seconds elapsed since BEFORE for the first round.
461 TIER1=$((`date +%s` - BEFORE))
463 replay_barrier $SINGLEMDS
469 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
# Seconds elapsed since BEFORE for the second round.
470 TIER2=$((`date +%s` - BEFORE))
# Fail when the second round took at least twice as long as the first.
# NOTE(review): the message prints "$TIER2 > $TIER1" although the condition
# is TIER2 >= 2 * TIER1.
471 [ $TIER2 -ge $((TIER1 * 2)) ] && \
472 error "recovery time is growing $TIER2 > $TIER1"
475 run_test 20 "recovery time is not increasing"
477 # commit on sharing tests
# Body of test 21a (function header and the unmount of $MOUNT2 are not
# visible in this view). With mdt.*.commit_on_sharing set to 1, renames a
# file through alternating mount points, fails over $SINGLEMDS and checks
# that the final name exists.
479 local param_file=$TMP/$tfile-params
# Save the current commit_on_sharing setting so it can be restored at the end.
481 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
482 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
# Rename chain: -1 (via MOUNT1) -> -2 (via MOUNT2) -> -3 (via MOUNT1).
483 touch $MOUNT1/$tfile-1
484 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
485 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
486 replay_barrier_nosync $SINGLEMDS
489 facet_failover $SINGLEMDS
491 # all renames are replayed
# Unlinking the final name succeeds only if it exists after failover.
492 unlink $MOUNT1/$tfile-3 || return 2
494 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
# Turn commit_on_sharing off, remove leftovers and restore the saved setting.
496 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
497 rm -rf $MOUNT1/$tfile-*
498 restore_lustre_params < $param_file
502 run_test 21a "commit on sharing"
# Helper body called as `test_21b_sub mds$num` by test 21b; the function
# header, the assignment of $mds, the failover step and the return of $rc are
# not visible in this view, so the name is taken from the call sites.
# Performs a rename chain from two client nodes and checks that the final
# name exists afterwards.
506 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
# Rename chain across nodes: CLIENT1 creates -1, CLIENT2 renames it to -2,
# CLIENT1 renames it to -3.
509 do_node $CLIENT1 touch $MOUNT1/$tfile-1
510 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
511 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
513 replay_barrier_nosync $mds
514 shutdown_client $CLIENT2 $MOUNT1
518 # were renames replayed?
520 echo UNLINK $MOUNT1/$tfile-3
# A failed unlink is recorded in rc=1 instead of aborting immediately.
521 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
522 { echo "unlink $tfile-3 fail!" && rc=1; }
525 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
# Body of test 21b (function header, the assignments of COS and n_attempts,
# the retry loop header and several closing keywords are not visible in this
# view). Runs test_21b_sub first expecting success, then in a retry loop
# expecting failure, setting mdt.*.commit_on_sharing to $COS before each
# phase.
# Skip unless CLIENTS is set and at least two clients are available.
531 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
532 [ $CLIENTCOUNT -lt 2 ] && \
533 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
# Skip when FAILURE_MODE is HARD and mixed_mdt_devs succeeds.
535 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
536 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
# Use only $MOUNT1 on all clients for this test.
541 zconf_umount_clients $CLIENTS $MOUNT2
542 zconf_mount_clients $CLIENTS $MOUNT1
544 local param_file=$TMP/$tfile-params
# MDS index for $MOUNT1, used below to build the facet name mds$num.
546 local num=$(get_mds_dir $MOUNT1)
# Save the current commit_on_sharing setting so it can be restored at the end.
548 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
552 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
# In this phase test_21b_sub must succeed.
554 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
556 # COS disabled (should fail)
558 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
560 # there is still a window when transactions may be written to disk before
561 # the mds device is set R/O. To avoid such a rare test failure, the check
562 # is repeated several times.
# A failing test_21b_sub is the expected outcome and ends the retry loop.
565 test_21b_sub mds$num || break;
566 let n_attempts=n_attempts+1
# Give up once n_attempts exceeds 3.
567 [ $n_attempts -gt 3 ] &&
568 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
570 restore_lustre_params < $param_file
574 run_test 21b "commit on sharing, two clients"
576 # end commit on sharing tests
# Suite teardown: report completion, optionally sleep, unmount $MOUNT2 if
# this script mounted it, and run the framework's check/cleanup step.
578 complete $(basename $0) $SECONDS
# Seconds elapsed since NOW (the assignment of NOW is not visible in this
# view).
579 SLEEP=$((`date +%s` - $NOW))
# NOTE(review): this sleeps for the elapsed time, not for the remainder of
# TIMEOUT - confirm that is intended.
580 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount $MOUNT2 only when MOUNTED2 is "yes"; failures are ignored.
581 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
582 check_and_cleanup_lustre