2 # -*- mode: Bash; tab-width: 4; indent-tabs-mode: t; -*-
3 # vim:shiftwidth=4:softtabstop=4:tabstop=4:
# Top-level environment setup for this test script: establishes defaults,
# loads the shared test framework and the site configuration, and bails out
# early when the MDS cannot be driven remotely.

# Test "15c" is always listed, followed by whatever the caller supplied in
# REPLAY_DUAL_EXCEPT (presumably consumed by the framework as a skip list --
# TODO confirm against test-framework.sh).
8 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# Keep a caller-provided PTLDEBUG; otherwise default to -1.
11 PTLDEBUG=${PTLDEBUG:--1}
# Keep a caller-provided LUSTRE; otherwise use the parent of the directory
# that contains this script.
12 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# CLEANUP defaults to the empty string, MOUNT_2 to "yes".
14 CLEANUP=${CLEANUP:-""}
15 MOUNT_2=${MOUNT_2:-"yes"}
# Source the shared test framework from the Lustre tree.
16 . $LUSTRE/tests/test-framework.sh
# Source the configuration file. ${CONFIG:=...} also assigns the default path
# to CONFIG when it is unset or empty. NAME is set outside this excerpt.
19 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, report a skip and end the script with
# status 0.
22 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# When SLOW is "no", add test 21b to EXCEPT_SLOW.
24 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
28 check_and_setup_lustre
29 MOUNTED=$(mounted_lustre_filesystems)
30 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
31 zconf_mount $HOSTNAME $MOUNT2
36 rm -rf $DIR/[df][0-9]*
38 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
40 # LU-482: work around the inability of LVM and VMs to flush caches on kernels older than 2.6.33
41 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
43 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
46 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
48 echo "Check file is LU482_FAILED=$LU482_FAILED"
49 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
50 replay_barrier $SINGLEMDS
51 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
53 createmany -o $MOUNT1/$tfile- 50
54 $LCTL set_param fail_loc=0x80000514
55 facet_failover $SINGLEMDS
56 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
60 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
61 unlinkmany $MOUNT1/$tfile- 50 || return 2
62 rm $MOUNT2/$tfile || return 3
63 rm $MOUNT2/$tfile-A || return 4
65 run_test 0a "expired recovery with lost client"
67 if [ -f "$LU482_FAILED" ]; then
68 log "Found check file $LU482_FAILED, aborting test script"
69 rm -vf "$LU482_FAILED"
70 complete $(basename $0) $SECONDS
71 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
72 check_and_cleanup_lustre
77 replay_barrier $SINGLEMDS
79 touch $MOUNT1/$tfile-2
81 facet_failover $SINGLEMDS
83 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
84 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
85 checkstat $MOUNT1/$tfile-2 && return 1
86 checkstat $MOUNT2/$tfile && return 2
89 run_test 0b "lost client during waiting for next transno"
93 replay_barrier $SINGLEMDS
97 checkstat $MOUNT2/a || return 1
98 checkstat $MOUNT1/b || return 2
99 rm $MOUNT2/a $MOUNT1/b
100 checkstat $MOUNT1/a && return 3
101 checkstat $MOUNT2/b && return 4
105 run_test 1 "|X| simple create"
109 replay_barrier $SINGLEMDS
113 checkstat $MOUNT2/adir || return 1
115 checkstat $MOUNT2/adir && return 2
118 run_test 2 "|X| mkdir adir"
121 replay_barrier $SINGLEMDS
123 mkdir $MOUNT2/adir/bdir
126 checkstat $MOUNT2/adir || return 1
127 checkstat $MOUNT1/adir/bdir || return 2
128 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
129 checkstat $MOUNT1/adir && return 3
130 checkstat $MOUNT2/adir/bdir && return 4
133 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
137 replay_barrier $SINGLEMDS
138 mkdir $MOUNT1/adir && return 1
139 mkdir $MOUNT2/adir/bdir
142 checkstat $MOUNT2/adir || return 2
143 checkstat $MOUNT1/adir/bdir || return 3
145 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
146 checkstat $MOUNT1/adir && return 4
147 checkstat $MOUNT2/adir/bdir && return 5
150 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
154 # multiclient version of replay_single.sh/test_8
156 multiop_bg_pause $MOUNT2/a o_tSc || return 1
159 replay_barrier $SINGLEMDS
161 wait $pid || return 1
164 [ -e $MOUNT2/a ] && return 2
167 run_test 5 "open, unlink |X| close"
172 multiop_bg_pause $MOUNT2/a o_c || return 1
174 multiop_bg_pause $MOUNT1/a o_c || return 1
177 replay_barrier $SINGLEMDS
179 wait $pid1 || return 1
183 wait $pid2 || return 1
184 [ -e $MOUNT2/a ] && return 2
187 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
190 replay_barrier $SINGLEMDS
191 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
193 checkstat $MOUNT2/$tfile || return 2
194 rm $MOUNT1/$tfile || return 3
198 run_test 8 "replay of resent request"
201 replay_barrier $SINGLEMDS
202 mcreate $MOUNT1/$tfile-1
203 mcreate $MOUNT2/$tfile-2
204 # drop first reint reply
205 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
207 do_facet $SINGLEMDS lctl set_param fail_loc=0
209 rm $MOUNT1/$tfile-[1,2] || return 1
213 run_test 9 "resending a replayed create"
216 mcreate $MOUNT1/$tfile-1
217 replay_barrier $SINGLEMDS
218 munlink $MOUNT1/$tfile-1
219 mcreate $MOUNT2/$tfile-2
220 # drop first reint reply
221 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
223 do_facet $SINGLEMDS lctl set_param fail_loc=0
225 checkstat $MOUNT1/$tfile-1 && return 1
226 checkstat $MOUNT1/$tfile-2 || return 2
231 run_test 10 "resending a replayed unlink"
234 replay_barrier $SINGLEMDS
235 mcreate $MOUNT1/$tfile-1
236 mcreate $MOUNT2/$tfile-2
237 mcreate $MOUNT1/$tfile-3
238 mcreate $MOUNT2/$tfile-4
239 mcreate $MOUNT1/$tfile-5
240 # drop all reint replies for a while
241 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
242 # note that with this fail_loc set, facet_failover df will fail
243 facet_failover $SINGLEMDS
244 # sleep for a while to let both clients reconnect and time out
245 sleep $((TIMEOUT * 2))
246 do_facet $SINGLEMDS lctl set_param fail_loc=0
248 rm $MOUNT1/$tfile-[1-5] || return 1
252 run_test 11 "both clients timeout during replay"
255 replay_barrier $SINGLEMDS
257 multiop_bg_pause $DIR/$tfile mo_c || return 1
260 #define OBD_FAIL_LDLM_ENQUEUE 0x302
261 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
262 facet_failover $SINGLEMDS
263 do_facet $SINGLEMDS lctl set_param fail_loc=0
264 clients_up || return 1
267 kill -USR1 $MULTIPID || return 3
268 wait $MULTIPID || return 4
269 $CHECKSTAT -t file $DIR/$tfile || return 2
274 run_test 12 "open resend timeout"
277 multiop_bg_pause $DIR/$tfile mo_c || return 1
280 replay_barrier $SINGLEMDS
282 kill -USR1 $MULTIPID || return 3
283 wait $MULTIPID || return 4
286 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
287 facet_failover $SINGLEMDS
288 do_facet $SINGLEMDS lctl set_param fail_loc=0
289 clients_up || return 1
292 $CHECKSTAT -t file $DIR/$tfile || return 2
297 run_test 13 "close resend timeout"
299 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
304 wait_destroy_complete
305 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
306 mkdir -p $MOUNT1/$tdir
307 $SETSTRIPE -i 0 $MOUNT1/$tdir
308 replay_barrier $SINGLEMDS
309 createmany -o $MOUNT1/$tdir/$tfile- 5
311 $SETSTRIPE -i 0 $MOUNT2/f14b-3
312 echo "data" > $MOUNT2/f14b-3
313 createmany -o $MOUNT1/$tdir/$tfile-3- 5
317 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
319 # first 5 files should have been replayed
320 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
321 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
323 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
325 wait_mds_ost_sync || return 4
326 wait_destroy_complete || return 5
328 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
329 log "before $BEFOREUSED, after $AFTERUSED"
330 [ $AFTERUSED -ne $BEFOREUSED ] && \
331 error "after $AFTERUSED > before $BEFOREUSED" && return 4
334 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
336 test_15a() { # was test_15
337 replay_barrier $SINGLEMDS
338 createmany -o $MOUNT1/$tfile- 25
339 createmany -o $MOUNT2/$tfile-2- 1
344 unlinkmany $MOUNT1/$tfile- 25 || return 2
345 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
347 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
350 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
353 replay_barrier $SINGLEMDS
354 for ((i = 0; i < 2000; i++)); do
355 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
361 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
364 run_test 15c "remove multiple OST orphans"
367 replay_barrier $SINGLEMDS
368 createmany -o $MOUNT1/$tfile- 25
369 createmany -o $MOUNT2/$tfile-2- 1
372 facet_failover $SINGLEMDS
376 unlinkmany $MOUNT1/$tfile- 25 || return 2
378 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
382 run_test 16 "fail MDS during recovery (3571)"
385 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
387 createmany -o $MOUNT1/$tfile- 25
388 createmany -o $MOUNT2/$tfile-2- 1
390 # Make sure the disconnect is lost
398 unlinkmany $MOUNT1/$tfile- 25 || return 2
400 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
404 run_test 17 "fail OST during recovery (3571)"
406 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
409 test_18() { # bug 3822 - evicting client with enqueued lock
411 mkdir -p $MOUNT1/$tdir
412 touch $MOUNT1/$tdir/f0
413 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
414 statmany -s $MOUNT1/$tdir/f 1 500 &
417 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
419 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
420 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
422 usleep 500 # wait to ensure first client is one that will be evicted
423 openfile -f O_RDONLY $MOUNT2/$tdir/f0
425 dmesg | grep "entering recovery in server" && \
426 error "client not evicted" || true
427 do_facet client "lctl set_param fail_loc=0"
428 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
430 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
432 test_19() { # Bug 10991 - resend of open request does not fail assertion.
433 replay_barrier $SINGLEMDS
434 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
436 checkstat $DIR2/${tfile}0 || return 2
437 rm $DIR/${tfile}0 || return 3
441 run_test 19 "resend of open request"
445 replay_barrier $SINGLEMDS
451 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
452 TIER1=$((`date +%s` - BEFORE))
454 replay_barrier $SINGLEMDS
460 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
461 TIER2=$((`date +%s` - BEFORE))
462 [ $TIER2 -ge $((TIER1 * 2)) ] && \
463 error "recovery time is growing $TIER2 > $TIER1"
466 run_test 20 "recovery time is not increasing"
468 # commit on sharing tests
470 local param_file=$TMP/$tfile-params
472 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
473 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
474 touch $MOUNT1/$tfile-1
475 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
476 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
477 replay_barrier_nosync $SINGLEMDS
480 facet_failover $SINGLEMDS
482 # all renames are replayed
483 unlink $MOUNT1/$tfile-3 || return 2
485 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
487 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
488 rm -rf $MOUNT1/$tfile-*
489 restore_lustre_params < $param_file
493 run_test 21a "commit on sharing"
497 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
500 do_node $CLIENT1 touch $MOUNT1/$tfile-1
501 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
502 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
504 replay_barrier_nosync $mds
505 shutdown_client $CLIENT2 $MOUNT1
509 # were renames replayed?
511 echo UNLINK $MOUNT1/$tfile-3
512 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
513 { echo "unlink $tfile-3 fail!" && rc=1; }
516 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
522 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
523 [ $CLIENTCOUNT -lt 2 ] && \
524 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
526 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
527 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
532 zconf_umount_clients $CLIENTS $MOUNT2
533 zconf_mount_clients $CLIENTS $MOUNT1
535 local param_file=$TMP/$tfile-params
537 local num=$(get_mds_dir $MOUNT1)
539 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
543 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
545 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
547 # COS disabled (should fail)
549 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
551 # there is still a window when transactions may be written to disk before
552 # the mds device is set R/O. To avoid such a rare test failure, the check
553 # is repeated several times.
556 test_21b_sub mds$num || break;
557 let n_attempts=n_attempts+1
558 [ $n_attempts -gt 3 ] &&
559 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
561 restore_lustre_params < $param_file
565 run_test 21b "commit on sharing, two clients"
567 # end commit on sharing tests
# Final teardown, run after all tests have been registered and executed.

# Report completion, passing this script's base name and bash's SECONDS
# counter (seconds since the shell started) to `complete`, which is defined
# outside this excerpt.
569 complete $(basename $0) $SECONDS
# Seconds elapsed since the timestamp stored in NOW (NOW is assigned outside
# this excerpt).
570 SLEEP=$((`date +%s` - $NOW))
# If fewer than TIMEOUT seconds have elapsed, sleep for the elapsed time.
# NOTE(review): this sleeps $SLEEP rather than the remainder
# (TIMEOUT - SLEEP) -- confirm which delay is intended.
571 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount the second mount point only when MOUNTED2 is "yes". The trailing
# `|| true` keeps this line's exit status 0 whether the test is false or the
# unmount fails.
572 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
# Framework helper defined outside this excerpt; presumably verifies and
# tears down the filesystem -- TODO confirm against test-framework.sh.
573 check_and_cleanup_lustre