# Suite preamble: establish defaults, source the test framework and the
# configuration file, make sure the second mount point is mounted, and clear
# leftover files from $DIR.
#
# Test 15c is always listed here, followed by whatever $REPLAY_DUAL_EXCEPT
# supplies. Presumably the framework skips the tests named in this variable --
# confirm in test-framework.sh.
5 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# The ${VAR:-default} forms below keep a caller-provided value and fall back
# to the default only when the variable is unset or empty.
8 PTLDEBUG=${PTLDEBUG:--1}
# Default LUSTRE to the parent of the directory that contains this script.
9 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
11 CLEANUP=${CLEANUP:-""}
12 MOUNT_2=${MOUNT_2:-"yes"}
13 . $LUSTRE/tests/test-framework.sh
# ${CONFIG:=...} assigns the default path to CONFIG when it is unset or empty,
# then the resulting file is sourced.
16 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Leave with status 0 when remote_mds_nodsh succeeds and skip returns success.
19 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# With SLOW=no, record 21b in EXCEPT_SLOW.
21 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
25 check_and_setup_lustre
26 MOUNTED=$(mounted_lustre_filesystems)
# Mount $MOUNT2 on this host unless it already appears in $MOUNTED.
# grep -q prints nothing, so $(...) expands to an empty command and the
# condition takes the exit status of the pipeline inside the substitution.
# NOTE(review): the remainder of this if-block is not visible in this excerpt.
27 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
28 zconf_mount $HOSTNAME $MOUNT2
# Remove entries directly under $DIR whose names match [df][0-9]*.
33 rm -rf $DIR/[df][0-9]*
# Start the lctl debug daemon only when DAEMONFILE is non-empty.
35 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
37 # LU-482: work around LVM and VM inability to flush caches in kernels older than 2.6.33
38 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
40 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
43 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
45 echo "Check file is LU482_FAILED=$LU482_FAILED"
46 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
47 replay_barrier $SINGLEMDS
48 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
50 createmany -o $MOUNT1/$tfile- 50
51 $LCTL set_param fail_loc=0x80000514
52 facet_failover $SINGLEMDS
53 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
57 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
58 unlinkmany $MOUNT1/$tfile- 50 || return 2
59 rm $MOUNT2/$tfile || return 3
60 rm $MOUNT2/$tfile-A || return 4
62 run_test 0a "expired recovery with lost client"
64 if [ -f "$LU482_FAILED" ]; then
65 log "Found check file $LU482_FAILED, aborting test script"
66 rm -vf "$LU482_FAILED"
67 complete $(basename $0) $SECONDS
68 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
69 check_and_cleanup_lustre
74 replay_barrier $SINGLEMDS
76 touch $MOUNT1/$tfile-2
78 facet_failover $SINGLEMDS
80 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
81 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
82 checkstat $MOUNT1/$tfile-2 && return 1
83 checkstat $MOUNT2/$tfile && return 2
86 run_test 0b "lost client during waiting for next transno"
90 replay_barrier $SINGLEMDS
94 checkstat $MOUNT2/a || return 1
95 checkstat $MOUNT1/b || return 2
96 rm $MOUNT2/a $MOUNT1/b
97 checkstat $MOUNT1/a && return 3
98 checkstat $MOUNT2/b && return 4
102 run_test 1 "|X| simple create"
106 replay_barrier $SINGLEMDS
110 checkstat $MOUNT2/adir || return 1
112 checkstat $MOUNT2/adir && return 2
115 run_test 2 "|X| mkdir adir"
118 replay_barrier $SINGLEMDS
120 mkdir $MOUNT2/adir/bdir
123 checkstat $MOUNT2/adir || return 1
124 checkstat $MOUNT1/adir/bdir || return 2
125 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
126 checkstat $MOUNT1/adir && return 3
127 checkstat $MOUNT2/adir/bdir && return 4
130 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
134 replay_barrier $SINGLEMDS
135 mkdir $MOUNT1/adir && return 1
136 mkdir $MOUNT2/adir/bdir
139 checkstat $MOUNT2/adir || return 2
140 checkstat $MOUNT1/adir/bdir || return 3
142 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
143 checkstat $MOUNT1/adir && return 4
144 checkstat $MOUNT2/adir/bdir && return 5
147 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
151 # multiclient version of replay_single.sh/test_8
153 multiop_bg_pause $MOUNT2/a o_tSc || return 1
156 replay_barrier $SINGLEMDS
158 wait $pid || return 1
161 [ -e $MOUNT2/a ] && return 2
164 run_test 5 "open, unlink |X| close"
169 multiop_bg_pause $MOUNT2/a o_c || return 1
171 multiop_bg_pause $MOUNT1/a o_c || return 1
174 replay_barrier $SINGLEMDS
176 wait $pid1 || return 1
180 wait $pid2 || return 1
181 [ -e $MOUNT2/a ] && return 2
184 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
187 replay_barrier $SINGLEMDS
188 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
190 checkstat $MOUNT2/$tfile || return 2
191 rm $MOUNT1/$tfile || return 3
195 run_test 8 "replay of resent request"
198 replay_barrier $SINGLEMDS
199 mcreate $MOUNT1/$tfile-1
200 mcreate $MOUNT2/$tfile-2
201 # drop first reint reply
202 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
204 do_facet $SINGLEMDS lctl set_param fail_loc=0
206 rm $MOUNT1/$tfile-[1,2] || return 1
210 run_test 9 "resending a replayed create"
213 mcreate $MOUNT1/$tfile-1
214 replay_barrier $SINGLEMDS
215 munlink $MOUNT1/$tfile-1
216 mcreate $MOUNT2/$tfile-2
217 # drop first reint reply
218 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
220 do_facet $SINGLEMDS lctl set_param fail_loc=0
222 checkstat $MOUNT1/$tfile-1 && return 1
223 checkstat $MOUNT1/$tfile-2 || return 2
228 run_test 10 "resending a replayed unlink"
231 replay_barrier $SINGLEMDS
232 mcreate $MOUNT1/$tfile-1
233 mcreate $MOUNT2/$tfile-2
234 mcreate $MOUNT1/$tfile-3
235 mcreate $MOUNT2/$tfile-4
236 mcreate $MOUNT1/$tfile-5
237 # drop all reint replies for a while
238 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
239 # note that with this fail_loc set, facet_failover df will fail
240 facet_failover $SINGLEMDS
241 # sleep for a while to let both clients reconnect and time out
242 sleep $((TIMEOUT * 2))
243 do_facet $SINGLEMDS lctl set_param fail_loc=0
245 rm $MOUNT1/$tfile-[1-5] || return 1
249 run_test 11 "both clients timeout during replay"
252 replay_barrier $SINGLEMDS
254 multiop_bg_pause $DIR/$tfile mo_c || return 1
257 #define OBD_FAIL_LDLM_ENQUEUE 0x302
258 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
259 facet_failover $SINGLEMDS
260 do_facet $SINGLEMDS lctl set_param fail_loc=0
261 clients_up || return 1
264 kill -USR1 $MULTIPID || return 3
265 wait $MULTIPID || return 4
266 $CHECKSTAT -t file $DIR/$tfile || return 2
271 run_test 12 "open resend timeout"
274 multiop_bg_pause $DIR/$tfile mo_c || return 1
277 replay_barrier $SINGLEMDS
279 kill -USR1 $MULTIPID || return 3
280 wait $MULTIPID || return 4
283 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
284 facet_failover $SINGLEMDS
285 do_facet $SINGLEMDS lctl set_param fail_loc=0
286 clients_up || return 1
289 $CHECKSTAT -t file $DIR/$tfile || return 2
294 run_test 13 "close resend timeout"
296 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
301 wait_destroy_complete
302 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
303 mkdir -p $MOUNT1/$tdir
304 $SETSTRIPE -i 0 $MOUNT1/$tdir
305 replay_barrier $SINGLEMDS
306 createmany -o $MOUNT1/$tdir/$tfile- 5
308 $SETSTRIPE -i 0 $MOUNT2/f14b-3
309 echo "data" > $MOUNT2/f14b-3
310 createmany -o $MOUNT1/$tdir/$tfile-3- 5
314 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
316 # both sets of 5 files created via $MOUNT1 should have been replayed
317 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
318 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
320 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
322 wait_mds_ost_sync || return 4
323 wait_destroy_complete || return 5
325 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
326 log "before $BEFOREUSED, after $AFTERUSED"
327 [ $AFTERUSED -ne $BEFOREUSED ] && \
328 error "after $AFTERUSED > before $BEFOREUSED" && return 4
331 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
# test_15a: "timeout waiting for lost client during replay, 1 client
# completes" (description taken from the run_test line below).
#
# After the replay barrier on $SINGLEMDS, 25 files are created through
# $MOUNT1 and one file through $MOUNT2. The later checks require that the 25
# $MOUNT1 files can be unlinked (return 2 otherwise) and that the single
# $MOUNT2-created file is absent when looked up through $MOUNT1.
# NOTE(review): the steps between the creates and the checks are not visible
# in this excerpt; presumably $MOUNT2 is unmounted and the MDS failed over
# there -- confirm against the full file.
333 test_15a() { # was test_15
334 replay_barrier $SINGLEMDS
335 createmany -o $MOUNT1/$tfile- 25
336 createmany -o $MOUNT2/$tfile-2- 1
341 unlinkmany $MOUNT1/$tfile- 25 || return 2
# The file created through $MOUNT2 must not exist; error is called if it does.
342 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
# Mount $MOUNT2 on this host again before the test ends.
344 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
347 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
350 replay_barrier $SINGLEMDS
351 for ((i = 0; i < 2000; i++)); do
352 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
358 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
361 run_test 15c "remove multiple OST orphans"
364 replay_barrier $SINGLEMDS
365 createmany -o $MOUNT1/$tfile- 25
366 createmany -o $MOUNT2/$tfile-2- 1
369 facet_failover $SINGLEMDS
373 unlinkmany $MOUNT1/$tfile- 25 || return 2
375 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
379 run_test 16 "fail MDS during recovery (3571)"
382 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
384 createmany -o $MOUNT1/$tfile- 25
385 createmany -o $MOUNT2/$tfile-2- 1
387 # Make sure the disconnect is lost
395 unlinkmany $MOUNT1/$tfile- 25 || return 2
397 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
401 run_test 17 "fail OST during recovery (3571)"
403 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
# test_18: "ldlm_handle_enqueue succeeds on evicted export (3822)"
# (description taken from the run_test line below).
#
# Sequence visible here:
#   1. create $tdir and file f0 through $MOUNT1;
#   2. start statmany against $MOUNT1/$tdir/f in the background;
#   3. arm fail_loc 0x8000030b on the MDS and 0x80000305 on the client
#      (the #define comments name the low bits of each value);
#   4. open f0 read-only through $MOUNT2;
#   5. report an error if dmesg contains "entering recovery in server";
#   6. reset fail_loc to 0 on both the client and the MDS.
# NOTE(review): several original lines of this function are not visible in
# this excerpt, so the list above may be incomplete.
406 test_18() { # bug 3822 - evicting client with enqueued lock
408 mkdir -p $MOUNT1/$tdir
409 touch $MOUNT1/$tdir/f0
410 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
411 statmany -s $MOUNT1/$tdir/f 1 500 &
414 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
416 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
417 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
419 usleep 500 # wait to ensure first client is one that will be evicted
420 openfile -f O_RDONLY $MOUNT2/$tdir/f0
# If the kernel log mentions server recovery, call error; the trailing
# "|| true" keeps this statement's status zero when grep finds nothing.
422 dmesg | grep "entering recovery in server" && \
423 error "client not evicted" || true
# Disarm both fail_loc settings armed above.
424 do_facet client "lctl set_param fail_loc=0"
425 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
427 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
# test_19: "resend of open request" (description taken from the run_test line
# below).
#
# After the replay barrier on $SINGLEMDS, a single-file createmany under $DIR
# is run through drop_ldlm_reply. The file ${tfile}0 must then be visible
# through $DIR2 and removable through $DIR.
# Return codes: 1 = the wrapped create failed, 2 = checkstat via $DIR2 failed,
# 3 = rm via $DIR failed.
# NOTE(review): original lines between the create and the checks are not
# visible in this excerpt.
429 test_19() { # Bug 10991 - resend of open request does not fail assertion.
430 replay_barrier $SINGLEMDS
431 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
433 checkstat $DIR2/${tfile}0 || return 2
434 rm $DIR/${tfile}0 || return 3
438 run_test 19 "resend of open request"
442 replay_barrier $SINGLEMDS
448 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
449 TIER1=$((`date +%s` - BEFORE))
451 replay_barrier $SINGLEMDS
457 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
458 TIER2=$((`date +%s` - BEFORE))
459 [ $TIER2 -ge $((TIER1 * 2)) ] && \
460 error "recovery time is growing $TIER2 > $TIER1"
463 run_test 20 "recovery time is not increasing"
465 # commit on sharing tests
467 local param_file=$TMP/$tfile-params
469 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
470 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
471 touch $MOUNT1/$tfile-1
472 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
473 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
474 replay_barrier_nosync $SINGLEMDS
477 facet_failover $SINGLEMDS
479 # all renames are replayed
480 unlink $MOUNT1/$tfile-3 || return 2
482 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
484 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
485 rm -rf $MOUNT1/$tfile-*
486 restore_lustre_params < $param_file
490 run_test 21a "commit on sharing"
494 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
497 do_node $CLIENT1 touch $MOUNT1/$tfile-1
498 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
499 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
501 replay_barrier_nosync $mds
502 shutdown_client $CLIENT2 $MOUNT1
506 # were renames replayed?
508 echo UNLINK $MOUNT1/$tfile-3
509 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
510 { echo "unlink $tfile-3 fail!" && rc=1; }
513 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
519 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
520 [ $CLIENTCOUNT -lt 2 ] && \
521 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
523 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
524 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
529 zconf_umount_clients $CLIENTS $MOUNT2
530 zconf_mount_clients $CLIENTS $MOUNT1
532 local param_file=$TMP/$tfile-params
534 local num=$(get_mds_dir $MOUNT1)
536 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
540 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
542 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
544 # COS disabled (should fail)
546 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
548 # there is still a window when transactions may be written to disk before
549 # the mds device is set R/O. To avoid such a rare test failure, the check
550 # is repeated several times.
553 test_21b_sub mds$num || break;
554 let n_attempts=n_attempts+1
555 [ $n_attempts -gt 3 ] &&
556 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
558 restore_lustre_params < $param_file
562 run_test 21b "commit on sharing, two clients"
564 # end commit on sharing tests
# Suite teardown: report completion, optionally pause, unmount the second
# mount point if this script mounted it, then run the framework cleanup.
566 complete $(basename $0) $SECONDS
# SLEEP is the number of seconds elapsed since $NOW (NOW is assigned outside
# this excerpt).
567 SLEEP=$((`date +%s` - $NOW))
# When fewer than $TIMEOUT seconds have elapsed, sleep for SLEEP seconds.
# NOTE(review): this sleeps for the elapsed time, not for the remaining
# TIMEOUT - SLEEP seconds -- confirm which was intended.
568 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount $MOUNT2 only when MOUNTED2 is "yes"; "|| true" keeps the statement's
# exit status zero whether or not the unmount ran or succeeded.
569 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
570 check_and_cleanup_lustre