# Script prologue: set defaults, source the test framework and the
# configuration file, then make sure the filesystem and second mount exist.

# ALWAYS_EXCEPT holds "15c" plus whatever the caller supplied in
# REPLAY_DUAL_EXCEPT (presumably consumed by test-framework.sh -- confirm).
6 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# SETSTRIPE/GETSTRIPE default to the corresponding $LFS subcommands unless
# they are already set to a non-empty value.
9 SETSTRIPE=${SETSTRIPE:-"$LFS setstripe"}
10 GETSTRIPE=${GETSTRIPE:-"$LFS getstripe"}
# PTLDEBUG defaults to -1 when unset or empty.
12 PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE defaults to the parent of the directory containing this script.
13 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# CLEANUP defaults to the empty string; MOUNT_2 defaults to "yes".
15 CLEANUP=${CLEANUP:-""}
16 MOUNT_2=${MOUNT_2:-"yes"}
# Source the shared test framework from the Lustre tree.
17 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; ":=" also assigns the default path to CONFIG
# when CONFIG is unset or empty.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, report the skip and exit with status 0.
23 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# With SLOW=no, put test 21b into EXCEPT_SLOW.
25 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
29 check_and_setup_lustre
# Mount $MOUNT2 on this host unless it already appears (as a whole word) in
# the list of mounted Lustre filesystems. The remainder of this "if" block is
# not visible in this excerpt.
30 MOUNTED=$(mounted_lustre_filesystems)
31 if ! $(echo $MOUNTED | grep -w -q $MOUNT2); then
32 zconf_mount $HOSTNAME $MOUNT2
# Remove entries under $DIR whose names start with "d" or "f" followed by a
# digit.
37 rm -rf $DIR/[df][0-9]*
# Start the lctl debug daemon only when DAEMONFILE is non-empty.
39 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
41 sleep 10 # Avert LVM and VM inability to flush caches in pre .33 kernels
44 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
45 replay_barrier $SINGLEMDS
46 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
48 createmany -o $MOUNT1/$tfile- 50
49 $LCTL set_param fail_loc=0x80000514
50 facet_failover $SINGLEMDS
54 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
55 unlinkmany $MOUNT1/$tfile- 50 || return 2
56 rm $MOUNT2/$tfile || return 3
57 rm $MOUNT2/$tfile-A || return 4
59 run_test 0a "expired recovery with lost client"
62 replay_barrier $SINGLEMDS
64 touch $MOUNT1/$tfile-2
66 facet_failover $SINGLEMDS
68 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
69 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
70 checkstat $MOUNT1/$tfile-2 && return 1
71 checkstat $MOUNT2/$tfile && return 2
74 run_test 0b "lost client during waiting for next transno"
78 replay_barrier $SINGLEMDS
82 checkstat $MOUNT2/a || return 1
83 checkstat $MOUNT1/b || return 2
84 rm $MOUNT2/a $MOUNT1/b
85 checkstat $MOUNT1/a && return 3
86 checkstat $MOUNT2/b && return 4
90 run_test 1 "|X| simple create"
94 replay_barrier $SINGLEMDS
98 checkstat $MOUNT2/adir || return 1
100 checkstat $MOUNT2/adir && return 2
103 run_test 2 "|X| mkdir adir"
106 replay_barrier $SINGLEMDS
108 mkdir $MOUNT2/adir/bdir
111 checkstat $MOUNT2/adir || return 1
112 checkstat $MOUNT1/adir/bdir || return 2
113 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
114 checkstat $MOUNT1/adir && return 3
115 checkstat $MOUNT2/adir/bdir && return 4
118 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
122 replay_barrier $SINGLEMDS
123 mkdir $MOUNT1/adir && return 1
124 mkdir $MOUNT2/adir/bdir
127 checkstat $MOUNT2/adir || return 2
128 checkstat $MOUNT1/adir/bdir || return 3
130 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
131 checkstat $MOUNT1/adir && return 4
132 checkstat $MOUNT2/adir/bdir && return 5
135 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
139 # multiclient version of replay_single.sh/test_8
141 multiop_bg_pause $MOUNT2/a o_tSc || return 1
144 replay_barrier $SINGLEMDS
146 wait $pid || return 1
149 [ -e $MOUNT2/a ] && return 2
152 run_test 5 "open, unlink |X| close"
157 multiop_bg_pause $MOUNT2/a o_c || return 1
159 multiop_bg_pause $MOUNT1/a o_c || return 1
162 replay_barrier $SINGLEMDS
164 wait $pid1 || return 1
168 wait $pid2 || return 1
169 [ -e $MOUNT2/a ] && return 2
172 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
175 replay_barrier $SINGLEMDS
176 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
178 checkstat $MOUNT2/$tfile || return 2
179 rm $MOUNT1/$tfile || return 3
183 run_test 8 "replay of resent request"
186 replay_barrier $SINGLEMDS
187 mcreate $MOUNT1/$tfile-1
188 mcreate $MOUNT2/$tfile-2
189 # drop first reint reply
190 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
192 do_facet $SINGLEMDS lctl set_param fail_loc=0
194 rm $MOUNT1/$tfile-[1,2] || return 1
198 run_test 9 "resending a replayed create"
201 mcreate $MOUNT1/$tfile-1
202 replay_barrier $SINGLEMDS
203 munlink $MOUNT1/$tfile-1
204 mcreate $MOUNT2/$tfile-2
205 # drop first reint reply
206 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
208 do_facet $SINGLEMDS lctl set_param fail_loc=0
210 checkstat $MOUNT1/$tfile-1 && return 1
211 checkstat $MOUNT1/$tfile-2 || return 2
216 run_test 10 "resending a replayed unlink"
219 replay_barrier $SINGLEMDS
220 mcreate $MOUNT1/$tfile-1
221 mcreate $MOUNT2/$tfile-2
222 mcreate $MOUNT1/$tfile-3
223 mcreate $MOUNT2/$tfile-4
224 mcreate $MOUNT1/$tfile-5
225 # drop all reint replies for a while
226 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
227 # note that with this fail_loc set, facet_failover df will fail
228 facet_failover $SINGLEMDS
229 # sleep for a while to let both clients reconnect and time out
230 sleep $((TIMEOUT * 2))
231 do_facet $SINGLEMDS lctl set_param fail_loc=0
233 rm $MOUNT1/$tfile-[1-5] || return 1
237 run_test 11 "both clients timeout during replay"
240 replay_barrier $SINGLEMDS
242 multiop_bg_pause $DIR/$tfile mo_c || return 1
245 #define OBD_FAIL_LDLM_ENQUEUE 0x302
246 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
247 facet_failover $SINGLEMDS
248 do_facet $SINGLEMDS lctl set_param fail_loc=0
249 clients_up || return 1
252 kill -USR1 $MULTIPID || return 3
253 wait $MULTIPID || return 4
254 $CHECKSTAT -t file $DIR/$tfile || return 2
259 run_test 12 "open resend timeout"
262 multiop_bg_pause $DIR/$tfile mo_c || return 1
265 replay_barrier $SINGLEMDS
267 kill -USR1 $MULTIPID || return 3
268 wait $MULTIPID || return 4
271 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
272 facet_failover $SINGLEMDS
273 do_facet $SINGLEMDS lctl set_param fail_loc=0
274 clients_up || return 1
277 $CHECKSTAT -t file $DIR/$tfile || return 2
282 run_test 13 "close resend timeout"
284 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
289 wait_destroy_complete
290 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
291 mkdir -p $MOUNT1/$tdir
292 $SETSTRIPE -o 0 $MOUNT1/$tdir
293 replay_barrier $SINGLEMDS
294 createmany -o $MOUNT1/$tdir/$tfile- 5
296 $SETSTRIPE -o 0 $MOUNT2/f14b-3
297 echo "data" > $MOUNT2/f14b-3
298 createmany -o $MOUNT1/$tdir/$tfile-3- 5
302 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
304 # both sets of 5 files created through $MOUNT1 should have been replayed
305 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
306 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
308 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
310 wait_mds_ost_sync || return 4
311 wait_destroy_complete || return 5
313 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
314 log "before $BEFOREUSED, after $AFTERUSED"
315 [ $AFTERUSED -ne $BEFOREUSED ] && \
316 error "after $AFTERUSED > before $BEFOREUSED" && return 4
319 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
321 test_15a() { # was test_15
322 replay_barrier $SINGLEMDS
323 createmany -o $MOUNT1/$tfile- 25
324 createmany -o $MOUNT2/$tfile-2- 1
329 unlinkmany $MOUNT1/$tfile- 25 || return 2
330 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
332 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
335 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
338 replay_barrier $SINGLEMDS
339 for ((i = 0; i < 2000; i++)); do
340 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
346 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
349 run_test 15c "remove multiple OST orphans"
352 replay_barrier $SINGLEMDS
353 createmany -o $MOUNT1/$tfile- 25
354 createmany -o $MOUNT2/$tfile-2- 1
357 facet_failover $SINGLEMDS
361 unlinkmany $MOUNT1/$tfile- 25 || return 2
363 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
367 run_test 16 "fail MDS during recovery (3571)"
370 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
372 createmany -o $MOUNT1/$tfile- 25
373 createmany -o $MOUNT2/$tfile-2- 1
375 # Make sure the disconnect is lost
383 unlinkmany $MOUNT1/$tfile- 25 || return 2
385 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
389 run_test 17 "fail OST during recovery (3571)"
391 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
394 test_18() { # bug 3822 - evicting client with enqueued lock
396 mkdir -p $MOUNT1/$tdir
397 touch $MOUNT1/$tdir/f0
398 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
399 statmany -s $MOUNT1/$tdir/f 1 500 &
402 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
404 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
405 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
407 usleep 500 # wait to ensure first client is one that will be evicted
408 openfile -f O_RDONLY $MOUNT2/$tdir/f0
410 dmesg | grep "entering recovery in server" && \
411 error "client not evicted" || true
412 do_facet client "lctl set_param fail_loc=0"
413 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
415 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
417 test_19() { # Bug 10991 - resend of open request does not fail assertion.
418 replay_barrier $SINGLEMDS
419 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
421 checkstat $DIR2/${tfile}0 || return 2
422 rm $DIR/${tfile}0 || return 3
426 run_test 19 "resend of open request"
430 replay_barrier $SINGLEMDS
436 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
437 TIER1=$((`date +%s` - BEFORE))
439 replay_barrier $SINGLEMDS
445 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
446 TIER2=$((`date +%s` - BEFORE))
447 [ $TIER2 -ge $((TIER1 * 2)) ] && \
448 error "recovery time is growing $TIER2 > $TIER1"
451 run_test 20 "recovery time is not increasing"
453 # commit on sharing tests
455 local param_file=$TMP/$tfile-params
457 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
458 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
459 touch $MOUNT1/$tfile-1
460 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
461 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
462 replay_barrier_nosync $SINGLEMDS
465 facet_failover $SINGLEMDS
467 # all renames are replayed
468 unlink $MOUNT1/$tfile-3 || return 2
470 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
472 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
473 rm -rf $MOUNT1/$tfile-*
474 restore_lustre_params < $param_file
478 run_test 21a "commit on sharing"
482 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
485 do_node $CLIENT1 touch $MOUNT1/$tfile-1
486 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
487 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
489 replay_barrier_nosync $mds
490 shutdown_client $CLIENT2 $MOUNT1
494 # were renames replayed?
496 echo UNLINK $MOUNT1/$tfile-3
497 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
498 { echo "unlink $tfile-3 fail!" && rc=1; }
501 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
507 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
508 [ $CLIENTCOUNT -lt 2 ] && \
509 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
511 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
512 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
517 zconf_umount_clients $CLIENTS $MOUNT2
518 zconf_mount_clients $CLIENTS $MOUNT1
520 local param_file=$TMP/$tfile-params
522 local num=$(get_mds_dir $MOUNT1)
524 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
528 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
530 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
532 # COS disabled (should fail)
534 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
536 # there is still a window when transactions may be written to disk before
537 # the mds device is set R/O. To avoid such a rare test failure, the check
538 # is repeated several times.
541 test_21b_sub mds$num || break;
542 let n_attempts=n_attempts+1
543 [ $n_attempts -gt 3 ] &&
544 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
546 restore_lustre_params < $param_file
550 run_test 21b "commit on sharing, two clients"
552 # end commit on sharing tests
# Script epilogue: signal completion, optionally pause, then unmount the
# second mount point and clean up.

# Call complete (defined outside this excerpt) with the script's basename and
# bash's $SECONDS counter (seconds since the shell started).
554 complete $(basename $0) $SECONDS
# SLEEP is the number of seconds elapsed since $NOW; NOW is set outside this
# excerpt.
555 SLEEP=$((`date +%s` - $NOW))
# If fewer than $TIMEOUT seconds have elapsed, sleep for that elapsed time.
# NOTE(review): this sleeps for the elapsed time, not for the remainder up to
# TIMEOUT -- confirm that is the intent.
556 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount $MOUNT2 only when MOUNTED2 is "yes" (MOUNTED2 is set outside this
# excerpt); "|| true" keeps this line's exit status zero in every case.
557 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
558 check_and_cleanup_lustre