# Script preamble: establish overridable defaults, load the shared test
# framework and the cluster configuration, then set the filesystem up.
# NOTE(review): the numeric prefixes in this listing are not contiguous, so
# statements may exist between the lines shown here -- check the full script.

# "15c" plus whatever $REPLAY_DUAL_EXCEPT holds. Presumably consumed by
# test-framework.sh as the list of tests to always skip -- TODO confirm.
6 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# Striping commands default to the matching "$LFS" subcommands unless the
# caller already provided a non-empty value.
9 SETSTRIPE=${SETSTRIPE:-"$LFS setstripe"}
10 GETSTRIPE=${GETSTRIPE:-"$LFS getstripe"}
# Defaults to -1 when PTLDEBUG is unset or empty.
12 PTLDEBUG=${PTLDEBUG:--1}
# Defaults to the parent of the directory that contains this script.
13 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# Empty by default; "yes" by default. Both can be overridden by the caller.
15 CLEANUP=${CLEANUP:-""}
16 MOUNT_2=${MOUNT_2:-"yes"}
# Pull in the shared helpers (replay_barrier, facet_failover, run_test, ...
# are used below but not defined in this file's visible part).
17 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; ":=" also assigns the default path to CONFIG
# when it was unset or empty.
20 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# Bail out successfully (exit 0) after reporting a skip when
# remote_mds_nodsh succeeds.
23 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# When SLOW is "no", put test 21b into EXCEPT_SLOW.
25 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
29 check_and_setup_lustre
# Capture the helper's output; it is matched against $MOUNT2 right after this.
30 MOUNTED=$(mounted_lustre_filesystems)
31 if ! $(echo $MOUNTED | grep -w -q $MOUNT2); then
32 zconf_mount $HOSTNAME $MOUNT2
37 rm -rf $DIR/[df][0-9]*
39 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
42 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
43 replay_barrier $SINGLEMDS
44 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
46 createmany -o $MOUNT1/$tfile- 50
47 $LCTL set_param fail_loc=0x80000514
48 facet_failover $SINGLEMDS
52 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
53 unlinkmany $MOUNT1/$tfile- 50 || return 2
54 rm $MOUNT2/$tfile || return 3
55 rm $MOUNT2/$tfile-A || return 4
57 run_test 0a "expired recovery with lost client"
60 replay_barrier $SINGLEMDS
62 touch $MOUNT1/$tfile-2
64 facet_failover $SINGLEMDS
66 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
67 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
68 checkstat $MOUNT1/$tfile-2 && return 1
69 checkstat $MOUNT2/$tfile && return 2
72 run_test 0b "lost client during waiting for next transno"
76 replay_barrier $SINGLEMDS
80 checkstat $MOUNT2/a || return 1
81 checkstat $MOUNT1/b || return 2
82 rm $MOUNT2/a $MOUNT1/b
83 checkstat $MOUNT1/a && return 3
84 checkstat $MOUNT2/b && return 4
88 run_test 1 "|X| simple create"
92 replay_barrier $SINGLEMDS
96 checkstat $MOUNT2/adir || return 1
98 checkstat $MOUNT2/adir && return 2
101 run_test 2 "|X| mkdir adir"
104 replay_barrier $SINGLEMDS
106 mkdir $MOUNT2/adir/bdir
109 checkstat $MOUNT2/adir || return 1
110 checkstat $MOUNT1/adir/bdir || return 2
111 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
112 checkstat $MOUNT1/adir && return 3
113 checkstat $MOUNT2/adir/bdir && return 4
116 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
120 replay_barrier $SINGLEMDS
121 mkdir $MOUNT1/adir && return 1
122 mkdir $MOUNT2/adir/bdir
125 checkstat $MOUNT2/adir || return 2
126 checkstat $MOUNT1/adir/bdir || return 3
128 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
129 checkstat $MOUNT1/adir && return 4
130 checkstat $MOUNT2/adir/bdir && return 5
133 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
137 # multiclient version of replay_single.sh/test_8
139 multiop_bg_pause $MOUNT2/a o_tSc || return 1
142 replay_barrier $SINGLEMDS
144 wait $pid || return 1
147 [ -e $MOUNT2/a ] && return 2
150 run_test 5 "open, unlink |X| close"
155 multiop_bg_pause $MOUNT2/a o_c || return 1
157 multiop_bg_pause $MOUNT1/a o_c || return 1
160 replay_barrier $SINGLEMDS
162 wait $pid1 || return 1
166 wait $pid2 || return 1
167 [ -e $MOUNT2/a ] && return 2
170 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
173 replay_barrier $SINGLEMDS
174 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
176 checkstat $MOUNT2/$tfile || return 2
177 rm $MOUNT1/$tfile || return 3
181 run_test 8 "replay of resent request"
184 replay_barrier $SINGLEMDS
185 mcreate $MOUNT1/$tfile-1
186 mcreate $MOUNT2/$tfile-2
187 # drop first reint reply
188 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
190 do_facet $SINGLEMDS lctl set_param fail_loc=0
192 rm $MOUNT1/$tfile-[1,2] || return 1
196 run_test 9 "resending a replayed create"
199 mcreate $MOUNT1/$tfile-1
200 replay_barrier $SINGLEMDS
201 munlink $MOUNT1/$tfile-1
202 mcreate $MOUNT2/$tfile-2
203 # drop first reint reply
204 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
206 do_facet $SINGLEMDS lctl set_param fail_loc=0
208 checkstat $MOUNT1/$tfile-1 && return 1
209 checkstat $MOUNT1/$tfile-2 || return 2
214 run_test 10 "resending a replayed unlink"
217 replay_barrier $SINGLEMDS
218 mcreate $MOUNT1/$tfile-1
219 mcreate $MOUNT2/$tfile-2
220 mcreate $MOUNT1/$tfile-3
221 mcreate $MOUNT2/$tfile-4
222 mcreate $MOUNT1/$tfile-5
223 # drop all reint replies for a while
224 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
225 # note that with this fail_loc set, facet_failover df will fail
226 facet_failover $SINGLEMDS
227 # sleep for a while to let both clients reconnect and time out
228 sleep $((TIMEOUT * 2))
229 do_facet $SINGLEMDS lctl set_param fail_loc=0
231 rm $MOUNT1/$tfile-[1-5] || return 1
235 run_test 11 "both clients timeout during replay"
238 replay_barrier $SINGLEMDS
240 multiop_bg_pause $DIR/$tfile mo_c || return 1
243 #define OBD_FAIL_LDLM_ENQUEUE 0x302
244 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
245 facet_failover $SINGLEMDS
246 do_facet $SINGLEMDS lctl set_param fail_loc=0
247 clients_up || return 1
250 kill -USR1 $MULTIPID || return 3
251 wait $MULTIPID || return 4
252 $CHECKSTAT -t file $DIR/$tfile || return 2
257 run_test 12 "open resend timeout"
260 multiop_bg_pause $DIR/$tfile mo_c || return 1
263 replay_barrier $SINGLEMDS
265 kill -USR1 $MULTIPID || return 3
266 wait $MULTIPID || return 4
269 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
270 facet_failover $SINGLEMDS
271 do_facet $SINGLEMDS lctl set_param fail_loc=0
272 clients_up || return 1
275 $CHECKSTAT -t file $DIR/$tfile || return 2
280 run_test 13 "close resend timeout"
282 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
287 wait_destroy_complete
288 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
289 mkdir -p $MOUNT1/$tdir
290 $SETSTRIPE -o 0 $MOUNT1/$tdir
291 replay_barrier $SINGLEMDS
292 createmany -o $MOUNT1/$tdir/$tfile- 5
294 $SETSTRIPE -o 0 $MOUNT2/f14b-3
295 echo "data" > $MOUNT2/f14b-3
296 createmany -o $MOUNT1/$tdir/$tfile-3- 5
300 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
302 # the files created via $MOUNT1 (two batches of 5) should have been replayed
303 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
304 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
306 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
308 wait_mds_ost_sync || return 4
309 wait_destroy_complete || return 5
311 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
312 log "before $BEFOREUSED, after $AFTERUSED"
313 [ $AFTERUSED -ne $BEFOREUSED ] && \
314 error "after $AFTERUSED > before $BEFOREUSED" && return 4
317 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
319 test_15a() { # was test_15
320 replay_barrier $SINGLEMDS
321 createmany -o $MOUNT1/$tfile- 25
322 createmany -o $MOUNT2/$tfile-2- 1
327 unlinkmany $MOUNT1/$tfile- 25 || return 2
328 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
330 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
333 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
336 replay_barrier $SINGLEMDS
337 for ((i = 0; i < 2000; i++)); do
338 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
344 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
347 run_test 15c "remove multiple OST orphans"
350 replay_barrier $SINGLEMDS
351 createmany -o $MOUNT1/$tfile- 25
352 createmany -o $MOUNT2/$tfile-2- 1
355 facet_failover $SINGLEMDS
359 unlinkmany $MOUNT1/$tfile- 25 || return 2
361 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
365 run_test 16 "fail MDS during recovery (3571)"
368 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
370 createmany -o $MOUNT1/$tfile- 25
371 createmany -o $MOUNT2/$tfile-2- 1
373 # Make sure the disconnect is lost
381 unlinkmany $MOUNT1/$tfile- 25 || return 2
383 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
387 run_test 17 "fail OST during recovery (3571)"
389 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
392 test_18() { # bug 3822 - evicting client with enqueued lock
394 mkdir -p $MOUNT1/$tdir
395 touch $MOUNT1/$tdir/f0
396 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
397 statmany -s $MOUNT1/$tdir/f 1 500 &
400 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
402 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
403 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
405 usleep 500 # wait to ensure first client is one that will be evicted
406 openfile -f O_RDONLY $MOUNT2/$tdir/f0
408 dmesg | grep "entering recovery in server" && \
409 error "client not evicted" || true
410 do_facet client "lctl set_param fail_loc=0"
411 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
413 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
415 test_19() { # Bug 10991 - resend of open request does not fail assertion.
416 replay_barrier $SINGLEMDS
417 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
419 checkstat $DIR2/${tfile}0 || return 2
420 rm $DIR/${tfile}0 || return 3
424 run_test 19 "resend of open request"
428 replay_barrier $SINGLEMDS
434 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
435 TIER1=$((`date +%s` - BEFORE))
437 replay_barrier $SINGLEMDS
443 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
444 TIER2=$((`date +%s` - BEFORE))
445 [ $TIER2 -ge $((TIER1 * 2)) ] && \
446 error "recovery time is growing $TIER2 > $TIER1"
449 run_test 20 "recovery time is not increasing"
451 # commit on sharing tests
453 local param_file=$TMP/$tfile-params
455 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
456 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
457 touch $MOUNT1/$tfile-1
458 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
459 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
460 replay_barrier_nosync $SINGLEMDS
463 facet_failover $SINGLEMDS
465 # all renames are replayed
466 unlink $MOUNT1/$tfile-3 || return 2
468 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
470 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
471 rm -rf $MOUNT1/$tfile-*
472 restore_lustre_params < $param_file
476 run_test 21a "commit on sharing"
480 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
483 do_node $CLIENT1 touch $MOUNT1/$tfile-1
484 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
485 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
487 replay_barrier_nosync $mds
488 shutdown_client $CLIENT2 $MOUNT1
492 # were renames replayed?
494 echo UNLINK $MOUNT1/$tfile-3
495 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
496 { echo "unlink $tfile-3 fail!" && rc=1; }
499 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
505 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
506 [ $CLIENTCOUNT -lt 2 ] && \
507 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
509 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
510 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
515 zconf_umount_clients $CLIENTS $MOUNT2
516 zconf_mount_clients $CLIENTS $MOUNT1
518 local param_file=$TMP/$tfile-params
520 local num=$(get_mds_dir $MOUNT1)
522 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
526 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
528 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
530 # COS disabled (should fail)
532 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
534 # there is still a window when transactions may be written to disk before
535 # the mds device is set R/O. To avoid such a rare test failure, the check
536 # is repeated several times.
539 test_21b_sub mds$num || break;
540 let n_attempts=n_attempts+1
541 [ $n_attempts -gt 3 ] &&
542 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
544 restore_lustre_params < $param_file
548 run_test 21b "commit on sharing, two clients"
550 # end commit on sharing tests
# Script teardown: report completion, optionally wait, undo the second mount
# and clean the filesystem up.

# Hand the script's basename and bash's SECONDS counter to complete().
552 complete $(basename $0) $SECONDS
# Seconds elapsed since $NOW; NOW is assigned in a part of the script that is
# not shown here.
553 SLEEP=$((`date +%s` - $NOW))
# Sleep only when that elapsed time is still below $TIMEOUT.
# NOTE(review): this sleeps for the elapsed time itself, not for the remaining
# TIMEOUT - SLEEP -- confirm that is the intended wait.
554 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount $MOUNT2 only when MOUNTED2 is "yes" (set outside the visible lines);
# "|| true" keeps this line's status at zero either way.
555 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
556 check_and_cleanup_lustre