# Suite preamble: pick the tests to exclude, apply defaults for settings the
# caller did not provide, source the test framework and the configuration
# file, then run check_and_setup_lustre.
#
# Test 15c is always listed, followed by whatever REPLAY_DUAL_EXCEPT holds.
# NOTE(review): presumably read by the test framework to skip tests - confirm.
6 ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT"
# The :- defaults below apply only when the variable is unset or empty.
9 PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE defaults to the parent of the directory that contains this script.
10 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
12 CLEANUP=${CLEANUP:-""}
13 MOUNT_2=${MOUNT_2:-"yes"}
14 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; the := form also stores the default path in
# CONFIG when CONFIG is unset or empty.
17 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, call skip and, if that succeeds as well,
# leave the script with exit status 0.
20 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# With SLOW=no, EXCEPT_SLOW is set to test 21b.
22 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
26 check_and_setup_lustre
# Keep the output of mounted_lustre_filesystems; it is matched against MOUNT2
# right after this block.
27 MOUNTED=$(mounted_lustre_filesystems)
28 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
29 zconf_mount $HOSTNAME $MOUNT2
34 rm -rf $DIR/[df][0-9]*
36 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
38 # LU-482 Avert LVM and VM inability to flush caches in kernels older than 2.6.33
39 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
41 do_facet $SINGLEMDS sync
45 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
46 replay_barrier $SINGLEMDS
47 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
49 createmany -o $MOUNT1/$tfile- 50
50 $LCTL set_param fail_loc=0x80000514
51 facet_failover $SINGLEMDS
55 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
56 unlinkmany $MOUNT1/$tfile- 50 || return 2
57 rm $MOUNT2/$tfile || return 3
58 rm $MOUNT2/$tfile-A || return 4
60 run_test 0a "expired recovery with lost client"
63 replay_barrier $SINGLEMDS
65 touch $MOUNT1/$tfile-2
67 facet_failover $SINGLEMDS
69 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
70 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
71 checkstat $MOUNT1/$tfile-2 && return 1
72 checkstat $MOUNT2/$tfile && return 2
75 run_test 0b "lost client during waiting for next transno"
79 replay_barrier $SINGLEMDS
83 checkstat $MOUNT2/a || return 1
84 checkstat $MOUNT1/b || return 2
85 rm $MOUNT2/a $MOUNT1/b
86 checkstat $MOUNT1/a && return 3
87 checkstat $MOUNT2/b && return 4
91 run_test 1 "|X| simple create"
95 replay_barrier $SINGLEMDS
99 checkstat $MOUNT2/adir || return 1
101 checkstat $MOUNT2/adir && return 2
104 run_test 2 "|X| mkdir adir"
107 replay_barrier $SINGLEMDS
109 mkdir $MOUNT2/adir/bdir
112 checkstat $MOUNT2/adir || return 1
113 checkstat $MOUNT1/adir/bdir || return 2
114 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
115 checkstat $MOUNT1/adir && return 3
116 checkstat $MOUNT2/adir/bdir && return 4
119 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
123 replay_barrier $SINGLEMDS
124 mkdir $MOUNT1/adir && return 1
125 mkdir $MOUNT2/adir/bdir
128 checkstat $MOUNT2/adir || return 2
129 checkstat $MOUNT1/adir/bdir || return 3
131 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
132 checkstat $MOUNT1/adir && return 4
133 checkstat $MOUNT2/adir/bdir && return 5
136 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
140 # multiclient version of replay_single.sh/test_8
142 multiop_bg_pause $MOUNT2/a o_tSc || return 1
145 replay_barrier $SINGLEMDS
147 wait $pid || return 1
150 [ -e $MOUNT2/a ] && return 2
153 run_test 5 "open, unlink |X| close"
158 multiop_bg_pause $MOUNT2/a o_c || return 1
160 multiop_bg_pause $MOUNT1/a o_c || return 1
163 replay_barrier $SINGLEMDS
165 wait $pid1 || return 1
169 wait $pid2 || return 1
170 [ -e $MOUNT2/a ] && return 2
173 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
176 replay_barrier $SINGLEMDS
177 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
179 checkstat $MOUNT2/$tfile || return 2
180 rm $MOUNT1/$tfile || return 3
184 run_test 8 "replay of resent request"
187 replay_barrier $SINGLEMDS
188 mcreate $MOUNT1/$tfile-1
189 mcreate $MOUNT2/$tfile-2
190 # drop first reint reply
191 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
193 do_facet $SINGLEMDS lctl set_param fail_loc=0
195 rm $MOUNT1/$tfile-[1,2] || return 1
199 run_test 9 "resending a replayed create"
202 mcreate $MOUNT1/$tfile-1
203 replay_barrier $SINGLEMDS
204 munlink $MOUNT1/$tfile-1
205 mcreate $MOUNT2/$tfile-2
206 # drop first reint reply
207 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
209 do_facet $SINGLEMDS lctl set_param fail_loc=0
211 checkstat $MOUNT1/$tfile-1 && return 1
212 checkstat $MOUNT1/$tfile-2 || return 2
217 run_test 10 "resending a replayed unlink"
220 replay_barrier $SINGLEMDS
221 mcreate $MOUNT1/$tfile-1
222 mcreate $MOUNT2/$tfile-2
223 mcreate $MOUNT1/$tfile-3
224 mcreate $MOUNT2/$tfile-4
225 mcreate $MOUNT1/$tfile-5
226 # drop all reint replies for a while
227 do_facet $SINGLEMDS lctl set_param fail_loc=0x0119
228 # note that with this fail_loc set, facet_failover df will fail
229 facet_failover $SINGLEMDS
230 # sleep for a while to let both clients reconnect and time out
231 sleep $((TIMEOUT * 2))
232 do_facet $SINGLEMDS lctl set_param fail_loc=0
234 rm $MOUNT1/$tfile-[1-5] || return 1
238 run_test 11 "both clients timeout during replay"
241 replay_barrier $SINGLEMDS
243 multiop_bg_pause $DIR/$tfile mo_c || return 1
246 #define OBD_FAIL_LDLM_ENQUEUE 0x302
247 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
248 facet_failover $SINGLEMDS
249 do_facet $SINGLEMDS lctl set_param fail_loc=0
250 clients_up || return 1
253 kill -USR1 $MULTIPID || return 3
254 wait $MULTIPID || return 4
255 $CHECKSTAT -t file $DIR/$tfile || return 2
260 run_test 12 "open resend timeout"
263 multiop_bg_pause $DIR/$tfile mo_c || return 1
266 replay_barrier $SINGLEMDS
268 kill -USR1 $MULTIPID || return 3
269 wait $MULTIPID || return 4
272 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
273 facet_failover $SINGLEMDS
274 do_facet $SINGLEMDS lctl set_param fail_loc=0
275 clients_up || return 1
278 $CHECKSTAT -t file $DIR/$tfile || return 2
283 run_test 13 "close resend timeout"
285 # test 14a removed after 18143 because it shouldn't fail anymore and does the same
290 wait_destroy_complete
291 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
292 mkdir -p $MOUNT1/$tdir
293 $SETSTRIPE -i 0 $MOUNT1/$tdir
294 replay_barrier $SINGLEMDS
295 createmany -o $MOUNT1/$tdir/$tfile- 5
297 $SETSTRIPE -i 0 $MOUNT2/f14b-3
298 echo "data" > $MOUNT2/f14b-3
299 createmany -o $MOUNT1/$tdir/$tfile-3- 5
303 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
305 # both sets of 5 files created through $MOUNT1 should have been replayed
306 unlinkmany $MOUNT1/$tdir/$tfile- 5 || return 2
307 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || return 3
309 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
311 wait_mds_ost_sync || return 4
312 wait_destroy_complete || return 5
314 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
315 log "before $BEFOREUSED, after $AFTERUSED"
316 [ $AFTERUSED -ne $BEFOREUSED ] && \
317 error "after $AFTERUSED > before $BEFOREUSED" && return 4
320 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
# test_15a: registered below as the lost-client-during-replay case.
# Visible steps: after a replay barrier on SINGLEMDS, createmany is invoked
# for 25 names under MOUNT1 and for 1 name under MOUNT2. Later the 25 names
# must unlink cleanly (return 2 otherwise), the MOUNT2-created name must not
# be visible through MOUNT1, and MOUNT2 is mounted again.
# NOTE(review): the statements between the creates and the unlink are not
# visible here - presumably MOUNT2 is unmounted and the MDS is failed over
# at that point; confirm against the full file.
322 test_15a() { # was test_15
323 replay_barrier $SINGLEMDS
324 createmany -o $MOUNT1/$tfile- 25
325 createmany -o $MOUNT2/$tfile-2- 1
330 unlinkmany $MOUNT1/$tfile- 25 || return 2
# The file created through MOUNT2 is expected to be absent at this point.
331 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
333 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
336 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
339 replay_barrier $SINGLEMDS
340 for ((i = 0; i < 2000; i++)); do
341 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
347 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
350 run_test 15c "remove multiple OST orphans"
353 replay_barrier $SINGLEMDS
354 createmany -o $MOUNT1/$tfile- 25
355 createmany -o $MOUNT2/$tfile-2- 1
358 facet_failover $SINGLEMDS
362 unlinkmany $MOUNT1/$tfile- 25 || return 2
364 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
368 run_test 16 "fail MDS during recovery (3571)"
371 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
373 createmany -o $MOUNT1/$tfile- 25
374 createmany -o $MOUNT2/$tfile-2- 1
376 # Make sure the disconnect is lost
384 unlinkmany $MOUNT1/$tfile- 25 || return 2
386 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
390 run_test 17 "fail OST during recovery (3571)"
392 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
# test_18: eviction of a client that has an enqueued lock (bug 3822).
# Visible steps: create a directory and file f0 through MOUNT1, start
# statmany in the background, set a fail_loc on the MDS and another on the
# client, open f0 through MOUNT2, check the kernel log, then clear both
# fail_loc settings.
# NOTE(review): some statements of this function are not visible in this
# excerpt (gaps in the embedded numbering); confirm against the full file.
395 test_18() { # bug 3822 - evicting client with enqueued lock
397 mkdir -p $MOUNT1/$tdir
398 touch $MOUNT1/$tdir/f0
399 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
# Runs in the background so the steps below proceed while it is active.
400 statmany -s $MOUNT1/$tdir/f 1 500 &
403 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
405 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305
406 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
408 usleep 500 # wait to ensure first client is one that will be evicted
409 openfile -f O_RDONLY $MOUNT2/$tdir/f0
# Call error when the kernel log contains the recovery message. The trailing
# || true makes the compound command yield status 0 in every other case.
411 dmesg | grep "entering recovery in server" && \
412 error "client not evicted" || true
# Reset fail_loc on both the client and the MDS.
413 do_facet client "lctl set_param fail_loc=0"
414 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
416 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
# test_19: resend of an open request (bug 10991).
# Visible steps: after a replay barrier on SINGLEMDS, one file is created
# under DIR by passing a createmany command to drop_ldlm_reply; the file
# (tfile followed by 0) must then be visible through DIR2 and removable
# through DIR.
# Returns 1, 2 or 3 to identify which of those three steps failed.
# NOTE(review): drop_ldlm_reply presumably drops the LDLM reply so the request
# is resent, and a failover step appears to be missing from this excerpt
# between the create and the checkstat - confirm against the full file.
418 test_19() { # Bug 10991 - resend of open request does not fail assertion.
419 replay_barrier $SINGLEMDS
420 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
422 checkstat $DIR2/${tfile}0 || return 2
423 rm $DIR/${tfile}0 || return 3
427 run_test 19 "resend of open request"
431 replay_barrier $SINGLEMDS
437 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
438 TIER1=$((`date +%s` - BEFORE))
440 replay_barrier $SINGLEMDS
446 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
447 TIER2=$((`date +%s` - BEFORE))
448 [ $TIER2 -ge $((TIER1 * 2)) ] && \
449 error "recovery time is growing $TIER2 > $TIER1"
452 run_test 20 "recovery time is not increasing"
454 # commit on sharing tests
456 local param_file=$TMP/$tfile-params
458 save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
459 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
460 touch $MOUNT1/$tfile-1
461 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
462 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
463 replay_barrier_nosync $SINGLEMDS
466 facet_failover $SINGLEMDS
468 # all renames are replayed
469 unlink $MOUNT1/$tfile-3 || return 2
471 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
473 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
474 rm -rf $MOUNT1/$tfile-*
475 restore_lustre_params < $param_file
479 run_test 21a "commit on sharing"
483 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
486 do_node $CLIENT1 touch $MOUNT1/$tfile-1
487 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
488 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
490 replay_barrier_nosync $mds
491 shutdown_client $CLIENT2 $MOUNT1
495 # were renames replayed?
497 echo UNLINK $MOUNT1/$tfile-3
498 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \
499 { echo "unlink $tfile-3 fail!" && rc=1; }
502 zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail"
508 [ -z "$CLIENTS" ] && skip "Need two or more clients." && return
509 [ $CLIENTCOUNT -lt 2 ] && \
510 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
512 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
513 skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. "
518 zconf_umount_clients $CLIENTS $MOUNT2
519 zconf_mount_clients $CLIENTS $MOUNT1
521 local param_file=$TMP/$tfile-params
523 local num=$(get_mds_dir $MOUNT1)
525 save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file
529 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
531 test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS"
533 # COS disabled (should fail)
535 do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS
537 # there is still a window when transactions may be written to disk before
538 # the mds device is set R/O. To avoid such a rare test failure, the check
539 # is repeated several times.
542 test_21b_sub mds$num || break;
543 let n_attempts=n_attempts+1
544 [ $n_attempts -gt 3 ] &&
545 error "The test cannot check whether COS works or not: all renames are replied w/o COS"
547 restore_lustre_params < $param_file
551 run_test 21b "commit on sharing, two clients"
553 # end commit on sharing tests
# Suite epilogue: report completion, optionally wait, unmount MOUNT2 when
# MOUNTED2 is yes, then run check_and_cleanup_lustre.
#
# Pass the script name and the shell SECONDS counter to complete.
555 complete $(basename $0) $SECONDS
# Seconds elapsed since NOW.
# NOTE(review): NOW is not assigned anywhere in the visible part of this
# file - confirm where it is set.
556 SLEEP=$((`date +%s` - $NOW))
# When fewer than TIMEOUT seconds have elapsed, sleep.
# NOTE(review): this sleeps for the elapsed time (SLEEP), not for the time
# remaining until TIMEOUT - confirm that this is intended.
557 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# The trailing || true makes this line yield status 0 whether or not the
# unmount ran or succeeded.
558 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
559 check_and_cleanup_lustre