# Suite setup: establish defaults, load the shared test framework, and bail
# out early when the MDS cannot be driven remotely.
# Default PTLDEBUG to -1 when it is unset or empty.
5 PTLDEBUG=${PTLDEBUG:--1}
# Default MOUNT_2 to "yes" when it is unset or empty.
6 MOUNT_2=${MOUNT_2:-"yes"}
# Default LUSTRE to the parent of the directory that contains this script.
8 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
# Source the shared test framework from $LUSTRE/tests into this shell.
9 . $LUSTRE/tests/test-framework.sh
# When remote_mds_nodsh succeeds, report a skip and (if skip itself returns
# success) leave the script with status 0.
13 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# Build the lists of excluded test ids, starting from the caller-supplied
# REPLAY_DUAL_EXCEPT value.
15 ALWAYS_EXCEPT="$REPLAY_DUAL_EXCEPT "
16 # bug number for skipped test: LU-2012 LU-8333
17 ALWAYS_EXCEPT+=" 14b 21b"
18 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# When mds1_FSTYPE is zfs, 21b is appended once more under a different bug id.
# NOTE(review): 21b is already appended unconditionally above, so as far as
# this view shows the zfs branch only duplicates the entry -- confirm intent.
20 [[ "$mds1_FSTYPE" == zfs ]] &&
21 # bug number for skipped test: LU-2230
22 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 21b"
# When SLOW is "no", 21b is also placed in EXCEPT_SLOW.
25 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b "
28 check_and_setup_lustre
30 MOUNTED=$(mounted_lustre_filesystems)
31 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
32 zconf_mount $HOSTNAME $MOUNT2
37 rm -rf $DIR/[df][0-9]*
39 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
41 # if there is no CLIENT1 defined, some tests can be run on localhost
42 CLIENT1=${CLIENT1:-$HOSTNAME}
43 # if CLIENT2 doesn't exist then use CLIENT1 instead
44 # All tests should use CLIENT2 with MOUNT2 only therefore it will work if
46 # The exception is any test which needs two separate nodes
47 CLIENT2=${CLIENT2:-$CLIENT1}
49 # LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
50 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
52 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
55 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
57 echo "Check file is LU482_FAILED=$LU482_FAILED"
58 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
59 replay_barrier $SINGLEMDS
60 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
62 createmany -o $MOUNT1/$tfile- 50
63 $LCTL set_param fail_loc=0x80000514
64 facet_failover $SINGLEMDS
65 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
69 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
70 unlinkmany $MOUNT1/$tfile- 50 || return 2
71 rm $MOUNT2/$tfile || return 3
72 rm $MOUNT2/$tfile-A || return 4
74 run_test 0a "expired recovery with lost client"
76 if [ -f "$LU482_FAILED" ]; then
77 log "Found check file $LU482_FAILED, aborting test script"
78 rm -vf "$LU482_FAILED"
80 do_nodes $CLIENTS umount -f $MOUNT2 || true
81 do_nodes $CLIENTS umount -f $MOUNT || true
82 # copied from stopall, but avoid the MDS recovery
83 for num in `seq $OSTCOUNT`; do
85 rm -f $TMP/ost${num}active
87 if ! combined_mgs_mds ; then
95 replay_barrier $SINGLEMDS
97 touch $MOUNT1/$tfile-2
99 facet_failover $SINGLEMDS
101 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
102 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
103 # it is uncertain if file-2 exists or not, remove it if it does
104 checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2
105 checkstat $MOUNT2/$tfile && return 2
108 run_test 0b "lost client during waiting for next transno"
112 replay_barrier $SINGLEMDS
116 checkstat $MOUNT2/a || return 1
117 checkstat $MOUNT1/b || return 2
118 rm $MOUNT2/a $MOUNT1/b
119 checkstat $MOUNT1/a && return 3
120 checkstat $MOUNT2/b && return 4
124 run_test 1 "|X| simple create"
128 replay_barrier $SINGLEMDS
132 checkstat $MOUNT2/adir || return 1
134 checkstat $MOUNT2/adir && return 2
137 run_test 2 "|X| mkdir adir"
140 replay_barrier $SINGLEMDS
142 mkdir $MOUNT2/adir/bdir
145 checkstat $MOUNT2/adir || return 1
146 checkstat $MOUNT1/adir/bdir || return 2
147 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
148 checkstat $MOUNT1/adir && return 3
149 checkstat $MOUNT2/adir/bdir && return 4
152 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
156 replay_barrier $SINGLEMDS
157 mkdir $MOUNT1/adir && return 1
158 mkdir $MOUNT2/adir/bdir
161 checkstat $MOUNT2/adir || return 2
162 checkstat $MOUNT1/adir/bdir || return 3
164 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
165 checkstat $MOUNT1/adir && return 4
166 checkstat $MOUNT2/adir/bdir && return 5
169 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
173 # multiclient version of replay_single.sh/test_8
175 multiop_bg_pause $MOUNT2/a o_tSc || return 1
178 replay_barrier $SINGLEMDS
180 wait $pid || return 1
183 [ -e $MOUNT2/a ] && return 2
186 run_test 5 "open, unlink |X| close"
191 multiop_bg_pause $MOUNT2/a o_c || return 1
193 multiop_bg_pause $MOUNT1/a o_c || return 1
196 replay_barrier $SINGLEMDS
198 wait $pid1 || return 1
202 wait $pid2 || return 1
203 [ -e $MOUNT2/a ] && return 2
206 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
209 replay_barrier $SINGLEMDS
210 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
212 checkstat $MOUNT2/$tfile || return 2
213 rm $MOUNT1/$tfile || return 3
217 run_test 8 "replay of resent request"
220 replay_barrier $SINGLEMDS
221 mcreate $MOUNT1/$tfile-1
222 mcreate $MOUNT2/$tfile-2
223 # drop first reint reply
224 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
226 do_facet $SINGLEMDS lctl set_param fail_loc=0
228 rm $MOUNT1/$tfile-[1,2] || return 1
232 run_test 9 "resending a replayed create"
235 mcreate $MOUNT1/$tfile-1
236 replay_barrier $SINGLEMDS
237 munlink $MOUNT1/$tfile-1
238 mcreate $MOUNT2/$tfile-2
239 # drop first reint reply
240 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
242 do_facet $SINGLEMDS lctl set_param fail_loc=0
244 checkstat $MOUNT1/$tfile-1 && return 1
245 checkstat $MOUNT1/$tfile-2 || return 2
250 run_test 10 "resending a replayed unlink"
253 replay_barrier $SINGLEMDS
254 mcreate $DIR1/$tfile-1
255 mcreate $DIR2/$tfile-2
256 mcreate $DIR1/$tfile-3
257 mcreate $DIR2/$tfile-4
258 mcreate $DIR1/$tfile-5
259 # drop all reint replies for a while
260 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
261 # note that with this fail_loc set, facet_failover df will fail
262 facet_failover $SINGLEMDS
264 local clients=${CLIENTS:-$HOSTNAME}
265 wait_clients_import_state "$clients" $SINGLEMDS FULL
267 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
269 rm $DIR1/$tfile-[1-5] || return 1
273 run_test 11 "both clients timeout during replay"
276 replay_barrier $SINGLEMDS
278 multiop_bg_pause $DIR/$tfile mo_c || return 1
281 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
282 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
283 facet_failover $SINGLEMDS
284 do_facet $SINGLEMDS lctl set_param fail_loc=0
285 clients_up || return 1
288 kill -USR1 $MULTIPID || return 3
289 wait $MULTIPID || return 4
290 $CHECKSTAT -t file $DIR/$tfile || return 2
295 run_test 12 "open resend timeout"
298 multiop_bg_pause $DIR/$tfile mo_c || return 1
301 replay_barrier $SINGLEMDS
303 kill -USR1 $MULTIPID || return 3
304 wait $MULTIPID || return 4
307 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
308 facet_failover $SINGLEMDS
309 do_facet $SINGLEMDS lctl set_param fail_loc=0
310 clients_up || return 1
313 $CHECKSTAT -t file $DIR/$tfile || return 2
318 run_test 13 "close resend timeout"
320 # test 14a removed after 18143 because it shouldn't fail anymore and does the same
325 wait_delete_completed
327 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
329 mkdir -p $MOUNT1/$tdir
330 $LFS setstripe -i 0 $MOUNT1/$tdir
331 replay_barrier $SINGLEMDS
332 createmany -o $MOUNT1/$tdir/$tfile- 5
334 $LFS setstripe -i 0 $MOUNT2/$tfile-2
335 dd if=/dev/zero of=$MOUNT2/$tfile-2 bs=1M count=5
336 createmany -o $MOUNT1/$tdir/$tfile-3- 5
340 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
342 # first set of files should have been replayed
343 unlinkmany $MOUNT1/$tdir/$tfile- 5 || error "first unlinks failed"
344 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || error "second unlinks failed"
346 zconf_mount $HOSTNAME $MOUNT2 || error "mount $MOUNT2 failed"
347 [ -f $MOUNT2/$tfile-2 ] && error "$MOUNT2/$tfile-2 exists!"
349 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
350 wait_delete_completed || error "wait_delete_complete failed"
352 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
353 log "before $beforeused, after $afterused"
354 # leave some margin for some files/dirs to be modified (OI, llog, etc)
355 [ $afterused -le $((beforeused + $(fs_log_size))) ] ||
356 error "after $afterused > before $beforeused"
358 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
360 test_15a() { # was test_15
361 replay_barrier $SINGLEMDS
362 createmany -o $MOUNT1/$tfile- 25
363 createmany -o $MOUNT2/$tfile-2- 1
368 unlinkmany $MOUNT1/$tfile- 25 || return 2
369 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
371 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
374 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
377 replay_barrier $SINGLEMDS
378 for ((i = 0; i < 2000; i++)); do
379 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
385 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
388 run_test 15c "remove multiple OST orphans"
391 replay_barrier $SINGLEMDS
392 createmany -o $MOUNT1/$tfile- 25
393 createmany -o $MOUNT2/$tfile-2- 1
396 facet_failover $SINGLEMDS
400 unlinkmany $MOUNT1/$tfile- 25 || return 2
402 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
406 run_test 16 "fail MDS during recovery (3571)"
409 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
411 createmany -o $MOUNT1/$tfile- 25
412 createmany -o $MOUNT2/$tfile-2- 1
414 # Make sure the disconnect is lost
422 unlinkmany $MOUNT1/$tfile- 25 || return 2
424 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
428 run_test 17 "fail OST during recovery (3571)"
430 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
433 test_18() { # bug 3822 - evicting client with enqueued lock
435 local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
436 do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
437 mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
438 touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
439 statmany -s $MOUNT1/$tdir/$tfile 1 500 &
442 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
443 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
445 #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
446 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
447 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
449 sleep 0.1 # wait to ensure first client is one that will be evicted
450 openfile -f O_RDONLY $MOUNT2/$tdir/$tfile
452 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
453 do_facet $SINGLEMDS lctl debug_kernel |
454 grep "not entering recovery" && error "client not evicted"
455 do_facet client "lctl set_param fail_loc=0"
456 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
458 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
460 test_19() { # Bug 10991 - resend of open request does not fail assertion.
461 replay_barrier $SINGLEMDS
462 drop_mdt_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
464 checkstat $DIR2/${tfile}0 || return 2
465 rm $DIR/${tfile}0 || return 3
469 run_test 19 "resend of open request"
472 local before=$SECONDS
473 replay_barrier $SINGLEMDS
479 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
480 local tier1=$((SECONDS - before))
483 replay_barrier $SINGLEMDS
489 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
490 local tier2=$((SECONDS - before))
492 # timeout is more than 1.5x original timeout
493 ((tier2 < tier1 * 6 / 4)) ||
494 error "recovery time $tier2 >= 1.5x original time $tier1"
496 run_test 20 "recovery time is not increasing"
498 # commit on sharing tests
500 local param_file=$TMP/$tfile-params
502 save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file
503 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
504 touch $MOUNT1/$tfile-1
505 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
506 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
507 replay_barrier_nosync $SINGLEMDS
510 facet_failover $SINGLEMDS
512 # all renames are replayed
513 unlink $MOUNT1/$tfile-3 || return 2
515 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
517 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
518 rm -rf $MOUNT1/$tfile-*
519 restore_lustre_params < $param_file
523 run_test 21a "commit on sharing"
527 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
530 do_node $CLIENT1 touch $MOUNT1/$tfile-1
531 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
532 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
534 replay_barrier_nosync $mds
535 shutdown_client $CLIENT2 $MOUNT1
539 # were renames replayed?
541 echo UNLINK $MOUNT1/$tfile-3
542 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 ||
543 { echo "unlink $tfile-3 fail!" && rc=1; }
546 zconf_mount_clients $CLIENT2 $MOUNT1 ||
547 error "mount $CLIENT2 $MOUNT1 fail"
553 [ $CLIENTCOUNT -lt 2 ] &&
554 { skip "Need 2+ clients, have $CLIENTCOUNT" && return; }
556 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
557 skip "Several MDTs on one MDS with FAILURE_MODE=$FAILURE_MODE"
561 zconf_umount_clients $CLIENTS $MOUNT2
562 zconf_mount_clients $CLIENTS $MOUNT1
564 local param_file=$TMP/$tfile-params
566 local mdtidx=$($LFS getstripe -m $MOUNT1)
567 local facet=mds$((mdtidx + 1))
569 save_lustre_params $facet "mdt.*.commit_on_sharing" > $param_file
573 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
575 test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
577 # there is still a window when transactions may be written to disk
578 # before the mds device is set R/O. To avoid such a rare test failure,
579 # the check is repeated several times.
583 # COS disabled (should fail)
584 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
586 test_21b_sub $facet || break
587 n_attempts=$((n_attempts + 1))
588 [ $n_attempts -gt 3 ] &&
589 error "can't check if COS works: rename replied w/o COS"
591 zconf_mount_clients $CLIENTS $MOUNT2
592 restore_lustre_params < $param_file
596 run_test 21b "commit on sharing, two clients"
599 checkstat $MOUNT1/$remote_dir || return 1
600 checkstat $MOUNT1/$remote_dir/dir || return 2
601 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 3
602 checkstat $MOUNT1/$remote_dir/dir/$tfile-1 || return 4
606 create_remote_dir_files_22() {
607 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir/dir || return 1
608 do_node $CLIENT1 createmany -o $MOUNT1/$remote_dir/dir/$tfile- 2 ||
610 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 ||
616 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
617 ([ $FAILURE_MODE == "HARD" ] &&
618 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
619 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
623 local remote_dir=$tdir/remote_dir
625 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
626 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
628 # OBD_FAIL_MDS_REINT_NET_REP 0x119
629 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
630 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
634 fail mds$((MDTIDX + 1))
635 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
637 replay_barrier mds$MDTIDX
638 create_remote_dir_files_22 || error "Remote creation failed $?"
641 checkstat_22 || error "check stat failed $?"
643 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
646 run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir"
649 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
651 local remote_dir=$tdir/remote_dir
653 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
654 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
656 # OBD_FAIL_MDS_REINT_NET_REP 0x119
657 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
658 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
662 fail mds${MDTIDX},mds$((MDTIDX + 1))
663 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
665 replay_barrier mds$MDTIDX
666 create_remote_dir_files_22 || error "Remote creation failed $?"
669 checkstat_22 || error "check stat failed $?"
671 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
674 run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir"
677 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
678 ([ $FAILURE_MODE == "HARD" ] &&
679 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
680 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
683 local remote_dir=$tdir/remote_dir
685 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
686 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
688 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
689 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
690 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
693 do_facet mds$MDTIDX lctl set_param fail_loc=0
696 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
698 replay_barrier mds$MDTIDX
699 create_remote_dir_files_22 || error "Remote creation failed $?"
702 checkstat_22 || error "check stat failed $?"
704 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
707 run_test 22c "c1 lfs mkdir -i 1 d1, M1 drop update & fail M1, c2 mkdir d1/dir"
710 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
712 local remote_dir=$tdir/remote_dir
714 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
715 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
717 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
718 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
719 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
722 do_facet mds$MDTIDX lctl set_param fail_loc=0
724 fail mds${MDTIDX},mds$((MDTIDX + 1))
725 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
727 replay_barrier mds$MDTIDX
728 create_remote_dir_files_22 || error "Remote creation failed $?"
731 checkstat_22 || error "check stat failed $?"
733 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
736 run_test 22d "c1 lfs mkdir -i 1 d1, M1 drop update & fail M0/M1,c2 mkdir d1/dir"
739 checkstat $MOUNT1/$remote_dir || return 1
740 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 2
744 create_remote_dir_files_23() {
745 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir || return 1
746 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 || return 2
751 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
752 ([ $FAILURE_MODE == "HARD" ] &&
753 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
754 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
757 local remote_dir=$tdir/remote_dir
759 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
760 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
761 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
762 error "lfs mkdir -i $MDTIDX failed"
763 # OBD_FAIL_MDS_REINT_NET_REP 0x119
764 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
765 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
768 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
770 fail mds$((MDTIDX + 1))
771 wait $CLIENT_PID || error "rmdir remote dir failed"
773 replay_barrier mds${MDTIDX}
774 create_remote_dir_files_23 || error "Remote creation failed $?"
777 checkstat_23 || error "check stat failed $?"
779 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
782 run_test 23a "c1 rmdir d1, M1 drop reply and fail, client2 mkdir d1"
785 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
787 local remote_dir=$tdir/remote_dir
789 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
790 error "lfs mkdir -i 0 $MOUNT/$tdir failed"
791 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
792 error "lfs mkdir -i $MDTIDX failed"
794 # OBD_FAIL_MDS_REINT_NET_REP 0x119
795 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
796 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
799 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
801 fail mds${MDTIDX},mds$((MDTIDX + 1))
802 wait $CLIENT_PID || error "rmdir remote dir failed"
804 replay_barrier mds${MDTIDX}
805 create_remote_dir_files_23 || error "Remote creation failed $?"
808 checkstat_23 || error "check stat failed $?"
810 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
813 run_test 23b "c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"
816 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
818 ([ $FAILURE_MODE == "HARD" ] &&
819 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
820 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
823 local remote_dir=$tdir/remote_dir
825 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
826 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
827 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
828 error "lfs mkdir -i $MDTIDX failed"
830 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
831 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
832 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
835 do_facet mds${MDTIDX} lctl set_param fail_loc=0
838 wait $CLIENT_PID || error "rmdir remote dir failed"
840 replay_barrier mds${MDTIDX}
841 create_remote_dir_files_23 || error "Remote creation failed $?"
844 checkstat_23 || error "check stat failed $?"
846 rm -rf $MOUNT1/$tdir || return 6
849 run_test 23c "c1 rmdir d1, M0 drop update reply and fail M0, c2 mkdir d1"
852 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
854 local remote_dir=$tdir/remote_dir
856 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
857 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
858 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
859 error "lfs mkdir -i $MDTIDX failed"
861 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
862 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
863 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
866 do_facet mds${MDTIDX} lctl set_param fail_loc=0
868 fail mds${MDTIDX},mds$((MDTIDX + 1))
869 wait $CLIENT_PID || error "rmdir remote dir failed"
871 replay_barrier mds${MDTIDX}
872 create_remote_dir_files_23 || error "Remote creation failed $?"
875 checkstat_23 || error "check stat failed $?"
877 rm -rf $MOUNT1/$tdir || return 6
880 run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
883 [[ "$MDS1_VERSION" -gt $(version_code 2.5.2) ]] ||
884 skip "Need MDS version newer than 2.5.2"
887 stat $MOUNT/$tfile >&/dev/null
888 # OBD_FAIL_MDS_REINT_NET_REP
889 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x119
890 $TRUNCATE $MOUNT/$tfile 100 &
893 do_facet $SINGLEMDS lctl set_param fail_loc=0
894 # sync to release rep-ack lock quickly
895 do_nodes $(comma_list $(mdts_nodes)) \
896 "lctl set_param -n osd*.*MDT*.force_sync 1"
900 run_test 24 "reconstruct on non-existing object"
902 # end commit on sharing tests
907 $LFS setstripe -i 0 -c 1 $DIR/$tfile
909 # get lock for the 1st client
910 dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
911 error "failed to write data"
913 # get waiting locks for the 2nd client
914 drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
917 # failover, replay and resend replayed waiting locks
918 if [ "$OST1_VERSION" -ge $(version_code 2.6.90) ]; then
919 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
920 do_facet ost1 lctl set_param fail_loc=0x80000325
922 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
923 do_facet ost1 lctl set_param fail_loc=0x80000213
928 # multiop does not finish because CP AST is skipped;
929 # it is ok to kill it in the test, because CP AST is already re-sent
930 # and it does not hang forever in real life
934 run_test 25 "replay|resend"
939 kill -9 $dbench_26_pid
944 local clients=${CLIENTS:-$HOSTNAME}
946 zconf_mount_clients $clients $MOUNT
949 [ "$SLOW" = "no" ] && duration=200
950 # set duration to 900 because it takes some time to boot node
951 [ "$FAILURE_MODE" = HARD ] && duration=900
953 local start_ts=$SECONDS
958 local tar_dir=$DIR/$tdir/run_tar
960 test_mkdir -p -c$MDSCOUNT $tar_dir || break
961 if [ $MDSCOUNT -ge 2 ]; then
962 $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
963 error "set default dirstripe failed"
966 tar cf - /etc | tar xf - || error "tar failed"
967 cd $DIR/$tdir || break
968 rm -rf $tar_dir || break
972 echo "Started tar $tar_26_pid"
975 local dbench_dir=$DIR2/$tdir/run_dbench
977 test_mkdir -p -c$MDSCOUNT $dbench_dir || break
978 if [ $MDSCOUNT -ge 2 ]; then
979 $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
980 error "set default dirstripe failed"
982 cd $dbench_dir || break
983 rundbench 1 -D $dbench_dir -t 100 &>/dev/null || break
984 cd $DIR/$tdir || break
985 rm -rf $dbench_dir || break
989 echo "Started dbench $dbench_26_pid"
991 local num_failovers=0
993 while [ $((SECONDS - start_ts)) -lt $duration ]; do
994 kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
995 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
997 replay_barrier mds$fail_index
998 sleep 2 # give clients a time to do operations
999 # Increment the number of failovers
1000 num_failovers=$((num_failovers + 1))
1001 log "$TESTNAME fail mds$fail_index $num_failovers times"
1003 if [ $fail_index -ge $MDSCOUNT ]; then
1006 fail_index=$((fail_index + 1))
1009 # stop the client loads
1010 kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
1011 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
1014 run_test 26 "dbench and tar with mds failover"
1017 $LFS setstripe -i 0 -c 1 $DIR2/$tfile
1018 dd if=/dev/zero of=$DIR2/$tfile bs=4096 count=1
1020 #define OBD_FAIL_LDLM_SRV_BL_AST 0x324
1021 do_facet ost1 $LCTL set_param fail_loc=0x80000324
1023 dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 &
1027 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
1028 do_facet ost1 $LCTL set_param fail_loc=0x32a
1033 cancel_lru_locks OST0000-osc
1034 wait $pid || error "dd failed"
1036 run_test 28 "lock replay should be ordered: waiting after granted"
1039 local dir0=$DIR/$tdir/d0
1040 local dir1=$DIR/$tdir/d1
1042 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
1043 [ $CLIENTCOUNT -lt 2 ] && skip "needs >= 2 clients" && return 0
1044 [ "$CLIENT1" == "$CLIENT2" ] &&
1045 skip "clients must be on different nodes" && return 0
1048 $LFS mkdir -i0 $dir0
1049 $LFS mkdir -i1 $dir1
1053 # create a remote dir, drop reply
1054 #define OBD_FAIL_PTLRPC_ROUND_XID 0x530
1055 $LCTL set_param fail_loc=0x530 fail_val=36
1056 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
1057 do_facet mds2 $LCTL set_param fail_loc=0x8000015a
1058 echo make remote dir d0 for $dir0
1059 $LFS mkdir -i1 -c1 $dir0/d3 &
1062 echo make local dir d1 for $dir1
1063 do_node $CLIENT2 $LCTL set_param fail_loc=0x530 fail_val=36
1064 do_node $CLIENT2 mkdir $dir1/d4
1068 run_test 29 "replay vs update with the same xid"
1071 $LFS setstripe -E 1m -L mdt -E -1 $DIR/$tfile
1072 #first write to have no problems with grants
1073 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 ||
1074 error "dd on client failed"
1075 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 seek=10 ||
1076 error "dd on client failed"
1078 #define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e
1079 lctl set_param fail_loc=0x32e fail_val=4
1080 dd of=/dev/null if=$DIR2/$tfile &
1086 wait $pid || error "dd on client failed"
1088 run_test 30 "layout lock replay is not blocked on IO"
1091 mkdir_on_mdt0 $DIR1/$tdir
1092 $LFS setstripe -c 1 -i 0 $DIR1/$tdir
1093 for (( i=0; i < 10; i++ )) ; do
1094 mkdir -p $DIR1/$tdir/d.${i}
1096 mkdir $DIR1/$tdir/mdtdir
1097 $LFS setstripe -E 1M -L mdt $DIR1/$tdir/mdtdir
1099 # failover has to take longer than blocking timeout involved
1100 # by second multiop below which is set to obd_timeout/2 by
1102 local timeout=$(do_facet mds1 $LCTL get_param -n timeout)
1104 timeout=$((timeout / 2 + 5))
1105 fail ost1 $timeout &
1110 # consume preallocated objects, precreate thread will be awakened
1111 consume_precreations $DIR1/$tdir mds1 0 1
1113 # disable AT so that blocking timeout gets set to obd_timeout/2
1114 local amm=$(at_max_get mds1)
1117 stack_trap "at_max_set $amm mds1"
1121 #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420
1122 $LCTL set_param fail_loc=0x80001420
1123 $MULTIOP $DIR1/$tdir/mdtdir/$tfile Osw4096c &
1126 $MULTIOP $DIR2/$tdir/mdtdir/$tfile oO_WRONLY:w4096c &
1129 local mmrif=$($LCTL get_param -n \
1130 mdc.$FSNAME-MDT0000-mdc-*.max_mod_rpcs_in_flight | tail -1)
1131 # these are blocked by precreation until ost failover is in progress
1132 for (( i=0; i < $mmrif; i++ )) ; do
1133 $MULTIOP $DIR1/$tdir/d.${i}/parallel Oc &
1139 for pid in ${multiops[@]}; do
1140 wait $pid || ((failed++))
1142 ((failed == 0)) || error "$failed multiops failed"
1144 run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots"
1147 (( $MDSCOUNT < 2 )) && skip_env "needs >= 2 MDTs"
1149 # inject a gap with 10th transaction
1150 #define OBD_FAIL_LLOG_ADD_GAP 0x131d
1151 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10
1152 for ((i=0; i < 20; i++)); do
1153 $LFS setdirstripe -i1 $DIR/$tdir-$i ||
1154 error "can't mkdir $DIR/$tdir-$i"
1157 # prevent update llog cancellation, so that on the next boot the MDS has
1158 # to process the update llog with the gap injected
1159 #define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726
1160 $LCTL set_param fail_loc=0x726
1165 $LCTL set_param fail_loc=0
1172 local testid=$(echo $TESTNAME | tr '_' ' ')
1173 dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" &&
1174 error "client got evicted due to aborted recovery"
1177 run_test 32 "gap in update llog shouldn't break recovery"
# Suite epilogue. SLEEP becomes the number of seconds elapsed since NOW
# (NOW is set outside this view).
1180 SLEEP=$((SECONDS - $NOW))
# If that elapsed time is still below TIMEOUT, sleep for the same number of
# seconds again.
1181 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount MOUNT2 on this host only when MOUNTED2 is "yes"; the trailing
# "|| true" forces a zero status whether or not the unmount ran or succeeded.
1182 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
# Hand off to check_and_cleanup_lustre (defined outside this view).
1183 check_and_cleanup_lustre