# Suite preamble: build the test exclusion lists, apply defaults for the
# environment knobs, source the test framework and configuration, then set
# up the file system.
5 # bug number: LU-2012 LU-8333 LU-7372
6 ALWAYS_EXCEPT="14b 21b 26 $REPLAY_DUAL_EXCEPT"
7 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# Each setting below keeps a caller-provided value and falls back to the
# default shown only when the variable is unset or empty.
10 PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE defaults to the parent of the directory containing this script.
11 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
13 CLEANUP=${CLEANUP:-""}
14 MOUNT_2=${MOUNT_2:-"yes"}
15 export MULTIOP=${MULTIOP:-multiop}
# Source the test framework, then the configuration file. The ":=" form also
# stores the default path in CONFIG when CONFIG is unset or empty.
16 . $LUSTRE/tests/test-framework.sh
19 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, report the skip and leave the script with
# exit status 0.
22 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# With SLOW=no, 21b is listed in EXCEPT_SLOW (presumably consumed by
# test-framework.sh -- confirm there).
25 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
# When the fstype reported for $SINGLEMDS is zfs, append 21b to ALWAYS_EXCEPT.
# NOTE(review): 21b is already in the initial ALWAYS_EXCEPT value above, so
# this append is redundant as written.
27 [[ $(facet_fstype $SINGLEMDS) == zfs ]] &&
28 # bug number for skipped test: LU-2230
29 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 21b"
31 # bug number for skipped tests: LU-9795 LU-9795
32 ALWAYS_EXCEPT=" 0a 0b $ALWAYS_EXCEPT"
# Set up the file system, then capture the output of
# mounted_lustre_filesystems in MOUNTED for the mount check that follows.
37 check_and_setup_lustre
38 MOUNTED=$(mounted_lustre_filesystems)
39 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
40 zconf_mount $HOSTNAME $MOUNT2
45 rm -rf $DIR/[df][0-9]*
47 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
49 # if there is no CLIENT1 defined, some tests can be run on localhost
50 CLIENT1=${CLIENT1:-$HOSTNAME}
51 # if CLIENT2 doesn't exist then use CLIENT1 instead
52 # All tests should use CLIENT2 with MOUNT2 only therefore it will work if
54 # The exception is any test which needs two separate nodes
55 CLIENT2=${CLIENT2:-$CLIENT1}
57 # LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
58 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
60 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
63 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
65 echo "Check file is LU482_FAILED=$LU482_FAILED"
66 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
67 replay_barrier $SINGLEMDS
68 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
70 createmany -o $MOUNT1/$tfile- 50
71 $LCTL set_param fail_loc=0x80000514
72 facet_failover $SINGLEMDS
73 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
77 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
78 unlinkmany $MOUNT1/$tfile- 50 || return 2
79 rm $MOUNT2/$tfile || return 3
80 rm $MOUNT2/$tfile-A || return 4
82 run_test 0a "expired recovery with lost client"
84 if [ -f "$LU482_FAILED" ]; then
85 log "Found check file $LU482_FAILED, aborting test script"
86 rm -vf "$LU482_FAILED"
88 do_nodes $CLIENTS umount -f $MOUNT2 || true
89 do_nodes $CLIENTS umount -f $MOUNT || true
90 # copied from stopall, but avoid the MDS recovery
91 for num in `seq $OSTCOUNT`; do
93 rm -f $TMP/ost${num}active
95 if ! combined_mgs_mds ; then
103 replay_barrier $SINGLEMDS
105 touch $MOUNT1/$tfile-2
107 facet_failover $SINGLEMDS
109 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
110 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
111 # it is uncertain if file-2 exists or not, remove it if it does
112 checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2
113 checkstat $MOUNT2/$tfile && return 2
116 run_test 0b "lost client during waiting for next transno"
120 replay_barrier $SINGLEMDS
124 checkstat $MOUNT2/a || return 1
125 checkstat $MOUNT1/b || return 2
126 rm $MOUNT2/a $MOUNT1/b
127 checkstat $MOUNT1/a && return 3
128 checkstat $MOUNT2/b && return 4
132 run_test 1 "|X| simple create"
136 replay_barrier $SINGLEMDS
140 checkstat $MOUNT2/adir || return 1
142 checkstat $MOUNT2/adir && return 2
145 run_test 2 "|X| mkdir adir"
148 replay_barrier $SINGLEMDS
150 mkdir $MOUNT2/adir/bdir
153 checkstat $MOUNT2/adir || return 1
154 checkstat $MOUNT1/adir/bdir || return 2
155 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
156 checkstat $MOUNT1/adir && return 3
157 checkstat $MOUNT2/adir/bdir && return 4
160 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
164 replay_barrier $SINGLEMDS
165 mkdir $MOUNT1/adir && return 1
166 mkdir $MOUNT2/adir/bdir
169 checkstat $MOUNT2/adir || return 2
170 checkstat $MOUNT1/adir/bdir || return 3
172 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
173 checkstat $MOUNT1/adir && return 4
174 checkstat $MOUNT2/adir/bdir && return 5
177 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
181 # multiclient version of replay_single.sh/test_8
183 multiop_bg_pause $MOUNT2/a o_tSc || return 1
186 replay_barrier $SINGLEMDS
188 wait $pid || return 1
191 [ -e $MOUNT2/a ] && return 2
194 run_test 5 "open, unlink |X| close"
199 multiop_bg_pause $MOUNT2/a o_c || return 1
201 multiop_bg_pause $MOUNT1/a o_c || return 1
204 replay_barrier $SINGLEMDS
206 wait $pid1 || return 1
210 wait $pid2 || return 1
211 [ -e $MOUNT2/a ] && return 2
214 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
217 replay_barrier $SINGLEMDS
218 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
220 checkstat $MOUNT2/$tfile || return 2
221 rm $MOUNT1/$tfile || return 3
225 run_test 8 "replay of resent request"
228 replay_barrier $SINGLEMDS
229 mcreate $MOUNT1/$tfile-1
230 mcreate $MOUNT2/$tfile-2
231 # drop first reint reply
232 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
234 do_facet $SINGLEMDS lctl set_param fail_loc=0
236 rm $MOUNT1/$tfile-[1,2] || return 1
240 run_test 9 "resending a replayed create"
243 mcreate $MOUNT1/$tfile-1
244 replay_barrier $SINGLEMDS
245 munlink $MOUNT1/$tfile-1
246 mcreate $MOUNT2/$tfile-2
247 # drop first reint reply
248 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
250 do_facet $SINGLEMDS lctl set_param fail_loc=0
252 checkstat $MOUNT1/$tfile-1 && return 1
253 checkstat $MOUNT1/$tfile-2 || return 2
258 run_test 10 "resending a replayed unlink"
261 replay_barrier $SINGLEMDS
262 mcreate $DIR1/$tfile-1
263 mcreate $DIR2/$tfile-2
264 mcreate $DIR1/$tfile-3
265 mcreate $DIR2/$tfile-4
266 mcreate $DIR1/$tfile-5
267 # drop all reint replies for a while
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
269 # note that with this fail_loc set, facet_failover df will fail
270 facet_failover $SINGLEMDS
272 local clients=${CLIENTS:-$HOSTNAME}
273 wait_clients_import_state "$clients" $SINGLEMDS FULL
275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
277 rm $DIR1/$tfile-[1-5] || return 1
281 run_test 11 "both clients timeout during replay"
284 replay_barrier $SINGLEMDS
286 multiop_bg_pause $DIR/$tfile mo_c || return 1
289 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
290 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
291 facet_failover $SINGLEMDS
292 do_facet $SINGLEMDS lctl set_param fail_loc=0
293 clients_up || return 1
296 kill -USR1 $MULTIPID || return 3
297 wait $MULTIPID || return 4
298 $CHECKSTAT -t file $DIR/$tfile || return 2
303 run_test 12 "open resend timeout"
306 multiop_bg_pause $DIR/$tfile mo_c || return 1
309 replay_barrier $SINGLEMDS
311 kill -USR1 $MULTIPID || return 3
312 wait $MULTIPID || return 4
315 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
316 facet_failover $SINGLEMDS
317 do_facet $SINGLEMDS lctl set_param fail_loc=0
318 clients_up || return 1
321 $CHECKSTAT -t file $DIR/$tfile || return 2
326 run_test 13 "close resend timeout"
328 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
333 wait_delete_completed
335 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
337 mkdir -p $MOUNT1/$tdir
338 $SETSTRIPE -i 0 $MOUNT1/$tdir
339 replay_barrier $SINGLEMDS
340 createmany -o $MOUNT1/$tdir/$tfile- 5
342 $SETSTRIPE -i 0 $MOUNT2/$tfile-2
343 dd if=/dev/zero of=$MOUNT2/$tfile-2 bs=1M count=5
344 createmany -o $MOUNT1/$tdir/$tfile-3- 5
348 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
350 # first set of files should have been replayed
351 unlinkmany $MOUNT1/$tdir/$tfile- 5 || error "first unlinks failed"
352 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || error "second unlinks failed"
354 zconf_mount $HOSTNAME $MOUNT2 || error "mount $MOUNT2 failed"
355 [ -f $MOUNT2/$tfile-2 ] && error "$MOUNT2/$tfile-2 exists!"
357 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
358 wait_delete_completed || error "wait_delete_complete failed"
360 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
361 log "before $beforeused, after $afterused"
362 # leave some margin for some files/dirs to be modified (OI, llog, etc)
363 [ $afterused -le $((beforeused + $(fs_log_size))) ] ||
364 error "after $afterused > before $beforeused"
366 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
368 test_15a() { # was test_15
369 replay_barrier $SINGLEMDS
370 createmany -o $MOUNT1/$tfile- 25
371 createmany -o $MOUNT2/$tfile-2- 1
376 unlinkmany $MOUNT1/$tfile- 25 || return 2
377 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
379 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
382 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
385 replay_barrier $SINGLEMDS
386 for ((i = 0; i < 2000; i++)); do
387 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
393 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
396 run_test 15c "remove multiple OST orphans"
399 replay_barrier $SINGLEMDS
400 createmany -o $MOUNT1/$tfile- 25
401 createmany -o $MOUNT2/$tfile-2- 1
404 facet_failover $SINGLEMDS
408 unlinkmany $MOUNT1/$tfile- 25 || return 2
410 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
414 run_test 16 "fail MDS during recovery (3571)"
417 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
419 createmany -o $MOUNT1/$tfile- 25
420 createmany -o $MOUNT2/$tfile-2- 1
422 # Make sure the disconnect is lost
430 unlinkmany $MOUNT1/$tfile- 25 || return 2
432 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
436 run_test 17 "fail OST during recovery (3571)"
438 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
441 test_18() { # bug 3822 - evicting client with enqueued lock
443 local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
444 do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
445 mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
446 touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
447 statmany -s $MOUNT1/$tdir/$tfile 1 500 &
450 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
451 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
453 #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
454 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
455 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
457 usleep 500 # wait to ensure first client is one that will be evicted
458 openfile -f O_RDONLY $MOUNT2/$tdir/$tfile
460 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
461 do_facet $SINGLEMDS lctl debug_kernel |
462 grep "not entering recovery" && error "client not evicted"
463 do_facet client "lctl set_param fail_loc=0"
464 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
466 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
468 test_19() { # Bug 10991 - resend of open request does not fail assertion.
469 replay_barrier $SINGLEMDS
470 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
472 checkstat $DIR2/${tfile}0 || return 2
473 rm $DIR/${tfile}0 || return 3
477 run_test 19 "resend of open request"
480 local before=$SECONDS
481 replay_barrier $SINGLEMDS
487 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
488 local tier1=$((SECONDS - before))
491 replay_barrier $SINGLEMDS
497 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
498 local tier2=$((SECONDS - before))
500 # timeout is more than 1.5x original timeout
501 ((tier2 < tier1 * 6 / 4)) ||
502 error "recovery time $tier2 >= 1.5x original time $tier1"
504 run_test 20 "recovery time is not increasing"
506 # commit on sharing tests
508 local param_file=$TMP/$tfile-params
510 save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file
511 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
512 touch $MOUNT1/$tfile-1
513 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
514 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
515 replay_barrier_nosync $SINGLEMDS
518 facet_failover $SINGLEMDS
520 # all renames are replayed
521 unlink $MOUNT1/$tfile-3 || return 2
523 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
525 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
526 rm -rf $MOUNT1/$tfile-*
527 restore_lustre_params < $param_file
531 run_test 21a "commit on sharing"
535 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
538 do_node $CLIENT1 touch $MOUNT1/$tfile-1
539 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
540 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
542 replay_barrier_nosync $mds
543 shutdown_client $CLIENT2 $MOUNT1
547 # were renames replayed?
549 echo UNLINK $MOUNT1/$tfile-3
550 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 ||
551 { echo "unlink $tfile-3 fail!" && rc=1; }
554 zconf_mount_clients $CLIENT2 $MOUNT1 ||
555 error "mount $CLIENT2 $MOUNT1 fail"
561 [ -z "$CLIENTS" ] && skip "Need two or more clients" && return
562 [ $CLIENTCOUNT -lt 2 ] &&
563 { skip "Need 2+ clients, have $CLIENTCOUNT" && return; }
565 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
566 skip "Several MDTs on one MDS with FAILURE_MODE=$FAILURE_MODE"
570 zconf_umount_clients $CLIENTS $MOUNT2
571 zconf_mount_clients $CLIENTS $MOUNT1
573 local param_file=$TMP/$tfile-params
575 local mdtidx=$($LFS getstripe -m $MOUNT1)
576 local facet=mds$((mdtidx + 1))
578 save_lustre_params $facet "mdt.*.commit_on_sharing" > $param_file
582 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
584 test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
586 # there is still a window when transactions may be written to disk
587 # before the mds device is set R/O. To avoid such a rare test failure,
588 # the check is repeated several times.
592 # COS disabled (should fail)
593 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
595 test_21b_sub $facet || break
596 n_attempts=$((n_attempts + 1))
597 [ $n_attempts -gt 3 ] &&
598 error "can't check if COS works: rename replied w/o COS"
600 zconf_mount_clients $CLIENTS $MOUNT2
601 restore_lustre_params < $param_file
605 run_test 21b "commit on sharing, two clients"
608 checkstat $MOUNT1/$remote_dir || return 1
609 checkstat $MOUNT1/$remote_dir/dir || return 2
610 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 3
611 checkstat $MOUNT1/$remote_dir/dir/$tfile-1 || return 4
615 create_remote_dir_files_22() {
616 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir/dir || return 1
617 do_node $CLIENT1 createmany -o $MOUNT1/$remote_dir/dir/$tfile- 2 ||
619 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 ||
625 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
626 ([ $FAILURE_MODE == "HARD" ] &&
627 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
628 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
632 local remote_dir=${tdir}/remote_dir
634 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
636 # OBD_FAIL_MDS_REINT_NET_REP 0x119
637 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
638 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
641 fail mds$((MDTIDX + 1))
642 wait $CLIENT_PID || error "lfs mkdir failed"
644 replay_barrier mds$MDTIDX
645 create_remote_dir_files_22 || error "Remote creation failed $?"
648 checkstat_22 || error "check stat failed $?"
650 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
653 run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir"
656 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
658 local remote_dir=$tdir/remote_dir
660 # OBD_FAIL_MDS_REINT_NET_REP 0x119
661 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
663 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
664 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
667 fail mds${MDTIDX},mds$((MDTIDX + 1))
668 wait $CLIENT_PID || error "lfs mkdir failed"
670 replay_barrier mds$MDTIDX
671 create_remote_dir_files_22 || error "Remote creation failed $?"
674 checkstat_22 || error "check stat failed $?"
676 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
679 run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir"
682 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
683 ([ $FAILURE_MODE == "HARD" ] &&
684 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
685 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
688 local remote_dir=${tdir}/remote_dir
690 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
692 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
693 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
694 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
696 do_facet mds$MDTIDX lctl set_param fail_loc=0
699 wait $CLIENT_PID || error "lfs mkdir failed"
701 replay_barrier mds$MDTIDX
702 create_remote_dir_files_22 || error "Remote creation failed $?"
705 checkstat_22 || error "check stat failed $?"
707 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
710 run_test 22c "c1 lfs mkdir -i 1 d1, M1 drop update & fail M1, c2 mkdir d1/dir"
713 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
715 local remote_dir=${tdir}/remote_dir
717 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
719 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
720 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
721 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
723 do_facet mds$MDTIDX lctl set_param fail_loc=0
725 fail mds${MDTIDX},mds$((MDTIDX + 1))
726 wait $CLIENT_PID || error "lfs mkdir failed"
728 replay_barrier mds$MDTIDX
729 create_remote_dir_files_22 || error "Remote creation failed $?"
732 checkstat_22 || error "check stat failed $?"
734 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
737 run_test 22d "c1 lfs mkdir -i 1 d1, M1 drop update & fail M0/M1,c2 mkdir d1/dir"
740 checkstat $MOUNT1/$remote_dir || return 1
741 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 2
745 create_remote_dir_files_23() {
746 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir || return 1
747 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 || return 2
752 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
753 ([ $FAILURE_MODE == "HARD" ] &&
754 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
755 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
758 local remote_dir=$tdir/remote_dir
760 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
761 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
762 error "lfs mkdir failed"
763 # OBD_FAIL_MDS_REINT_NET_REP 0x119
764 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
765 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
767 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
769 fail mds$((MDTIDX + 1))
770 wait $CLIENT_PID || error "rmdir remote dir failed"
772 replay_barrier mds${MDTIDX}
773 create_remote_dir_files_23 || error "Remote creation failed $?"
776 checkstat_23 || error "check stat failed $?"
778 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
781 run_test 23a "c1 rmdir d1, M1 drop reply and fail, client2 mkdir d1"
784 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
786 local remote_dir=$tdir/remote_dir
788 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
789 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
790 error "lfs mkdir failed"
792 # OBD_FAIL_MDS_REINT_NET_REP 0x119
793 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
794 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
796 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
798 fail mds${MDTIDX},mds$((MDTIDX + 1))
799 wait $CLIENT_PID || error "rmdir remote dir failed"
801 replay_barrier mds${MDTIDX}
802 create_remote_dir_files_23 || error "Remote creation failed $?"
805 checkstat_23 || error "check stat failed $?"
807 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
810 run_test 23b "c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"
813 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
815 ([ $FAILURE_MODE == "HARD" ] &&
816 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
817 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
820 local remote_dir=$tdir/remote_dir
822 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
823 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
824 error "lfs mkdir failed"
826 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
827 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
828 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
830 do_facet mds${MDTIDX} lctl set_param fail_loc=0
833 wait $CLIENT_PID || error "rmdir remote dir failed"
835 replay_barrier mds${MDTIDX}
836 create_remote_dir_files_23 || error "Remote creation failed $?"
839 checkstat_23 || error "check stat failed $?"
841 rm -rf $MOUNT1/$tdir || return 6
844 run_test 23c "c1 rmdir d1, M0 drop update reply and fail M0, c2 mkdir d1"
847 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
849 local remote_dir=$tdir/remote_dir
851 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
852 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
853 error "lfs mkdir failed"
855 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
856 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
857 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
859 do_facet mds${MDTIDX} lctl set_param fail_loc=0
861 fail mds${MDTIDX},mds$((MDTIDX + 1))
862 wait $CLIENT_PID || error "rmdir remote dir failed"
864 replay_barrier mds${MDTIDX}
865 create_remote_dir_files_23 || error "Remote creation failed $?"
868 checkstat_23 || error "check stat failed $?"
870 rm -rf $MOUNT1/$tdir || return 6
873 run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
876 [[ $(lustre_version_code $SINGLEMDS) -gt $(version_code 2.5.2) ]] ||
877 { skip "Need MDS version newer than 2.5.2"; return 0; }
880 stat $MOUNT/$tfile >&/dev/null
881 # OBD_FAIL_MDS_REINT_NET_REP
882 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x119
883 $TRUNCATE $MOUNT/$tfile 100 &
886 do_facet $SINGLEMDS lctl set_param fail_loc=0
887 # sync to release rep-ack lock quickly
888 do_nodes $(comma_list $(mdts_nodes)) \
889 "lctl set_param -n osd*.*MDT*.force_sync 1"
893 run_test 24 "reconstruct on non-existing object"
895 # end commit on sharing tests
900 $SETSTRIPE -i 0 -c 1 $DIR/$tfile
902 # get lock for the 1st client
903 dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
904 error "failed to write data"
906 # get waiting locks for the 2nd client
907 drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
910 # failover, replay and resend replayed waiting locks
911 if [ $(lustre_version_code ost1) -ge $(version_code 2.6.90) ]; then
912 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
913 do_facet ost1 lctl set_param fail_loc=0x80000325
915 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
916 do_facet ost1 lctl set_param fail_loc=0x80000213
921 # multiop does not finish because CP AST is skipped;
922 # it is ok to kill it in the test, because CP AST is already re-sent
923 # and it does not hang forever in real life
927 run_test 25 "replay|resend"
932 kill -9 $dbench_26_pid
937 local clients=${CLIENTS:-$HOSTNAME}
939 zconf_mount_clients $clients $MOUNT
942 [ "$SLOW" = "no" ] && duration=200
943 # set duration to 900 because it takes some time to boot node
944 [ "$FAILURE_MODE" = HARD ] && duration=900
946 local start_ts=$SECONDS
951 local tar_dir=$DIR/$tdir/run_tar
953 test_mkdir -p -c$MDSCOUNT $tar_dir || break
954 if [ $MDSCOUNT -ge 2 ]; then
955 $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
956 error "set default dirstripe failed"
959 tar cf - /etc | tar xf - || error "tar failed"
960 cd $DIR/$tdir || break
961 rm -rf $tar_dir || break
965 echo "Started tar $tar_26_pid"
968 local dbench_dir=$DIR2/$tdir/run_dbench
970 test_mkdir -p -c$MDSCOUNT $dbench_dir || break
971 if [ $MDSCOUNT -ge 2 ]; then
972 $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
973 error "set default dirstripe failed"
975 cd $dbench_dir || break
976 rundbench 1 -D $dbench_dir -t 100 &>/dev/null || break
977 cd $DIR/$tdir || break
978 rm -rf $dbench_dir || break
982 echo "Started dbench $dbench_26_pid"
984 local num_failovers=0
986 while [ $((SECONDS - start_ts)) -lt $duration ]; do
987 kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
988 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
990 replay_barrier mds$fail_index
991 sleep 2 # give clients a time to do operations
992 # Increment the number of failovers
993 num_failovers=$((num_failovers + 1))
994 log "$TESTNAME fail mds$fail_index $num_failovers times"
996 if [ $fail_index -ge $MDSCOUNT ]; then
999 fail_index=$((fail_index + 1))
1002 # stop the client loads
1003 kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
1004 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
1007 run_test 26 "dbench and tar with mds failover"
1010 $SETSTRIPE -i 0 -c 1 $DIR2/$tfile
1011 dd if=/dev/zero of=$DIR2/$tfile bs=4096 count=1
1013 #define OBD_FAIL_LDLM_SRV_BL_AST 0x324
1014 do_facet ost1 $LCTL set_param fail_loc=0x80000324
1016 dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 &
1020 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
1021 do_facet ost1 $LCTL set_param fail_loc=0x32a
1026 cancel_lru_locks OST0000-osc
1027 wait $pid || error "dd failed"
1029 run_test 28 "lock replay should be ordered: waiting after granted"
1032 SLEEP=$((SECONDS - $NOW))
1033 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
1034 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
1035 check_and_cleanup_lustre