# Suite preamble: builds the list of always-skipped tests, applies defaults
# for the environment knobs, and sources the shared test framework and the
# cluster configuration before any test is defined.
5 # bug number: LU-2012 LU-8333 LU-7372
6 ALWAYS_EXCEPT="14b 21b 24 25 $REPLAY_DUAL_EXCEPT"
7 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# The ${VAR:-default} assignments below only take effect when the variable
# is unset or empty, so values exported by the caller win.
10 PTLDEBUG=${PTLDEBUG:--1}
# LUSTRE defaults to the parent of the directory that contains this script.
11 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
13 CLEANUP=${CLEANUP:-""}
14 MOUNT_2=${MOUNT_2:-"yes"}
15 export MULTIOP=${MULTIOP:-multiop}
# Source the shared test framework from the Lustre tree.
16 . $LUSTRE/tests/test-framework.sh
# Source the configuration file; ':=' also stores the default path in CONFIG
# when CONFIG was unset or empty.
# NOTE(review): NAME is not assigned in the visible lines - presumably it
# comes from the framework or the environment; confirm.
19 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
# When remote_mds_nodsh succeeds, report the skip and leave the script
# with status 0.
22 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# SLOW=no puts test 21b on the EXCEPT_SLOW list.
25 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b"
# When facet_fstype reports zfs for SINGLEMDS, 21b is appended to
# ALWAYS_EXCEPT (the '&&' continues across the comment line below).
27 [[ $(facet_fstype $SINGLEMDS) == zfs ]] &&
28 # bug number for skipped test: LU-2230
29 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 21b"
31 # bug number for skipped tests: LU-9795 LU-9795
# Tests 0a and 0b are prepended to ALWAYS_EXCEPT unconditionally.
32 ALWAYS_EXCEPT=" 0a 0b $ALWAYS_EXCEPT"
37 check_and_setup_lustre
38 MOUNTED=$(mounted_lustre_filesystems)
39 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
40 zconf_mount $HOSTNAME $MOUNT2
45 rm -rf $DIR/[df][0-9]*
47 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
49 # if there is no CLIENT1 defined, some tests can be run on localhost
50 CLIENT1=${CLIENT1:-$HOSTNAME}
51 # if CLIENT2 doesn't exist then use CLIENT1 instead
52 # All tests should use CLIENT2 with MOUNT2 only therefore it will work if
54 # The exception is a test which needs two separate nodes
55 CLIENT2=${CLIENT2:-$CLIENT1}
57 # LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
58 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
60 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
63 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
65 echo "Check file is LU482_FAILED=$LU482_FAILED"
66 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
67 replay_barrier $SINGLEMDS
68 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
70 createmany -o $MOUNT1/$tfile- 50
71 $LCTL set_param fail_loc=0x80000514
72 facet_failover $SINGLEMDS
73 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
77 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
78 unlinkmany $MOUNT1/$tfile- 50 || return 2
79 rm $MOUNT2/$tfile || return 3
80 rm $MOUNT2/$tfile-A || return 4
82 run_test 0a "expired recovery with lost client"
84 if [ -f "$LU482_FAILED" ]; then
85 log "Found check file $LU482_FAILED, aborting test script"
86 rm -vf "$LU482_FAILED"
88 do_nodes $CLIENTS umount -f $MOUNT2 || true
89 do_nodes $CLIENTS umount -f $MOUNT || true
90 # copied from stopall, but avoid the MDS recovery
91 for num in `seq $OSTCOUNT`; do
93 rm -f $TMP/ost${num}active
95 if ! combined_mgs_mds ; then
103 replay_barrier $SINGLEMDS
105 touch $MOUNT1/$tfile-2
107 facet_failover $SINGLEMDS
109 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
110 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
111 # it is uncertain if file-2 exists or not, remove it if it does
112 checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2
113 checkstat $MOUNT2/$tfile && return 2
116 run_test 0b "lost client during waiting for next transno"
120 replay_barrier $SINGLEMDS
124 checkstat $MOUNT2/a || return 1
125 checkstat $MOUNT1/b || return 2
126 rm $MOUNT2/a $MOUNT1/b
127 checkstat $MOUNT1/a && return 3
128 checkstat $MOUNT2/b && return 4
132 run_test 1 "|X| simple create"
136 replay_barrier $SINGLEMDS
140 checkstat $MOUNT2/adir || return 1
142 checkstat $MOUNT2/adir && return 2
145 run_test 2 "|X| mkdir adir"
148 replay_barrier $SINGLEMDS
150 mkdir $MOUNT2/adir/bdir
153 checkstat $MOUNT2/adir || return 1
154 checkstat $MOUNT1/adir/bdir || return 2
155 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
156 checkstat $MOUNT1/adir && return 3
157 checkstat $MOUNT2/adir/bdir && return 4
160 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
164 replay_barrier $SINGLEMDS
165 mkdir $MOUNT1/adir && return 1
166 mkdir $MOUNT2/adir/bdir
169 checkstat $MOUNT2/adir || return 2
170 checkstat $MOUNT1/adir/bdir || return 3
172 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
173 checkstat $MOUNT1/adir && return 4
174 checkstat $MOUNT2/adir/bdir && return 5
177 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
181 # multiclient version of replay_single.sh/test_8
183 multiop_bg_pause $MOUNT2/a o_tSc || return 1
186 replay_barrier $SINGLEMDS
188 wait $pid || return 1
191 [ -e $MOUNT2/a ] && return 2
194 run_test 5 "open, unlink |X| close"
199 multiop_bg_pause $MOUNT2/a o_c || return 1
201 multiop_bg_pause $MOUNT1/a o_c || return 1
204 replay_barrier $SINGLEMDS
206 wait $pid1 || return 1
210 wait $pid2 || return 1
211 [ -e $MOUNT2/a ] && return 2
214 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
217 replay_barrier $SINGLEMDS
218 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
220 checkstat $MOUNT2/$tfile || return 2
221 rm $MOUNT1/$tfile || return 3
225 run_test 8 "replay of resent request"
228 replay_barrier $SINGLEMDS
229 mcreate $MOUNT1/$tfile-1
230 mcreate $MOUNT2/$tfile-2
231 # drop first reint reply
232 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
234 do_facet $SINGLEMDS lctl set_param fail_loc=0
236 rm $MOUNT1/$tfile-[1,2] || return 1
240 run_test 9 "resending a replayed create"
243 mcreate $MOUNT1/$tfile-1
244 replay_barrier $SINGLEMDS
245 munlink $MOUNT1/$tfile-1
246 mcreate $MOUNT2/$tfile-2
247 # drop first reint reply
248 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
250 do_facet $SINGLEMDS lctl set_param fail_loc=0
252 checkstat $MOUNT1/$tfile-1 && return 1
253 checkstat $MOUNT1/$tfile-2 || return 2
258 run_test 10 "resending a replayed unlink"
261 replay_barrier $SINGLEMDS
262 mcreate $DIR1/$tfile-1
263 mcreate $DIR2/$tfile-2
264 mcreate $DIR1/$tfile-3
265 mcreate $DIR2/$tfile-4
266 mcreate $DIR1/$tfile-5
267 # drop all reint replies for a while
268 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
269 # note that with this fail_loc set, facet_failover df will fail
270 facet_failover $SINGLEMDS
272 local clients=${CLIENTS:-$HOSTNAME}
273 wait_clients_import_state "$clients" $SINGLEMDS FULL
275 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
277 rm $DIR1/$tfile-[1-5] || return 1
281 run_test 11 "both clients timeout during replay"
284 replay_barrier $SINGLEMDS
286 multiop_bg_pause $DIR/$tfile mo_c || return 1
289 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
290 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
291 facet_failover $SINGLEMDS
292 do_facet $SINGLEMDS lctl set_param fail_loc=0
293 clients_up || return 1
296 kill -USR1 $MULTIPID || return 3
297 wait $MULTIPID || return 4
298 $CHECKSTAT -t file $DIR/$tfile || return 2
303 run_test 12 "open resend timeout"
306 multiop_bg_pause $DIR/$tfile mo_c || return 1
309 replay_barrier $SINGLEMDS
311 kill -USR1 $MULTIPID || return 3
312 wait $MULTIPID || return 4
315 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
316 facet_failover $SINGLEMDS
317 do_facet $SINGLEMDS lctl set_param fail_loc=0
318 clients_up || return 1
321 $CHECKSTAT -t file $DIR/$tfile || return 2
326 run_test 13 "close resend timeout"
328 # test 14a removed after 18143 because it shouldn't fail anymore and does the same
333 wait_delete_completed
335 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
337 mkdir -p $MOUNT1/$tdir
338 $SETSTRIPE -i 0 $MOUNT1/$tdir
339 replay_barrier $SINGLEMDS
340 createmany -o $MOUNT1/$tdir/$tfile- 5
342 $SETSTRIPE -i 0 $MOUNT2/$tfile-2
343 dd if=/dev/zero of=$MOUNT2/$tfile-2 bs=1M count=5
344 createmany -o $MOUNT1/$tdir/$tfile-3- 5
348 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
350 # first set of files should have been replayed
351 unlinkmany $MOUNT1/$tdir/$tfile- 5 || error "first unlinks failed"
352 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || error "second unlinks failed"
354 zconf_mount $HOSTNAME $MOUNT2 || error "mount $MOUNT2 failed"
355 [ -f $MOUNT2/$tfile-2 ] && error "$MOUNT2/$tfile-2 exists!"
357 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
358 wait_delete_completed || error "wait_delete_complete failed"
360 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
361 log "before $beforeused, after $afterused"
362 # leave some margin for some files/dirs to be modified (OI, llog, etc)
363 [ $afterused -le $((beforeused + $(fs_log_size))) ] ||
364 error "after $afterused > before $beforeused"
366 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
368 test_15a() { # was test_15
369 replay_barrier $SINGLEMDS
370 createmany -o $MOUNT1/$tfile- 25
371 createmany -o $MOUNT2/$tfile-2- 1
376 unlinkmany $MOUNT1/$tfile- 25 || return 2
377 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
379 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
382 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
385 replay_barrier $SINGLEMDS
386 for ((i = 0; i < 2000; i++)); do
387 echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed"
393 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
396 run_test 15c "remove multiple OST orphans"
399 replay_barrier $SINGLEMDS
400 createmany -o $MOUNT1/$tfile- 25
401 createmany -o $MOUNT2/$tfile-2- 1
404 facet_failover $SINGLEMDS
408 unlinkmany $MOUNT1/$tfile- 25 || return 2
410 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
414 run_test 16 "fail MDS during recovery (3571)"
417 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
419 createmany -o $MOUNT1/$tfile- 25
420 createmany -o $MOUNT2/$tfile-2- 1
422 # Make sure the disconnect is lost
430 unlinkmany $MOUNT1/$tfile- 25 || return 2
432 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
436 run_test 17 "fail OST during recovery (3571)"
438 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
441 test_18() { # bug 3822 - evicting client with enqueued lock
443 local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
444 do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
445 mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
446 touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
447 statmany -s $MOUNT1/$tdir/$tfile 1 500 &
450 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
451 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
453 #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
454 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
455 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
457 usleep 500 # wait to ensure first client is one that will be evicted
458 openfile -f O_RDONLY $MOUNT2/$tdir/$tfile
460 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
461 do_facet $SINGLEMDS lctl debug_kernel |
462 grep "not entering recovery" && error "client not evicted"
463 do_facet client "lctl set_param fail_loc=0"
464 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
466 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
468 test_19() { # Bug 10991 - resend of open request does not fail assertion.
469 replay_barrier $SINGLEMDS
470 drop_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
472 checkstat $DIR2/${tfile}0 || return 2
473 rm $DIR/${tfile}0 || return 3
477 run_test 19 "resend of open request"
480 local before=$SECONDS
481 replay_barrier $SINGLEMDS
487 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
488 local tier1=$((SECONDS - before))
491 replay_barrier $SINGLEMDS
497 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
498 local tier2=$((SECONDS - before))
500 # fail if the recovery time tier2 is 1.5x the original time tier1 or more
501 ((tier2 < tier1 * 6 / 4)) ||
502 error "recovery time $tier2 >= 1.5x original time $tier1"
504 run_test 20 "recovery time is not increasing"
506 # commit on sharing tests
508 local param_file=$TMP/$tfile-params
510 save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file
511 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
512 touch $MOUNT1/$tfile-1
513 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
514 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
515 replay_barrier_nosync $SINGLEMDS
518 facet_failover $SINGLEMDS
520 # all renames are replayed
521 unlink $MOUNT1/$tfile-3 || return 2
523 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
525 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
526 rm -rf $MOUNT1/$tfile-*
527 restore_lustre_params < $param_file
531 run_test 21a "commit on sharing"
535 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
538 do_node $CLIENT1 touch $MOUNT1/$tfile-1
539 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
540 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
542 replay_barrier_nosync $mds
543 shutdown_client $CLIENT2 $MOUNT1
547 # were renames replayed?
549 echo UNLINK $MOUNT1/$tfile-3
550 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 ||
551 { echo "unlink $tfile-3 fail!" && rc=1; }
554 zconf_mount_clients $CLIENT2 $MOUNT1 ||
555 error "mount $CLIENT2 $MOUNT1 fail"
561 [ -z "$CLIENTS" ] && skip "Need two or more clients" && return
562 [ $CLIENTCOUNT -lt 2 ] &&
563 { skip "Need 2+ clients, have $CLIENTCOUNT" && return; }
565 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
566 skip "Several MDTs on one MDS with FAILURE_MODE=$FAILURE_MODE"
570 zconf_umount_clients $CLIENTS $MOUNT2
571 zconf_mount_clients $CLIENTS $MOUNT1
573 local param_file=$TMP/$tfile-params
575 local mdtidx=$($LFS getstripe -m $MOUNT1)
576 local facet=mds$((mdtidx + 1))
578 save_lustre_params $facet "mdt.*.commit_on_sharing" > $param_file
582 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
584 test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
586 # there is still a window when transactions may be written to disk
587 # before the mds device is set R/O. To avoid such a rare test failure,
588 # the check is repeated several times.
592 # COS disabled (should fail)
593 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
595 test_21b_sub $facet || break
596 n_attempts=$((n_attempts + 1))
597 [ $n_attempts -gt 3 ] &&
598 error "can't check if COS works: rename replied w/o COS"
600 zconf_mount_clients $CLIENTS $MOUNT2
601 restore_lustre_params < $param_file
605 run_test 21b "commit on sharing, two clients"
608 checkstat $MOUNT1/$remote_dir || return 1
609 checkstat $MOUNT1/$remote_dir/dir || return 2
610 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 3
611 checkstat $MOUNT1/$remote_dir/dir/$tfile-1 || return 4
615 create_remote_dir_files_22() {
616 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir/dir || return 1
617 do_node $CLIENT1 createmany -o $MOUNT1/$remote_dir/dir/$tfile- 2 ||
619 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 ||
625 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
626 ([ $FAILURE_MODE == "HARD" ] &&
627 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
628 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
632 local remote_dir=${tdir}/remote_dir
634 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
636 # OBD_FAIL_MDS_REINT_NET_REP 0x119
637 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
638 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
642 fail mds$((MDTIDX + 1))
643 wait $CLIENT_PID || error "lfs mkdir failed"
645 replay_barrier mds$MDTIDX
646 create_remote_dir_files_22 || error "Remote creation failed $?"
649 checkstat_22 || error "check stat failed $?"
651 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
654 run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir"
657 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
659 local remote_dir=$tdir/remote_dir
661 # OBD_FAIL_MDS_REINT_NET_REP 0x119
662 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
664 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
665 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
669 fail mds${MDTIDX},mds$((MDTIDX + 1))
670 wait $CLIENT_PID || error "lfs mkdir failed"
672 replay_barrier mds$MDTIDX
673 create_remote_dir_files_22 || error "Remote creation failed $?"
676 checkstat_22 || error "check stat failed $?"
678 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
681 run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir"
684 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
685 ([ $FAILURE_MODE == "HARD" ] &&
686 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
687 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
690 local remote_dir=${tdir}/remote_dir
692 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
694 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
695 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
696 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
699 do_facet mds$MDTIDX lctl set_param fail_loc=0
702 wait $CLIENT_PID || error "lfs mkdir failed"
704 replay_barrier mds$MDTIDX
705 create_remote_dir_files_22 || error "Remote creation failed $?"
708 checkstat_22 || error "check stat failed $?"
710 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
713 run_test 22c "c1 lfs mkdir -i 1 d1, M1 drop update & fail M1, c2 mkdir d1/dir"
716 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
718 local remote_dir=${tdir}/remote_dir
720 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
722 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
723 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
724 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
727 do_facet mds$MDTIDX lctl set_param fail_loc=0
729 fail mds${MDTIDX},mds$((MDTIDX + 1))
730 wait $CLIENT_PID || error "lfs mkdir failed"
732 replay_barrier mds$MDTIDX
733 create_remote_dir_files_22 || error "Remote creation failed $?"
736 checkstat_22 || error "check stat failed $?"
738 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
741 run_test 22d "c1 lfs mkdir -i 1 d1, M1 drop update & fail M0/M1,c2 mkdir d1/dir"
744 checkstat $MOUNT1/$remote_dir || return 1
745 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 2
749 create_remote_dir_files_23() {
750 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir || return 1
751 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 || return 2
756 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
757 ([ $FAILURE_MODE == "HARD" ] &&
758 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
759 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
762 local remote_dir=$tdir/remote_dir
764 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
765 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
766 error "lfs mkdir failed"
767 # OBD_FAIL_MDS_REINT_NET_REP 0x119
768 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
769 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
772 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
774 fail mds$((MDTIDX + 1))
775 wait $CLIENT_PID || error "rmdir remote dir failed"
777 replay_barrier mds${MDTIDX}
778 create_remote_dir_files_23 || error "Remote creation failed $?"
781 checkstat_23 || error "check stat failed $?"
783 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
786 run_test 23a "c1 rmdir d1, M1 drop reply and fail, client2 mkdir d1"
789 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
791 local remote_dir=$tdir/remote_dir
793 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
794 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
795 error "lfs mkdir failed"
797 # OBD_FAIL_MDS_REINT_NET_REP 0x119
798 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
799 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
802 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
804 fail mds${MDTIDX},mds$((MDTIDX + 1))
805 wait $CLIENT_PID || error "rmdir remote dir failed"
807 replay_barrier mds${MDTIDX}
808 create_remote_dir_files_23 || error "Remote creation failed $?"
811 checkstat_23 || error "check stat failed $?"
813 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
816 run_test 23b "c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"
819 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
821 ([ $FAILURE_MODE == "HARD" ] &&
822 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
823 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
826 local remote_dir=$tdir/remote_dir
828 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
829 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
830 error "lfs mkdir failed"
832 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
833 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
834 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
837 do_facet mds${MDTIDX} lctl set_param fail_loc=0
840 wait $CLIENT_PID || error "rmdir remote dir failed"
842 replay_barrier mds${MDTIDX}
843 create_remote_dir_files_23 || error "Remote creation failed $?"
846 checkstat_23 || error "check stat failed $?"
848 rm -rf $MOUNT1/$tdir || return 6
851 run_test 23c "c1 rmdir d1, M0 drop update reply and fail M0, c2 mkdir d1"
854 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
856 local remote_dir=$tdir/remote_dir
858 do_node $CLIENT1 mkdir -p $MOUNT1/${tdir}
859 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
860 error "lfs mkdir failed"
862 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
863 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
864 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
867 do_facet mds${MDTIDX} lctl set_param fail_loc=0
869 fail mds${MDTIDX},mds$((MDTIDX + 1))
870 wait $CLIENT_PID || error "rmdir remote dir failed"
872 replay_barrier mds${MDTIDX}
873 create_remote_dir_files_23 || error "Remote creation failed $?"
876 checkstat_23 || error "check stat failed $?"
878 rm -rf $MOUNT1/$tdir || return 6
881 run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
884 [[ $(lustre_version_code $SINGLEMDS) -gt $(version_code 2.5.2) ]] ||
885 { skip "Need MDS version newer than 2.5.2"; return 0; }
888 stat $MOUNT/$tfile >&/dev/null
889 # OBD_FAIL_MDS_REINT_NET_REP
890 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x119
891 $TRUNCATE $MOUNT/$tfile 100 &
894 do_facet $SINGLEMDS lctl set_param fail_loc=0
895 # sync to release rep-ack lock quickly
896 do_nodes $(comma_list $(mdts_nodes)) \
897 "lctl set_param -n osd*.*MDT*.force_sync 1"
901 run_test 24 "reconstruct on non-existing object"
903 # end commit on sharing tests
908 $SETSTRIPE -i 0 -c 1 $DIR/$tfile
910 # get lock for the 1st client
911 dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
912 error "failed to write data"
914 # get waiting locks for the 2nd client
915 drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
918 # failover, replay and resend replayed waiting locks
919 if [ $(lustre_version_code ost1) -ge $(version_code 2.6.90) ]; then
920 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
921 do_facet ost1 lctl set_param fail_loc=0x80000325
923 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
924 do_facet ost1 lctl set_param fail_loc=0x80000213
929 # multiop does not finish because CP AST is skipped;
930 # it is ok to kill it in the test, because CP AST is already re-sent
931 # and it does not hang forever in real life
935 run_test 25 "replay|resend"
940 kill -9 $dbench_26_pid
945 local clients=${CLIENTS:-$HOSTNAME}
947 zconf_mount_clients $clients $MOUNT
950 [ "$SLOW" = "no" ] && duration=200
951 # set duration to 900 because it takes some time to boot node
952 [ "$FAILURE_MODE" = HARD ] && duration=900
954 local start_ts=$SECONDS
959 local tar_dir=$DIR/$tdir/run_tar
961 test_mkdir -p -c$MDSCOUNT $tar_dir || break
962 if [ $MDSCOUNT -ge 2 ]; then
963 $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
964 error "set default dirstripe failed"
967 tar cf - /etc | tar xf - || error "tar failed"
968 cd $DIR/$tdir || break
969 rm -rf $tar_dir || break
973 echo "Started tar $tar_26_pid"
976 local dbench_dir=$DIR2/$tdir/run_dbench
978 test_mkdir -p -c$MDSCOUNT $dbench_dir || break
979 if [ $MDSCOUNT -ge 2 ]; then
980 $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
981 error "set default dirstripe failed"
983 cd $dbench_dir || break
984 rundbench 1 -D $dbench_dir -t 100 &>/dev/null || break
985 cd $DIR/$tdir || break
986 rm -rf $dbench_dir || break
990 echo "Started dbench $dbench_26_pid"
992 local num_failovers=0
994 while [ $((SECONDS - start_ts)) -lt $duration ]; do
995 kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
996 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
998 replay_barrier mds$fail_index
999 sleep 2 # give clients a time to do operations
1000 # Increment the number of failovers
1001 num_failovers=$((num_failovers + 1))
1002 log "$TESTNAME fail mds$fail_index $num_failovers times"
1004 if [ $fail_index -ge $MDSCOUNT ]; then
1007 fail_index=$((fail_index + 1))
1010 # stop the client loads
1011 kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
1012 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
1015 run_test 26 "dbench and tar with mds failover"
1018 $SETSTRIPE -i 0 -c 1 $DIR2/$tfile
1019 dd if=/dev/zero of=$DIR2/$tfile bs=4096 count=1
1021 #define OBD_FAIL_LDLM_SRV_BL_AST 0x324
1022 do_facet ost1 $LCTL set_param fail_loc=0x80000324
1024 dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 &
1028 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
1029 do_facet ost1 $LCTL set_param fail_loc=0x32a
1034 cancel_lru_locks OST0000-osc
1035 wait $pid || error "dd failed"
1037 run_test 28 "lock replay should be ordered: waiting after granted"
1040 (( $MDSCOUNT < 2 )) && skip_env "needs >= 2 MDTs"
1042 # inject a gap with 10th transaction
1043 #define OBD_FAIL_LLOG_ADD_GAP 0x131d
1044 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10
1045 for ((i=0; i < 20; i++)); do
1046 $LFS setdirstripe -i1 $DIR/$tdir-$i ||
1047 error "can't mkdir $DIR/$tdir-$i"
1050 # prevent update llog cancellation, so that on the next boot the MDS has to
1051 # process the update llog with the injected gap
1052 #define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726
1053 $LCTL set_param fail_loc=0x726
1058 $LCTL set_param fail_loc=0
1065 local testid=$(echo $TESTNAME | tr '_' ' ')
1066 dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" &&
1067 error "client got evicted due to aborted recovery"
1070 run_test 32 "gap in update llog shouldn't break recovery"
# Suite teardown.
# SLEEP is the number of seconds elapsed since NOW.
# NOTE(review): NOW is assigned on a line that is not visible here; confirm
# it holds an earlier value of SECONDS.
1073 SLEEP=$((SECONDS - $NOW))
# Sleep for that elapsed time only when it is shorter than TIMEOUT.
1074 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
# Unmount the second mount point only when MOUNTED2 is "yes"; the trailing
# '|| true' keeps this line's exit status at zero in every case.
1075 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
1076 check_and_cleanup_lustre