# Environment defaults for this suite; each may be overridden by the caller.
PTLDEBUG=${PTLDEBUG:--1}
MOUNT_2=${MOUNT_2:-"yes"}

# LUSTRE must be resolved before LR_READER: the LR_READER default is built
# from $LUSTRE, and expanding it while LUSTRE is still unset would yield the
# bogus path "/utils/lr_reader".
# Default: the parent of the directory holding this script.
LUSTRE=${LUSTRE:-$(cd "$(dirname "$0")/.." && echo "$PWD")}
LR_READER=${LR_READER:-"$LUSTRE/utils/lr_reader"}

. "$LUSTRE/tests/test-framework.sh"
14 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# Start the skip list from the caller-supplied REPLAY_DUAL_EXCEPT.
ALWAYS_EXCEPT="$REPLAY_DUAL_EXCEPT "
# bug number for skipped test: LU-2012 LU-8333
ALWAYS_EXCEPT+=" 14b 21b"
# UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!

if [[ "$mds1_FSTYPE" == zfs ]]; then
	# bug number for skipped test: LU-2230
	ALWAYS_EXCEPT+=" 21b"
fi

# When SLOW is "no", record 21b in EXCEPT_SLOW.
if [ "$SLOW" = "no" ]; then
	EXCEPT_SLOW="21b "
fi
29 check_and_setup_lustre
31 MOUNTED=$(mounted_lustre_filesystems)
32 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
33 zconf_mount $HOSTNAME $MOUNT2
38 rm -rf $DIR/[df][0-9]*
40 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
42 # if there is no CLIENT1 defined, some tests can be run on localhost
43 CLIENT1=${CLIENT1:-$HOSTNAME}
44 # if CLIENT2 doesn't exist then use CLIENT1 instead
45 # All tests should use CLIENT2 with MOUNT2 only therefore it will work if
47 # The exception is a test which needs two separate nodes
48 CLIENT2=${CLIENT2:-$CLIENT1}
50 # LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
51 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
53 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
56 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
58 echo "Check file is LU482_FAILED=$LU482_FAILED"
59 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
60 replay_barrier $SINGLEMDS
61 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
63 createmany -o $MOUNT1/$tfile- 50
64 $LCTL set_param fail_loc=0x80000514
65 facet_failover $SINGLEMDS
66 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
67 client_up || (sleep 10; client_up) || (sleep 10; client_up) ||
68 error "reconnect failed"
70 client_up || (sleep 10; client_up) || (sleep 10; client_up) ||
71 error "reconnect failed"
# Mount $MOUNT2 again, then remove the files this test created.
zconf_mount "$(hostname)" $MOUNT2 || error "mount2 failed"
# Was "errot": with that typo a failed unlinkmany only produced a
# "command not found" instead of calling error() like the other checks here.
unlinkmany $MOUNT1/$tfile- 50 || error "unlinkmany failed"
rm $MOUNT2/$tfile || error "rm $MOUNT2/$tfile failed"
rm $MOUNT2/$tfile-A || error "rm $MOUNT2/$tfile-A failed"
77 run_test 0a "expired recovery with lost client"
79 if [ -f "$LU482_FAILED" ]; then
80 log "Found check file $LU482_FAILED, aborting test script"
81 rm -vf "$LU482_FAILED"
83 do_nodes $CLIENTS umount -f $MOUNT2 || true
84 do_nodes $CLIENTS umount -f $MOUNT || true
85 # copied from stopall, but avoid the MDS recovery
86 for num in `seq $OSTCOUNT`; do
88 rm -f $TMP/ost${num}active
90 if ! combined_mgs_mds ; then
98 replay_barrier $SINGLEMDS
100 touch $MOUNT1/$tfile-2
102 facet_failover $SINGLEMDS
# Mount both client mount points on this host; either failure is fatal.
# (Messages previously read "fais"; aligned with the "mountN failed" wording
# used by the other mount checks in this file.)
zconf_mount "$(hostname)" $MOUNT1 || error "mount1 failed"
zconf_mount "$(hostname)" $MOUNT2 || error "mount2 failed"
106 # it is uncertain if file-2 exists or not, remove it if it does
107 checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2
108 checkstat $MOUNT2/$tfile && return 2
111 run_test 0b "lost client during waiting for next transno"
115 replay_barrier $SINGLEMDS
119 checkstat $MOUNT2/a || return 1
120 checkstat $MOUNT1/b || return 2
121 rm $MOUNT2/a $MOUNT1/b
122 checkstat $MOUNT1/a && return 3
123 checkstat $MOUNT2/b && return 4
127 run_test 1 "|X| simple create"
131 replay_barrier $SINGLEMDS
135 checkstat $MOUNT2/adir || return 1
137 checkstat $MOUNT2/adir && return 2
140 run_test 2 "|X| mkdir adir"
143 replay_barrier $SINGLEMDS
145 mkdir $MOUNT2/adir/bdir
148 checkstat $MOUNT2/adir || return 1
149 checkstat $MOUNT1/adir/bdir || return 2
150 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
151 checkstat $MOUNT1/adir && return 3
152 checkstat $MOUNT2/adir/bdir && return 4
155 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
159 replay_barrier $SINGLEMDS
160 mkdir $MOUNT1/adir && return 1
161 mkdir $MOUNT2/adir/bdir
164 checkstat $MOUNT2/adir || return 2
165 checkstat $MOUNT1/adir/bdir || return 3
167 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
168 checkstat $MOUNT1/adir && return 4
169 checkstat $MOUNT2/adir/bdir && return 5
172 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
176 # multiclient version of replay_single.sh/test_8
178 multiop_bg_pause $MOUNT2/a o_tSc || return 1
181 replay_barrier $SINGLEMDS
183 wait $pid || return 1
186 [ -e $MOUNT2/a ] && return 2
189 run_test 5 "open, unlink |X| close"
194 multiop_bg_pause $MOUNT2/a o_c || return 1
196 multiop_bg_pause $MOUNT1/a o_c || return 1
199 replay_barrier $SINGLEMDS
201 wait $pid1 || return 1
205 wait $pid2 || return 1
206 [ -e $MOUNT2/a ] && return 2
209 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
212 replay_barrier $SINGLEMDS
213 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
215 checkstat $MOUNT2/$tfile || return 2
216 rm $MOUNT1/$tfile || return 3
220 run_test 8 "replay of resent request"
223 replay_barrier $SINGLEMDS
224 mcreate $MOUNT1/$tfile-1
225 mcreate $MOUNT2/$tfile-2
226 # drop first reint reply
227 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
229 do_facet $SINGLEMDS lctl set_param fail_loc=0
231 rm $MOUNT1/$tfile-[1,2] || return 1
235 run_test 9 "resending a replayed create"
238 mcreate $MOUNT1/$tfile-1
239 replay_barrier $SINGLEMDS
240 munlink $MOUNT1/$tfile-1
241 mcreate $MOUNT2/$tfile-2
242 # drop first reint reply
243 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
245 do_facet $SINGLEMDS lctl set_param fail_loc=0
247 checkstat $MOUNT1/$tfile-1 && return 1
248 checkstat $MOUNT1/$tfile-2 || return 2
253 run_test 10 "resending a replayed unlink"
256 replay_barrier $SINGLEMDS
257 mcreate $DIR1/$tfile-1
258 mcreate $DIR2/$tfile-2
259 mcreate $DIR1/$tfile-3
260 mcreate $DIR2/$tfile-4
261 mcreate $DIR1/$tfile-5
262 # drop all reint replies for a while
263 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
264 # note that with this fail_loc set, facet_failover df will fail
265 facet_failover $SINGLEMDS
267 local clients=${CLIENTS:-$HOSTNAME}
268 wait_clients_import_state "$clients" $SINGLEMDS FULL
270 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
272 rm $DIR1/$tfile-[1-5] || return 1
276 run_test 11 "both clients timeout during replay"
279 replay_barrier $SINGLEMDS
281 multiop_bg_pause $DIR/$tfile mo_c || return 1
284 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
285 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
286 facet_failover $SINGLEMDS
287 do_facet $SINGLEMDS lctl set_param fail_loc=0
288 clients_up || return 1
291 kill -USR1 $MULTIPID || return 3
292 wait $MULTIPID || return 4
293 $CHECKSTAT -t file $DIR/$tfile || return 2
298 run_test 12 "open resend timeout"
301 multiop_bg_pause $DIR/$tfile mo_c || return 1
304 replay_barrier $SINGLEMDS
306 kill -USR1 $MULTIPID || return 3
307 wait $MULTIPID || return 4
310 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
311 facet_failover $SINGLEMDS
312 do_facet $SINGLEMDS lctl set_param fail_loc=0
313 clients_up || return 1
316 $CHECKSTAT -t file $DIR/$tfile || return 2
321 run_test 13 "close resend timeout"
323 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
328 wait_delete_completed
330 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
332 mkdir -p $MOUNT1/$tdir
333 $LFS setstripe -i 0 $MOUNT1/$tdir
334 replay_barrier $SINGLEMDS
335 createmany -o $MOUNT1/$tdir/$tfile- 5
337 $LFS setstripe -i 0 $MOUNT2/$tfile-2
338 dd if=/dev/zero of=$MOUNT2/$tfile-2 bs=1M count=5
339 createmany -o $MOUNT1/$tdir/$tfile-3- 5
343 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
345 # first set of files should have been replayed
346 unlinkmany $MOUNT1/$tdir/$tfile- 5 || error "first unlinks failed"
347 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || error "second unlinks failed"
349 zconf_mount $HOSTNAME $MOUNT2 || error "mount $MOUNT2 failed"
350 [ -f $MOUNT2/$tfile-2 ] && error "$MOUNT2/$tfile-2 exists!"
352 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
353 wait_delete_completed || error "wait_delete_complete failed"
355 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
356 log "before $beforeused, after $afterused"
357 # leave some margin for some files/dirs to be modified (OI, llog, etc)
358 [ $afterused -le $((beforeused + $(fs_log_size))) ] ||
359 error "after $afterused > before $beforeused"
361 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
363 test_15a() { # was test_15
364 replay_barrier $SINGLEMDS
365 createmany -o $MOUNT1/$tfile- 25
366 createmany -o $MOUNT2/$tfile-2- 1
371 unlinkmany $MOUNT1/$tfile- 25 || return 2
372 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
374 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
377 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
380 replay_barrier $SINGLEMDS
381 for ((i = 0; i < 2000; i++)); do
382 echo "data" > "$MOUNT2/${tfile}-$i" ||
383 error "create ${tfile}-$i failed"
389 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
392 run_test 15c "remove multiple OST orphans"
395 replay_barrier $SINGLEMDS
396 createmany -o $MOUNT1/$tfile- 25
397 createmany -o $MOUNT2/$tfile-2- 1
400 facet_failover $SINGLEMDS
404 unlinkmany $MOUNT1/$tfile- 25 || return 2
406 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
409 run_test 16 "fail MDS during recovery (3571)"
412 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
414 createmany -o $MOUNT1/$tfile- 25
415 createmany -o $MOUNT2/$tfile-2- 1
417 # Make sure the disconnect is lost
425 unlinkmany $MOUNT1/$tfile- 25 || return 2
427 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
431 run_test 17 "fail OST during recovery (3571)"
433 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
436 test_18() { # bug 3822 - evicting client with enqueued lock
438 local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
439 do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
440 mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
441 touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
442 statmany -s $MOUNT1/$tdir/$tfile 1 500 &
445 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
446 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
448 #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
449 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
450 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
452 sleep 0.1 # wait to ensure first client is one that will be evicted
453 openfile -f O_RDONLY $MOUNT2/$tdir/$tfile
455 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
456 do_facet $SINGLEMDS lctl debug_kernel |
457 grep "not entering recovery" && error "client not evicted"
458 do_facet client "lctl set_param fail_loc=0"
459 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
461 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
463 test_19() { # Bug 10991 - resend of open request does not fail assertion.
464 replay_barrier $SINGLEMDS
465 drop_mdt_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
467 checkstat $DIR2/${tfile}0 || return 2
468 rm $DIR/${tfile}0 || return 3
472 run_test 19 "resend of open request"
475 local before=$SECONDS
476 replay_barrier $SINGLEMDS
482 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
483 local tier1=$((SECONDS - before))
486 replay_barrier $SINGLEMDS
492 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
493 local tier2=$((SECONDS - before))
495 # fail if the second recovery time is 1.5x the original recovery time or more
496 ((tier2 < tier1 * 6 / 4)) ||
497 error "recovery time $tier2 >= 1.5x original time $tier1"
499 run_test 20 "recovery time is not increasing"
501 # commit on sharing tests
503 local param_file=$TMP/$tfile-params
505 save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file
506 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
507 touch $MOUNT1/$tfile-1
508 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
509 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
510 replay_barrier_nosync $SINGLEMDS
513 facet_failover $SINGLEMDS
515 # all renames are replayed
516 unlink $MOUNT1/$tfile-3 || return 2
518 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
520 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
521 rm -rf $MOUNT1/$tfile-*
522 restore_lustre_params < $param_file
526 run_test 21a "commit on sharing"
530 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
533 do_node $CLIENT1 touch $MOUNT1/$tfile-1
534 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
535 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
537 replay_barrier_nosync $mds
538 shutdown_client $CLIENT2 $MOUNT1
542 # were renames replayed?
544 echo UNLINK $MOUNT1/$tfile-3
545 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 ||
546 { echo "unlink $tfile-3 fail!" && rc=1; }
549 zconf_mount_clients $CLIENT2 $MOUNT1 ||
550 error "mount $CLIENT2 $MOUNT1 fail"
556 [ $CLIENTCOUNT -lt 2 ] &&
557 { skip "Need 2+ clients, have $CLIENTCOUNT" && return; }
559 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
560 skip "Several MDTs on one MDS with FAILURE_MODE=$FAILURE_MODE"
564 zconf_umount_clients $CLIENTS $MOUNT2
565 zconf_mount_clients $CLIENTS $MOUNT1
567 local param_file=$TMP/$tfile-params
569 local mdtidx=$($LFS getstripe -m $MOUNT1)
570 local facet=mds$((mdtidx + 1))
572 save_lustre_params $facet "mdt.*.commit_on_sharing" > $param_file
576 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
578 test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
580 # there is still a window when transactions may be written to disk
581 # before the mds device is set R/O. To avoid such a rare test failure,
582 # the check is repeated several times.
586 # COS disabled (should fail)
587 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
589 test_21b_sub $facet || break
590 n_attempts=$((n_attempts + 1))
591 [ $n_attempts -gt 3 ] &&
592 error "can't check if COS works: rename replied w/o COS"
594 zconf_mount_clients $CLIENTS $MOUNT2
595 restore_lustre_params < $param_file
599 run_test 21b "commit on sharing, two clients"
602 checkstat $MOUNT1/$remote_dir || return 1
603 checkstat $MOUNT1/$remote_dir/dir || return 2
604 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 3
605 checkstat $MOUNT1/$remote_dir/dir/$tfile-1 || return 4
609 create_remote_dir_files_22() {
610 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir/dir || return 1
611 do_node $CLIENT1 createmany -o $MOUNT1/$remote_dir/dir/$tfile- 2 ||
613 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 ||
619 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
620 ([ $FAILURE_MODE == "HARD" ] &&
621 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
622 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
626 local remote_dir=$tdir/remote_dir
628 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
629 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
631 # OBD_FAIL_MDS_REINT_NET_REP 0x119
632 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
633 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
637 fail mds$((MDTIDX + 1))
638 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
640 replay_barrier mds$MDTIDX
641 create_remote_dir_files_22 || error "Remote creation failed $?"
644 checkstat_22 || error "check stat failed $?"
646 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
649 run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir"
652 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
654 local remote_dir=$tdir/remote_dir
656 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
657 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
659 # OBD_FAIL_MDS_REINT_NET_REP 0x119
660 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
661 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
665 fail mds${MDTIDX},mds$((MDTIDX + 1))
666 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
668 replay_barrier mds$MDTIDX
669 create_remote_dir_files_22 || error "Remote creation failed $?"
672 checkstat_22 || error "check stat failed $?"
674 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
677 run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir"
680 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
681 ([ $FAILURE_MODE == "HARD" ] &&
682 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
683 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
686 local remote_dir=$tdir/remote_dir
688 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
689 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
691 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
692 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
693 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
696 do_facet mds$MDTIDX lctl set_param fail_loc=0
699 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
701 replay_barrier mds$MDTIDX
702 create_remote_dir_files_22 || error "Remote creation failed $?"
705 checkstat_22 || error "check stat failed $?"
707 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
710 run_test 22c "c1 lfs mkdir -i 1 d1, M1 drop update & fail M1, c2 mkdir d1/dir"
713 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
715 local remote_dir=$tdir/remote_dir
717 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
718 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
720 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
721 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
722 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
725 do_facet mds$MDTIDX lctl set_param fail_loc=0
727 fail mds${MDTIDX},mds$((MDTIDX + 1))
728 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
730 replay_barrier mds$MDTIDX
731 create_remote_dir_files_22 || error "Remote creation failed $?"
734 checkstat_22 || error "check stat failed $?"
736 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
739 run_test 22d "c1 lfs mkdir -i 1 d1, M1 drop update & fail M0/M1,c2 mkdir d1/dir"
742 checkstat $MOUNT1/$remote_dir || return 1
743 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 2
747 create_remote_dir_files_23() {
748 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir || return 1
749 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 || return 2
754 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
755 ([ $FAILURE_MODE == "HARD" ] &&
756 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
757 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
760 local remote_dir=$tdir/remote_dir
762 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
763 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
764 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
765 error "lfs mkdir -i $MDTIDX failed"
766 # OBD_FAIL_MDS_REINT_NET_REP 0x119
767 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
768 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
771 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
773 fail mds$((MDTIDX + 1))
774 wait $CLIENT_PID || error "rmdir remote dir failed"
776 replay_barrier mds${MDTIDX}
777 create_remote_dir_files_23 || error "Remote creation failed $?"
780 checkstat_23 || error "check stat failed $?"
782 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
785 run_test 23a "c1 rmdir d1, M1 drop reply and fail, client2 mkdir d1"
788 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
790 local remote_dir=$tdir/remote_dir
# Create the parent directory with MDT index 0, then the remote directory
# with MDT index $MDTIDX, both from CLIENT1.
# The first message now names $MOUNT1, the path the command actually uses
# (it previously printed $MOUNT).
do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
	error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
	error "lfs mkdir -i $MDTIDX failed"
797 # OBD_FAIL_MDS_REINT_NET_REP 0x119
798 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
799 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
802 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
804 fail mds${MDTIDX},mds$((MDTIDX + 1))
805 wait $CLIENT_PID || error "rmdir remote dir failed"
807 replay_barrier mds${MDTIDX}
808 create_remote_dir_files_23 || error "Remote creation failed $?"
811 checkstat_23 || error "check stat failed $?"
813 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
816 run_test 23b "c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"
819 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
821 ([ $FAILURE_MODE == "HARD" ] &&
822 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
823 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
826 local remote_dir=$tdir/remote_dir
828 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
829 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
830 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
831 error "lfs mkdir -i $MDTIDX failed"
833 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
834 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
835 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
838 do_facet mds${MDTIDX} lctl set_param fail_loc=0
841 wait $CLIENT_PID || error "rmdir remote dir failed"
843 replay_barrier mds${MDTIDX}
844 create_remote_dir_files_23 || error "Remote creation failed $?"
847 checkstat_23 || error "check stat failed $?"
849 rm -rf $MOUNT1/$tdir || return 6
852 run_test 23c "c1 rmdir d1, M0 drop update reply and fail M0, c2 mkdir d1"
855 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
857 local remote_dir=$tdir/remote_dir
859 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
860 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
861 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
862 error "lfs mkdir -i $MDTIDX failed"
864 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
865 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
866 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
869 do_facet mds${MDTIDX} lctl set_param fail_loc=0
871 fail mds${MDTIDX},mds$((MDTIDX + 1))
872 wait $CLIENT_PID || error "rmdir remote dir failed"
874 replay_barrier mds${MDTIDX}
875 create_remote_dir_files_23 || error "Remote creation failed $?"
878 checkstat_23 || error "check stat failed $?"
880 rm -rf $MOUNT1/$tdir || return 6
883 run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
886 [[ "$MDS1_VERSION" -gt $(version_code 2.5.2) ]] ||
887 skip "Need MDS version newer than 2.5.2"
890 stat $MOUNT/$tfile >&/dev/null
891 # OBD_FAIL_MDS_REINT_NET_REP
892 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x119
893 $TRUNCATE $MOUNT/$tfile 100 &
896 do_facet $SINGLEMDS lctl set_param fail_loc=0
897 # sync to release rep-ack lock quickly
898 do_nodes $(comma_list $(mdts_nodes)) \
899 "lctl set_param -n osd*.*MDT*.force_sync 1"
903 run_test 24 "reconstruct on non-existing object"
905 # end commit on sharing tests
910 $LFS setstripe -i 0 -c 1 $DIR/$tfile
912 # get lock for the 1st client
913 dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
914 error "failed to write data"
916 # get waiting locks for the 2nd client
917 drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
920 # failover, replay and resend replayed waiting locks
921 if [ "$OST1_VERSION" -ge $(version_code 2.6.90) ]; then
922 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
923 do_facet ost1 lctl set_param fail_loc=0x80000325
925 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
926 do_facet ost1 lctl set_param fail_loc=0x80000213
931 # multiop does not finish because CP AST is skipped;
932 # it is ok to kill it in the test, because CP AST is already re-sent
933 # and it does not hang forever in real life
937 run_test 25 "replay|resend"
942 kill -9 $dbench_26_pid
947 local clients=${CLIENTS:-$HOSTNAME}
949 zconf_mount_clients $clients $MOUNT
952 [ "$SLOW" = "no" ] && duration=200
953 # set duration to 900 because it takes some time to boot node
954 [ "$FAILURE_MODE" = HARD ] && duration=900
956 local start_ts=$SECONDS
961 local tar_dir=$DIR/$tdir/run_tar
963 test_mkdir -p -c$MDSCOUNT $tar_dir || break
964 if [ $MDSCOUNT -ge 2 ]; then
965 $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
966 error "set default dirstripe failed"
969 tar cf - /etc | tar xf - || error "tar failed"
970 cd $DIR/$tdir || break
971 rm -rf $tar_dir || break
975 echo "Started tar $tar_26_pid"
978 local dbench_dir=$DIR2/$tdir/run_dbench
980 test_mkdir -p -c$MDSCOUNT $dbench_dir || break
981 if [ $MDSCOUNT -ge 2 ]; then
982 $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
983 error "set default dirstripe failed"
985 cd $dbench_dir || break
986 rundbench 1 -D $dbench_dir -t 100 &>/dev/null || break
987 cd $DIR/$tdir || break
988 rm -rf $dbench_dir || break
992 echo "Started dbench $dbench_26_pid"
994 local num_failovers=0
996 while [ $((SECONDS - start_ts)) -lt $duration ]; do
997 kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
998 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
1000 replay_barrier mds$fail_index
1001 sleep 2 # give clients a time to do operations
1002 # Increment the number of failovers
1003 num_failovers=$((num_failovers + 1))
1004 log "$TESTNAME fail mds$fail_index $num_failovers times"
1006 if [ $fail_index -ge $MDSCOUNT ]; then
1009 fail_index=$((fail_index + 1))
1012 # stop the client loads
1013 kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
1014 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
1017 run_test 26 "dbench and tar with mds failover"
1020 $LFS setstripe -i 0 -c 1 $DIR2/$tfile
1021 dd if=/dev/zero of=$DIR2/$tfile bs=4096 count=1
1023 #define OBD_FAIL_LDLM_SRV_BL_AST 0x324
1024 do_facet ost1 $LCTL set_param fail_loc=0x80000324
1026 dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 &
1030 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
1031 do_facet ost1 $LCTL set_param fail_loc=0x32a
1036 cancel_lru_locks OST0000-osc
1037 wait $pid || error "dd failed"
1039 run_test 28 "lock replay should be ordered: waiting after granted"
1042 local dir0=$DIR/$tdir/d0
1043 local dir1=$DIR/$tdir/d1
1045 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
1046 [ $CLIENTCOUNT -lt 2 ] && skip "needs >= 2 clients" && return 0
1047 [ "$CLIENT1" == "$CLIENT2" ] &&
1048 skip "clients must be on different nodes" && return 0
1051 $LFS mkdir -i0 $dir0
1052 $LFS mkdir -i1 $dir1
1056 # create a remote dir, drop reply
1057 #define OBD_FAIL_PTLRPC_ROUND_XID 0x530
1058 $LCTL set_param fail_loc=0x530 fail_val=36
1059 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
1060 do_facet mds2 $LCTL set_param fail_loc=0x8000015a
1061 echo make remote dir d0 for $dir0
1062 $LFS mkdir -i1 -c1 $dir0/d3 &
1065 echo make local dir d1 for $dir1
1066 do_node $CLIENT2 $LCTL set_param fail_loc=0x530 fail_val=36
1067 do_node $CLIENT2 mkdir $dir1/d4
1071 run_test 29 "replay vs update with the same xid"
1074 $LFS setstripe -E 1m -L mdt -E -1 $DIR/$tfile
1075 #first write to have no problems with grants
1076 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 ||
1077 error "dd on client failed"
1078 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 seek=10 ||
1079 error "dd on client failed"
1081 #define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e
1082 lctl set_param fail_loc=0x32e fail_val=4
1083 dd of=/dev/null if=$DIR2/$tfile &
1089 wait $pid || error "dd on client failed"
1091 run_test 30 "layout lock replay is not blocked on IO"
1094 mkdir_on_mdt0 $DIR1/$tdir
1095 $LFS setstripe -c 1 -i 0 $DIR1/$tdir
1096 for (( i=0; i < 10; i++ )) ; do
1097 mkdir -p $DIR1/$tdir/d.${i}
1099 mkdir $DIR1/$tdir/mdtdir
1100 $LFS setstripe -E 1M -L mdt $DIR1/$tdir/mdtdir
1102 # failover has to take longer than blocking timeout involved
1103 # by second multiop below which is set to obd_timeout/2 by
1105 local timeout=$(do_facet mds1 $LCTL get_param -n timeout)
1107 timeout=$((timeout / 2 + 5))
1108 fail ost1 $timeout &
1113 # consume preallocated objects, precreate thread will be awakened
1114 consume_precreations $DIR1/$tdir mds1 0 1
1116 # disable AT so that blocking timeout gets set to obd_timeout/2
1117 local amm=$(at_max_get mds1)
1120 stack_trap "at_max_set $amm mds1"
1124 #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420
1125 $LCTL set_param fail_loc=0x80001420
1126 $MULTIOP $DIR1/$tdir/mdtdir/$tfile Osw4096c &
1129 $MULTIOP $DIR2/$tdir/mdtdir/$tfile oO_WRONLY:w4096c &
1132 local mmrif=$($LCTL get_param -n \
1133 mdc.$FSNAME-MDT0000-mdc-*.max_mod_rpcs_in_flight | tail -1)
1134 # these are blocked by precreation until ost failover is in progress
1135 for (( i=0; i < $mmrif; i++ )) ; do
1136 $MULTIOP $DIR1/$tdir/d.${i}/parallel Oc &
1142 for pid in "${multiops[@]}"; do
1143 wait $pid || ((failed++))
1145 ((failed == 0)) || error "$failed multiops failed"
1147 run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots"
1150 (( $MDSCOUNT < 2 )) && skip_env "needs >= 2 MDTs"
1152 # inject a gap with 10th transaction
1153 #define OBD_FAIL_LLOG_ADD_GAP 0x131d
1154 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10
1155 for ((i=0; i < 20; i++)); do
1156 $LFS setdirstripe -i1 $DIR/$tdir-$i ||
1157 error "can't mkdir $DIR/$tdir-$i"
1160 # prevent update llog cancellation, so that on its next boot the MDS has to
1161 # process the update llog with the gap injected
1162 #define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726
1163 $LCTL set_param fail_loc=0x726
1168 $LCTL set_param fail_loc=0
1175 local testid=$(echo $TESTNAME | tr '_' ' ')
1176 dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" &&
1177 error "client got evicted due to aborted recovery"
1180 run_test 32 "gap in update llog shouldn't break recovery"
1182 last_rcvd_check_incompat_flag() {
1184 local flag2check="$2"
1185 local dev=$(facet_device $facet)
1188 incompat=$(do_facet $facet $LR_READER $dev |
1189 awk '/feature_incompat:/ {print $2}')
1190 echo "last_rcvd in $dev: incompat = $incompat"
1192 return $(( (incompat & flag2check) != flag2check ))
1196 test_33() { # LU-15935
1197 [[ "$mds1_FSTYPE" == "ldiskfs" ]] || skip "ldiskfs only test"
1202 # check for OBD_INCOMPAT_MULTI_RPCS (0x400) in last_rcvd
1203 last_rcvd_check_incompat_flag mds1 0x400 ||
1204 error "1st failover: OBD_INCOMPAT_MULTI_RPCS is not set on MDT0000 last_rcvd"
1206 # lose 1 client during the MDT failover
1210 wait_clients_import_state "$HOSTNAME" mds1 "\(REPLAY_WAIT\|REPLAY_LOCKS\)"
1212 do_facet mds1 $LCTL --device $(convert_facet2label mds1) abort_recovery
1213 wait_clients_import_state "$HOSTNAME" mds1 "FULL"
# Check again for OBD_INCOMPAT_MULTI_RPCS (0x400) in the mds1 last_rcvd.
# (Message previously read "2sd failover".)
last_rcvd_check_incompat_flag mds1 0x400 ||
	error "2nd failover: OBD_INCOMPAT_MULTI_RPCS is not set on MDT0000 last_rcvd"
1220 zconf_mount $HOSTNAME $MOUNT2
1221 wait_clients_import_state "$HOSTNAME" mds1 "FULL"
1223 run_test 33 "Check for OBD_INCOMPAT_MULTI_RPCS in last_rcvd after abort_recovery"
# Seconds elapsed since $NOW (NOW is set outside this excerpt).
SLEEP=$((SECONDS - $NOW))
# Sleep for that same interval, but only when it is shorter than TIMEOUT.
if [ $SLEEP -lt $TIMEOUT ]; then
	sleep $SLEEP
fi
# Unmount the second mount point when MOUNTED2 is "yes"; an unmount failure
# is deliberately ignored.
if [ "$MOUNTED2" = yes ]; then
	zconf_umount $HOSTNAME $MOUNT2 || true
fi
check_and_cleanup_lustre