5 PTLDEBUG=${PTLDEBUG:--1}
6 MOUNT_2=${MOUNT_2:-"yes"}
7 LR_READER=${LR_READER:-"$LUSTRE/utils/lr_reader"}
9 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
10 . $LUSTRE/tests/test-framework.sh
14 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
16 ALWAYS_EXCEPT="$REPLAY_DUAL_EXCEPT "
17 # bug number for skipped test: LU-2012 LU-8333
18 ALWAYS_EXCEPT+=" 14b 21b"
19 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
21 [[ "$mds1_FSTYPE" == zfs ]] &&
22 # bug number for skipped test: LU-2230
23 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 21b"
26 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b "
29 check_and_setup_lustre
31 MOUNTED=$(mounted_lustre_filesystems)
32 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
33 zconf_mount $HOSTNAME $MOUNT2
38 rm -rf $DIR/[df][0-9]*
40 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
# if there is no CLIENT1 defined, some tests can be run on localhost
43 CLIENT1=${CLIENT1:-$HOSTNAME}
44 # if CLIENT2 doesn't exist then use CLIENT1 instead
45 # All tests should use CLIENT2 with MOUNT2 only therefore it will work if
47 # Exception is the test which need two separate nodes
48 CLIENT2=${CLIENT2:-$CLIENT1}
50 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
52 echo "Check file is LU482_FAILED=$LU482_FAILED"
53 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
54 replay_barrier $SINGLEMDS
55 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | CFS_FAIL_ONCE
57 createmany -o $MOUNT1/$tfile- 50
58 $LCTL set_param fail_loc=0x80000514
59 facet_failover $SINGLEMDS
60 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
61 client_up || (sleep 10; client_up) || (sleep 10; client_up) ||
62 error "reconnect failed"
64 client_up || (sleep 10; client_up) || (sleep 10; client_up) ||
65 error "reconnect failed"
66 zconf_mount `hostname` $MOUNT2 || error "mount2 failed"
67 unlinkmany $MOUNT1/$tfile- 50 || errot "unlinkmany failed"
68 rm $MOUNT2/$tfile || error "rm $MOUNT2/$tfile failed"
69 rm $MOUNT2/$tfile-A || error "rm $MOUNT2/$tfile-A failed"
71 run_test 0a "expired recovery with lost client"
73 if [ -f "$LU482_FAILED" ]; then
74 log "Found check file $LU482_FAILED, aborting test script"
75 rm -vf "$LU482_FAILED"
76 complete_test $SECONDS
77 do_nodes $CLIENTS umount -f $MOUNT2 || true
78 do_nodes $CLIENTS umount -f $MOUNT || true
79 # copied from stopall, but avoid the MDS recovery
80 for num in `seq $OSTCOUNT`; do
82 rm -f $TMP/ost${num}active
84 if ! combined_mgs_mds ; then
92 replay_barrier $SINGLEMDS
94 touch $MOUNT1/$tfile-2
96 facet_failover $SINGLEMDS
98 zconf_mount `hostname` $MOUNT1 || error "mount1 fais"
99 zconf_mount `hostname` $MOUNT2 || error "mount2 fais"
100 # it is uncertain if file-2 exists or not, remove it if it does
101 checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2
102 checkstat $MOUNT2/$tfile && return 2
105 run_test 0b "lost client during waiting for next transno"
109 replay_barrier $SINGLEMDS
113 checkstat $MOUNT2/a || return 1
114 checkstat $MOUNT1/b || return 2
115 rm $MOUNT2/a $MOUNT1/b
116 checkstat $MOUNT1/a && return 3
117 checkstat $MOUNT2/b && return 4
121 run_test 1 "|X| simple create"
125 replay_barrier $SINGLEMDS
129 checkstat $MOUNT2/adir || return 1
131 checkstat $MOUNT2/adir && return 2
134 run_test 2 "|X| mkdir adir"
137 replay_barrier $SINGLEMDS
139 mkdir $MOUNT2/adir/bdir
142 checkstat $MOUNT2/adir || return 1
143 checkstat $MOUNT1/adir/bdir || return 2
144 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
145 checkstat $MOUNT1/adir && return 3
146 checkstat $MOUNT2/adir/bdir && return 4
149 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
153 replay_barrier $SINGLEMDS
154 mkdir $MOUNT1/adir && return 1
155 mkdir $MOUNT2/adir/bdir
158 checkstat $MOUNT2/adir || return 2
159 checkstat $MOUNT1/adir/bdir || return 3
161 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
162 checkstat $MOUNT1/adir && return 4
163 checkstat $MOUNT2/adir/bdir && return 5
166 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
170 # multiclient version of replay_single.sh/test_8
172 multiop_bg_pause $MOUNT2/a o_tSc || return 1
175 replay_barrier $SINGLEMDS
177 wait $pid || return 1
180 [ -e $MOUNT2/a ] && return 2
183 run_test 5 "open, unlink |X| close"
188 multiop_bg_pause $MOUNT2/a o_c || return 1
190 multiop_bg_pause $MOUNT1/a o_c || return 1
193 replay_barrier $SINGLEMDS
195 wait $pid1 || return 1
199 wait $pid2 || return 1
200 [ -e $MOUNT2/a ] && return 2
203 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
206 replay_barrier $SINGLEMDS
207 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
209 checkstat $MOUNT2/$tfile || return 2
210 rm $MOUNT1/$tfile || return 3
214 run_test 8 "replay of resent request"
217 replay_barrier $SINGLEMDS
218 mcreate $MOUNT1/$tfile-1
219 mcreate $MOUNT2/$tfile-2
220 # drop first reint reply
221 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
223 do_facet $SINGLEMDS lctl set_param fail_loc=0
225 rm $MOUNT1/$tfile-[1,2] || return 1
229 run_test 9 "resending a replayed create"
232 mcreate $MOUNT1/$tfile-1
233 replay_barrier $SINGLEMDS
234 unlink $MOUNT1/$tfile-1
235 mcreate $MOUNT2/$tfile-2
236 # drop first reint reply
237 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
239 do_facet $SINGLEMDS lctl set_param fail_loc=0
241 checkstat $MOUNT1/$tfile-1 && return 1
242 checkstat $MOUNT1/$tfile-2 || return 2
247 run_test 10 "resending a replayed unlink"
250 replay_barrier $SINGLEMDS
251 mcreate $DIR1/$tfile-1
252 mcreate $DIR2/$tfile-2
253 mcreate $DIR1/$tfile-3
254 mcreate $DIR2/$tfile-4
255 mcreate $DIR1/$tfile-5
256 # drop all reint replies for a while
257 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
258 # note that with this fail_loc set, facet_failover df will fail
259 facet_failover $SINGLEMDS
261 local clients=${CLIENTS:-$HOSTNAME}
262 wait_clients_import_state "$clients" $SINGLEMDS FULL
264 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
266 rm $DIR1/$tfile-[1-5] || return 1
270 run_test 11 "both clients timeout during replay"
273 replay_barrier $SINGLEMDS
275 multiop_bg_pause $DIR/$tfile mo_c || return 1
278 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
279 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
280 facet_failover $SINGLEMDS
281 do_facet $SINGLEMDS lctl set_param fail_loc=0
282 clients_up || return 1
285 kill -USR1 $MULTIPID || return 3
286 wait $MULTIPID || return 4
287 $CHECKSTAT -t file $DIR/$tfile || return 2
292 run_test 12 "open resend timeout"
295 multiop_bg_pause $DIR/$tfile mo_c || return 1
298 replay_barrier $SINGLEMDS
300 kill -USR1 $MULTIPID || return 3
301 wait $MULTIPID || return 4
304 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
305 facet_failover $SINGLEMDS
306 do_facet $SINGLEMDS lctl set_param fail_loc=0
307 clients_up || return 1
310 $CHECKSTAT -t file $DIR/$tfile || return 2
315 run_test 13 "close resend timeout"
317 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
322 wait_delete_completed
324 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
326 mkdir -p $MOUNT1/$tdir
327 $LFS setstripe -i 0 $MOUNT1/$tdir
328 replay_barrier $SINGLEMDS
329 createmany -o $MOUNT1/$tdir/$tfile- 5
331 $LFS setstripe -i 0 $MOUNT2/$tfile-2
332 dd if=/dev/zero of=$MOUNT2/$tfile-2 bs=1M count=5
333 createmany -o $MOUNT1/$tdir/$tfile-3- 5
337 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
339 # first set of files should have been replayed
340 unlinkmany $MOUNT1/$tdir/$tfile- 5 || error "first unlinks failed"
341 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || error "second unlinks failed"
343 zconf_mount $HOSTNAME $MOUNT2 || error "mount $MOUNT2 failed"
344 [ -f $MOUNT2/$tfile-2 ] && error "$MOUNT2/$tfile-2 exists!"
346 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
347 wait_delete_completed || error "wait_delete_complete failed"
349 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
350 log "before $beforeused, after $afterused"
351 # leave some margin for some files/dirs to be modified (OI, llog, etc)
352 [ $afterused -le $((beforeused + $(fs_log_size))) ] ||
353 error "after $afterused > before $beforeused"
355 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
357 test_15a() { # was test_15
358 replay_barrier $SINGLEMDS
359 createmany -o $MOUNT1/$tfile- 25
360 createmany -o $MOUNT2/$tfile-2- 1
365 unlinkmany $MOUNT1/$tfile- 25 || return 2
366 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
368 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
371 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
374 replay_barrier $SINGLEMDS
375 for ((i = 0; i < 2000; i++)); do
376 echo "data" > "$MOUNT2/${tfile}-$i" ||
377 error "create ${tfile}-$i failed"
383 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
386 run_test 15c "remove multiple OST orphans"
389 replay_barrier $SINGLEMDS
390 createmany -o $MOUNT1/$tfile- 25
391 createmany -o $MOUNT2/$tfile-2- 1
394 facet_failover $SINGLEMDS
398 unlinkmany $MOUNT1/$tfile- 25 || return 2
400 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
403 run_test 16 "fail MDS during recovery (3571)"
406 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
408 createmany -o $MOUNT1/$tfile- 25
409 createmany -o $MOUNT2/$tfile-2- 1
411 # Make sure the disconnect is lost
419 unlinkmany $MOUNT1/$tfile- 25 || return 2
421 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
425 run_test 17 "fail OST during recovery (3571)"
427 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
430 test_18() { # bug 3822 - evicting client with enqueued lock
432 local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
433 do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
434 mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
435 touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
436 statmany -s $MOUNT1/$tdir/$tfile 1 500 &
439 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
440 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
442 #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
443 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
444 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
446 sleep 0.1 # wait to ensure first client is one that will be evicted
447 openfile -f O_RDONLY $MOUNT2/$tdir/$tfile
449 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
450 do_facet $SINGLEMDS lctl debug_kernel |
451 grep "not entering recovery" && error "client not evicted"
452 do_facet client "lctl set_param fail_loc=0"
453 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
455 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
457 test_19() { # Bug 10991 - resend of open request does not fail assertion.
458 replay_barrier $SINGLEMDS
459 drop_mdt_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
461 checkstat $DIR2/${tfile}0 || return 2
462 rm $DIR/${tfile}0 || return 3
466 run_test 19 "resend of open request"
469 local before=$SECONDS
470 replay_barrier $SINGLEMDS
476 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
477 local tier1=$((SECONDS - before))
480 replay_barrier $SINGLEMDS
486 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
487 local tier2=$((SECONDS - before))
	# fail if the second recovery took 1.5x (or more) the time of the
	# first one — recovery time must not keep growing across failovers
	((tier2 < tier1 * 6 / 4)) ||
		error "recovery time $tier2 >= 1.5x original time $tier1"
493 run_test 20 "recovery time is not increasing"
495 # commit on sharing tests
497 local param_file=$TMP/$tfile-params
499 save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file
500 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
501 touch $MOUNT1/$tfile-1
502 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
503 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
504 replay_barrier_nosync $SINGLEMDS
507 facet_failover $SINGLEMDS
509 # all renames are replayed
510 unlink $MOUNT1/$tfile-3 || return 2
512 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
514 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
515 rm -rf $MOUNT1/$tfile-*
516 restore_lustre_params < $param_file
520 run_test 21a "commit on sharing"
524 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
527 do_node $CLIENT1 touch $MOUNT1/$tfile-1
528 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
529 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
531 replay_barrier_nosync $mds
532 shutdown_client $CLIENT2 $MOUNT1
536 # were renames replayed?
538 echo UNLINK $MOUNT1/$tfile-3
539 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 ||
540 { echo "unlink $tfile-3 fail!" && rc=1; }
543 zconf_mount_clients $CLIENT2 $MOUNT1 ||
544 error "mount $CLIENT2 $MOUNT1 fail"
550 [ $CLIENTCOUNT -lt 2 ] &&
551 { skip "Need 2+ clients, have $CLIENTCOUNT" && return; }
553 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
554 skip "Several MDTs on one MDS with FAILURE_MODE=$FAILURE_MODE"
558 zconf_umount_clients $CLIENTS $MOUNT2
559 zconf_mount_clients $CLIENTS $MOUNT1
561 local param_file=$TMP/$tfile-params
563 local mdtidx=$($LFS getstripe -m $MOUNT1)
564 local facet=mds$((mdtidx + 1))
566 save_lustre_params $facet "mdt.*.commit_on_sharing" > $param_file
570 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
572 test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
574 # there is still a window when transactions may be written to disk
575 # before the mds device is set R/O. To avoid such a rare test failure,
576 # the check is repeated several times.
580 # COS disabled (should fail)
581 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
583 test_21b_sub $facet || break
584 n_attempts=$((n_attempts + 1))
585 [ $n_attempts -gt 3 ] &&
586 error "can't check if COS works: rename replied w/o COS"
588 zconf_mount_clients $CLIENTS $MOUNT2
589 restore_lustre_params < $param_file
593 run_test 21b "commit on sharing, two clients"
596 checkstat $MOUNT1/$remote_dir || return 1
597 checkstat $MOUNT1/$remote_dir/dir || return 2
598 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 3
599 checkstat $MOUNT1/$remote_dir/dir/$tfile-1 || return 4
603 create_remote_dir_files_22() {
604 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir/dir || return 1
605 do_node $CLIENT1 createmany -o $MOUNT1/$remote_dir/dir/$tfile- 2 ||
607 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 ||
613 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
614 ([ $FAILURE_MODE == "HARD" ] &&
615 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
616 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
620 local remote_dir=$tdir/remote_dir
622 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
623 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
625 # OBD_FAIL_MDS_REINT_NET_REP 0x119
626 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
627 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
631 fail mds$((MDTIDX + 1))
632 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
634 replay_barrier mds$MDTIDX
635 create_remote_dir_files_22 || error "Remote creation failed $?"
638 checkstat_22 || error "check stat failed $?"
640 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
643 run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir"
646 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
648 local remote_dir=$tdir/remote_dir
650 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
651 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
653 # OBD_FAIL_MDS_REINT_NET_REP 0x119
654 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
655 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
659 fail mds${MDTIDX},mds$((MDTIDX + 1))
660 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
662 replay_barrier mds$MDTIDX
663 create_remote_dir_files_22 || error "Remote creation failed $?"
666 checkstat_22 || error "check stat failed $?"
668 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
671 run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir"
674 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
675 ([ $FAILURE_MODE == "HARD" ] &&
676 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
677 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
680 local remote_dir=$tdir/remote_dir
682 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
683 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
685 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
686 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
687 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
690 do_facet mds$MDTIDX lctl set_param fail_loc=0
693 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
695 replay_barrier mds$MDTIDX
696 create_remote_dir_files_22 || error "Remote creation failed $?"
699 checkstat_22 || error "check stat failed $?"
701 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
704 run_test 22c "c1 lfs mkdir -i 1 d1, M1 drop update & fail M1, c2 mkdir d1/dir"
707 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
709 local remote_dir=$tdir/remote_dir
711 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
712 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
	# let previous transactions complete
715 # (distributed llog cancels, etc)
716 do_nodes $(comma_list $(mdts_nodes)) \
717 "$LCTL set_param -n osd*.*MDT*.force_sync=1"
720 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
721 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
722 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
725 do_facet mds$MDTIDX lctl set_param fail_loc=0
727 fail mds${MDTIDX},mds$((MDTIDX + 1))
728 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
730 replay_barrier mds$MDTIDX
731 create_remote_dir_files_22 || error "Remote creation failed $?"
734 checkstat_22 || error "check stat failed $?"
736 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
739 run_test 22d "c1 lfs mkdir -i 1 d1, M1 drop update & fail M0/M1,c2 mkdir d1/dir"
742 checkstat $MOUNT1/$remote_dir || return 1
743 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 2
# Helper for the 23* tests: after failover, CLIENT2 recreates the remote
# directory and populates it to verify the namespace is usable again.
# NOTE(review): relies on $remote_dir being set by the calling test — confirm.
create_remote_dir_files_23() {
	do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir || return 1
	do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 || return 2
754 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
755 ([ $FAILURE_MODE == "HARD" ] &&
756 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
757 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
760 local remote_dir=$tdir/remote_dir
762 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
763 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
764 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
765 error "lfs mkdir -i $MDTIDX failed"
766 # OBD_FAIL_MDS_REINT_NET_REP 0x119
767 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
768 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
771 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
773 fail mds$((MDTIDX + 1))
774 wait $CLIENT_PID || error "rmdir remote dir failed"
776 replay_barrier mds${MDTIDX}
777 create_remote_dir_files_23 || error "Remote creation failed $?"
780 checkstat_23 || error "check stat failed $?"
782 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
785 run_test 23a "c1 rmdir d1, M1 drop reply and fail, client2 mkdir d1"
788 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
790 local remote_dir=$tdir/remote_dir
792 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
793 error "lfs mkdir -i 0 $MOUNT/$tdir failed"
794 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
795 error "lfs mkdir -i $MDTIDX failed"
797 # OBD_FAIL_MDS_REINT_NET_REP 0x119
798 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
799 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
802 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
804 fail mds${MDTIDX},mds$((MDTIDX + 1))
805 wait $CLIENT_PID || error "rmdir remote dir failed"
807 replay_barrier mds${MDTIDX}
808 create_remote_dir_files_23 || error "Remote creation failed $?"
811 checkstat_23 || error "check stat failed $?"
813 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
816 run_test 23b "c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"
819 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
821 ([ $FAILURE_MODE == "HARD" ] &&
822 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
823 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
826 local remote_dir=$tdir/remote_dir
828 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
829 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
830 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
831 error "lfs mkdir -i $MDTIDX failed"
833 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
834 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
835 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
838 do_facet mds${MDTIDX} lctl set_param fail_loc=0
841 wait $CLIENT_PID || error "rmdir remote dir failed"
843 replay_barrier mds${MDTIDX}
844 create_remote_dir_files_23 || error "Remote creation failed $?"
847 checkstat_23 || error "check stat failed $?"
849 rm -rf $MOUNT1/$tdir || return 6
852 run_test 23c "c1 rmdir d1, M0 drop update reply and fail M0, c2 mkdir d1"
855 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
857 local remote_dir=$tdir/remote_dir
859 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
860 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
861 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
862 error "lfs mkdir -i $MDTIDX failed"
	# let previous transactions complete
865 # (distributed llog cancels, etc)
866 do_nodes $(comma_list $(mdts_nodes)) \
867 "$LCTL set_param -n osd*.*MDT*.force_sync=1"
870 # OBD_FAIL_UPDATE_OBJ_NET 0x1701
871 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
872 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
875 do_facet mds${MDTIDX} lctl set_param fail_loc=0
877 fail mds${MDTIDX},mds$((MDTIDX + 1))
878 wait $CLIENT_PID || error "rmdir remote dir failed"
880 replay_barrier mds${MDTIDX}
881 create_remote_dir_files_23 || error "Remote creation failed $?"
884 checkstat_23 || error "check stat failed $?"
886 rm -rf $MOUNT1/$tdir || return 6
889 run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
892 [[ "$MDS1_VERSION" -gt $(version_code 2.5.2) ]] ||
893 skip "Need MDS version newer than 2.5.2"
896 stat $MOUNT/$tfile >&/dev/null
897 # OBD_FAIL_MDS_REINT_NET_REP
898 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x119
899 $TRUNCATE $MOUNT/$tfile 100 &
902 do_facet $SINGLEMDS lctl set_param fail_loc=0
903 # sync to release rep-ack lock quickly
904 do_nodes $(comma_list $(mdts_nodes)) \
905 "lctl set_param -n osd*.*MDT*.force_sync 1"
909 run_test 24 "reconstruct on non-existing object"
911 # end commit on sharing tests
916 $LFS setstripe -i 0 -c 1 $DIR/$tfile
918 # get lock for the 1st client
919 dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
920 error "failed to write data"
922 # get waiting locks for the 2nd client
923 drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
926 # failover, replay and resend replayed waiting locks
927 if [ "$OST1_VERSION" -ge $(version_code 2.6.90) ]; then
928 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
929 do_facet ost1 lctl set_param fail_loc=0x80000325
931 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
932 do_facet ost1 lctl set_param fail_loc=0x80000213
937 # multiop does not finish because CP AST is skipped;
938 # it is ok to kill it in the test, because CP AST is already re-sent
	# and it does not hang forever in real life
940 killall -r "(multiop)$"
943 run_test 25 "replay|resend"
951 for pid_26 in "${pids_26[@]}"; do
952 if [[ -n "$pid_26" ]]; then
953 kill -0 "$pid_26" && kill "$pid_26" && \
954 wait "$pid_26" || true
958 for dir_26 in "${dirs_26[@]}"; do
959 if [[ -n "$dir_26" && -d "$dir_26" ]]; then
966 local clients=${CLIENTS:-$HOSTNAME}
968 zconf_mount_clients $clients $MOUNT
971 [[ "$SLOW" == "no" ]] && duration=200
972 # set duration to 900 because it takes some time to boot node
973 [[ "$FAILURE_MODE" == HARD ]] && duration=900
975 local start_ts=$SECONDS
978 stack_trap cleanup_26
980 local tar_dir=$DIR/$tdir/run_tar
984 stack_trap 'set +e; jobs -p | xargs -r kill; wait; exit' \
988 test_mkdir -p -c$MDSCOUNT $tar_dir || break
990 if (( MDSCOUNT >= 2 )); then
991 $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
992 error "set default dirstripe failed"
996 tar -C / -cf - etc | tar -xf - &
1001 wait $tar_pid || tar_rc=$?
1003 if (( tar_rc > 0 && tar_rc <= 128 )); then
1004 error "tar failed with rc $tar_rc"
1007 cd $DIR/$tdir || break
1008 rm -rf $tar_dir || break
1014 echo "Started tar loop with pid $tar_26_pid"
1015 pids_26+=($tar_26_pid)
1017 local dbench_dir=$DIR2/$tdir/run_dbench
1019 dirs_26+=($dbench_dir)
1021 stack_trap 'set +e; jobs -p | xargs -r kill; wait; exit' \
1025 test_mkdir -p -c$MDSCOUNT $dbench_dir || break
1027 if (( MDSCOUNT >= 2 )); then
1028 $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
1029 error "set default dirstripe failed"
1032 cd $dbench_dir || break
1033 bash rundbench 1 -D $dbench_dir -t 100 &
1038 wait $dbench_pid || dbench_rc=$?
1040 if (( dbench_rc > 0 && dbench_rc <= 128 )); then
1041 error "dbench failed with rc $dbench_rc"
1044 cd $DIR/$tdir || break
1045 rm -rf $dbench_dir || break
1049 local dbench_26_pid=$!
1051 echo "Started dbench loop with $dbench_26_pid"
1052 pids_26+=($dbench_26_pid)
1054 local num_failovers=0
1057 while (( (SECONDS - start_ts) < duration )); do
1058 kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
1059 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
1061 replay_barrier mds$fail_index
1062 sleep 2 # give clients a time to do operations
1063 # Increment the number of failovers
1064 num_failovers=$((num_failovers + 1))
1065 log "$TESTNAME fail mds$fail_index $num_failovers times"
1067 if (( fail_index < MDSCOUNT )); then
1068 fail_index=$((fail_index + 1))
1074 # stop the client loads
1075 kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
1076 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
1080 run_test 26 "dbench and tar with mds failover"
1083 $LFS setstripe -i 0 -c 1 $DIR2/$tfile
1084 dd if=/dev/zero of=$DIR2/$tfile bs=4096 count=1
1086 #define OBD_FAIL_LDLM_SRV_BL_AST 0x324
1087 do_facet ost1 $LCTL set_param fail_loc=0x80000324
1089 dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 &
1093 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
1094 do_facet ost1 $LCTL set_param fail_loc=0x32a
1099 cancel_lru_locks OST0000-osc
1100 wait $pid || error "dd failed"
1102 run_test 28 "lock replay should be ordered: waiting after granted"
1105 local dir0=$DIR/$tdir/d0
1106 local dir1=$DIR/$tdir/d1
1108 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
1109 [ $CLIENTCOUNT -lt 2 ] && skip "needs >= 2 clients" && return 0
1110 [ "$CLIENT1" == "$CLIENT2" ] &&
1111 skip "clients must be on different nodes" && return 0
1114 $LFS mkdir -i0 $dir0
1115 $LFS mkdir -i1 $dir1
1119 # create a remote dir, drop reply
1120 #define OBD_FAIL_PTLRPC_ROUND_XID 0x530
1121 $LCTL set_param fail_loc=0x530 fail_val=36
1122 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
1123 do_facet mds2 $LCTL set_param fail_loc=0x8000015a
1124 echo make remote dir d0 for $dir0
1125 $LFS mkdir -i1 -c1 $dir0/d3 &
1128 echo make local dir d1 for $dir1
1129 do_node $CLIENT2 $LCTL set_param fail_loc=0x530 fail_val=36
1130 do_node $CLIENT2 mkdir $dir1/d4
1134 run_test 29 "replay vs update with the same xid"
1137 $LFS setstripe -E 1m -L mdt -E -1 $DIR/$tfile
1138 #first write to have no problems with grants
1139 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 ||
1140 error "dd on client failed"
1141 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 seek=10 ||
1142 error "dd on client failed"
1144 #define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e
1145 lctl set_param fail_loc=0x32e fail_val=4
1146 dd of=/dev/null if=$DIR2/$tfile &
1152 wait $pid || error "dd on client failed"
1154 run_test 30 "layout lock replay is not blocked on IO"
1157 mkdir_on_mdt0 $DIR1/$tdir
1158 $LFS setstripe -c 1 -i 0 $DIR1/$tdir
1159 for (( i=0; i < 10; i++ )) ; do
1160 mkdir -p $DIR1/$tdir/d.${i}
1162 mkdir $DIR1/$tdir/mdtdir
1163 $LFS setstripe -E 1M -L mdt $DIR1/$tdir/mdtdir
1165 # failover has to take longer than blocking timeout involved
1166 # by second multiop below which is set to obd_timeout/2 by
1168 local timeout=$(do_facet mds1 $LCTL get_param -n timeout)
1170 timeout=$((timeout / 2 + 5))
1171 fail ost1 $timeout &
1176 # consume preallocated objects, precreate thread will be awakened
1177 consume_precreations $DIR1/$tdir mds1 0 1
1179 # disable AT so that blocking timeout gets set to obd_timeout/2
1180 local amm=$(at_max_get mds1)
1183 stack_trap "at_max_set $amm mds1"
1187 #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420
1188 $LCTL set_param fail_loc=0x80001420
1189 $MULTIOP $DIR1/$tdir/mdtdir/$tfile Osw4096c &
1192 for (( i=0; i<10; i++ )); do
1193 if [ -w $DIR2/$tdir/mdtdir/$tfile ]; then
1194 echo "file $DIR2/$tdir/mdtdir/$tfile is ready"
1197 echo "file $DIR2/$tdir/mdtdir/$tfile is not ready, wait 0.5 second..."
1202 $MULTIOP $DIR2/$tdir/mdtdir/$tfile oO_WRONLY:w4096c &
1206 local mmrif=$($LCTL get_param -n \
1207 mdc.$FSNAME-MDT0000-mdc-*.max_mod_rpcs_in_flight | tail -1)
1208 # these are blocked by precreation until ost failover is in progress
1209 for (( i=0; i < $mmrif; i++ )) ; do
1210 $MULTIOP $DIR1/$tdir/d.${i}/parallel Oc &
1216 echo "pids: ${multiops[@]}"
1217 for pid in "${multiops[@]}"; do
1221 if (( $rc != 0 )); then
1222 echo "wait $pid failed, rc = $rc"
1226 ((failed == 0)) || error "$failed multiops failed"
1228 run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots"
1231 (( $MDSCOUNT < 2 )) && skip_env "needs >= 2 MDTs"
1233 # inject a gap with 10th transaction
1234 #define OBD_FAIL_LLOG_ADD_GAP 0x131d
1235 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10
1236 for ((i=0; i < 20; i++)); do
1237 $LFS setdirstripe -i1 $DIR/$tdir-$i ||
1238 error "can't mkdir $DIR/$tdir-$i"
1241 # prevent update llog cancellation, so next boot MDS has
1242 # process the update llog with gap injected
1243 #define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726
1244 $LCTL set_param fail_loc=0x726
1249 $LCTL set_param fail_loc=0
1256 local testid=$(echo $TESTNAME | tr '_' ' ')
1257 dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" &&
1258 error "client got evicted due to aborted recovery"
1261 run_test 32 "gap in update llog shouldn't break recovery"
# Check whether the given feature_incompat bit(s) are set in the last_rcvd
# file of a facet's backing device, as reported by lr_reader.
#   $1 - facet name (read via $facet; NOTE(review): the "local facet=$1"
#        assignment is not visible in this chunk — confirm it exists)
#   $2 - bitmask of feature_incompat flag(s) to check
# Returns 0 when every requested bit is present, non-zero otherwise.
last_rcvd_check_incompat_flag() {
	local flag2check="$2"
	local dev=$(facet_device $facet)
	# extract the feature_incompat value printed by lr_reader
	incompat=$(do_facet $facet $LR_READER $dev |
		awk '/feature_incompat:/ {print $2}')
	echo "last_rcvd in $dev: incompat = $incompat"
	# expression is 0 (shell success) only when all requested bits are set
	return $(( (incompat & flag2check) != flag2check ))
1277 test_33() { # LU-15935
1278 (( $MDS1_VERSION >= $(version_code 2.15.52.86) )) ||
1279 (( $MDS1_VERSION >= $(version_code 2.15.2) &&
1280 $MDS1_VERSION < $(version_code 2.15.50) )) ||
1281 skip "Need MDS version at least 2.15.52.86 or 2.15.2"
1283 [[ "$mds1_FSTYPE" == "ldiskfs" ]] || skip "ldiskfs only test"
1284 (( MDSCOUNT > 1 )) || skip "needs >= 2 MDTs"
1287 at_min_old=$(at_min_get ost1)
1289 stack_trap "at_min_set $at_min_old ost"
1292 cancel_lru_locks mdc
1297 # check for OBD_INCOMPAT_MULTI_RPCS (0x400) in last_rcvd
1298 last_rcvd_check_incompat_flag mds2 0x400 ||
1299 error "1st failover: OBD_INCOMPAT_MULTI_RPCS is not set on MDT0000 last_rcvd"
1301 # lose 1 client while the MDT failover
1305 wait_clients_import_state "$HOSTNAME" mds2 "REPLAY_WAIT"
1307 do_facet mds2 $LCTL --device $(convert_facet2label mds2) abort_recovery
1308 wait_clients_import_state "$HOSTNAME" mds2 "FULL"
1309 wait_recovery_complete mds2
1313 last_rcvd_check_incompat_flag mds2 0x400 ||
1314 error "2sd failover: OBD_INCOMPAT_MULTI_RPCS is not set on MDT0000 last_rcvd"
1318 zconf_mount $HOSTNAME $MOUNT2 || error "Fail to mount $MOUNT2"
1319 wait_clients_import_state "$HOSTNAME" mds2 "FULL"
1320 wait_recovery_complete mds2
1322 run_test 33 "Check for OBD_INCOMPAT_MULTI_RPCS in last_rcvd after abort_recovery"
1324 complete_test $SECONDS
1325 SLEEP=$((SECONDS - $NOW))
1326 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
1327 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
1328 check_and_cleanup_lustre