# Suite-wide defaults; each may be overridden from the environment.
5 PTLDEBUG=${PTLDEBUG:--1}
6 MOUNT_2=${MOUNT_2:-"yes"}
# LUSTRE must receive its default before anything is derived from it; otherwise
# LR_READER would expand to "/utils/lr_reader" when LUSTRE is not exported.
7 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
9 LR_READER=${LR_READER:-"$LUSTRE/utils/lr_reader"}
# Pull in the shared test framework from the tree selected above.
10 . $LUSTRE/tests/test-framework.sh
# Skip the whole suite (exit 0) when remote_mds_nodsh reports true.
14 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
# Build the list of always-skipped tests, seeded from REPLAY_DUAL_EXCEPT.
16 ALWAYS_EXCEPT="$REPLAY_DUAL_EXCEPT "
17 # bug number for skipped test: LU-2012 LU-8333
18 ALWAYS_EXCEPT+=" 14b 21b"
19 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# When mds1 is ZFS-backed, 21b is appended once more (it is already listed above).
21 [[ "$mds1_FSTYPE" == zfs ]] &&
22 # bug number for skipped test: LU-2230
23 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 21b"
# With SLOW=no, 21b is also recorded in EXCEPT_SLOW.
26 [ "$SLOW" = "no" ] && EXCEPT_SLOW="21b "
# Bring the filesystem up, then mount MOUNT2 on this host if it is not already
# among the mounted Lustre filesystems.
29 check_and_setup_lustre
31 MOUNTED=$(mounted_lustre_filesystems)
# NOTE(review): the $(...) wrapper is unnecessary; the condition relies only on
# grep's exit status (grep -q prints nothing).
32 if ! $(echo $MOUNTED' ' | grep -w -q $MOUNT2' '); then
33 zconf_mount $HOSTNAME $MOUNT2
# Remove entries under $DIR whose names start with "d" or "f" followed by a digit.
38 rm -rf $DIR/[df][0-9]*
# Start the lctl debug daemon only when DAEMONFILE is set.
40 [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
# Default the two client node names: CLIENT1 falls back to this host and
# CLIENT2 falls back to CLIENT1, so both may name the same node.
42 # if there is no CLIENT1 defined, some tests can be ran on localhost
43 CLIENT1=${CLIENT1:-$HOSTNAME}
44 # if CLIENT2 doesn't exist then use CLIENT1 instead
45 # All tests should use CLIENT2 with MOUNT2 only therefore it will work if
47 # Exception is the test which need two separate nodes
48 CLIENT2=${CLIENT2:-$CLIENT1}
# On kernels older than 2.6.33, run three syncs with 10-second pauses on the
# single MDS facet (see the LU-482 note below).
50 # LU-482 Avert LVM and VM inability to flush caches in pre .33 kernels
51 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
53 do_facet $SINGLEMDS "sync; sleep 10; sync; sleep 10; sync"
# test 0a fragment ("expired recovery with lost client").
# LU482_FAILED is a not-yet-created temp path (mktemp -u); if the file exists
# after the failover the test is skipped as an LU-482 failure.
58 LU482_FAILED=$(mktemp -u $TMP/$TESTSUITE.lu482.XXXXXX)
60 echo "Check file is LU482_FAILED=$LU482_FAILED"
61 touch $MOUNT2/$tfile-A # force sync FLD/SEQ update before barrier
62 replay_barrier $SINGLEMDS
63 #define OBD_FAIL_PTLRPC_FINISH_REPLAY | OBD_FAIL_ONCE
# Create 50 files through MOUNT1, arm fail_loc 0x80000514 and fail the MDS over.
65 createmany -o $MOUNT1/$tfile- 50
66 $LCTL set_param fail_loc=0x80000514
67 facet_failover $SINGLEMDS
68 [ -f "$LU482_FAILED" ] && skip "LU-482 failure" && return 0
# Give the client up to three attempts, 10 seconds apart, to reconnect.
69 client_up || (sleep 10; client_up) || (sleep 10; client_up) ||
70 error "reconnect failed"
72 client_up || (sleep 10; client_up) || (sleep 10; client_up) ||
73 error "reconnect failed"
74 zconf_mount `hostname` $MOUNT2 || error "mount2 failed"
# Fixed: the failure handler was misspelled "errot", so a failing unlinkmany
# produced "command not found" instead of reporting the test error.
75 unlinkmany $MOUNT1/$tfile- 50 || error "unlinkmany failed"
76 rm $MOUNT2/$tfile || error "rm $MOUNT2/$tfile failed"
77 rm $MOUNT2/$tfile-A || error "rm $MOUNT2/$tfile-A failed"
79 run_test 0a "expired recovery with lost client"
# If the LU-482 check file exists, log it, remove it, force-unmount both client
# mount points on all clients (failures ignored) and clear the per-OST
# "active" marker files under $TMP.
81 if [ -f "$LU482_FAILED" ]; then
82 log "Found check file $LU482_FAILED, aborting test script"
83 rm -vf "$LU482_FAILED"
85 do_nodes $CLIENTS umount -f $MOUNT2 || true
86 do_nodes $CLIENTS umount -f $MOUNT || true
87 # copied from stopall, but avoid the MDS recovery
88 for num in `seq $OSTCOUNT`; do
90 rm -f $TMP/ost${num}active
92 if ! combined_mgs_mds ; then
# test 0b fragment ("lost client during waiting for next transno").
# After the barrier a file is touched through MOUNT1, the MDS is failed over,
# and both mount points are mounted again.
100 replay_barrier $SINGLEMDS
102 touch $MOUNT1/$tfile-2
104 facet_failover $SINGLEMDS
# Fixed: the messages read "fais"; worded like the sibling "mount2 failed".
106 zconf_mount `hostname` $MOUNT1 || error "mount1 failed"
107 zconf_mount `hostname` $MOUNT2 || error "mount2 failed"
108 # it is uncertain if file-2 exists or not, remove it if it does
109 checkstat $MOUNT1/$tfile-2 && rm $MOUNT1/$tfile-2
# $tfile must not be visible through MOUNT2; if it is, fail with code 2.
110 checkstat $MOUNT2/$tfile && return 2
113 run_test 0b "lost client during waiting for next transno"
# test 1 fragment: after the replay barrier (intermediate steps not shown),
# "a" must be visible via MOUNT2 and "b" via MOUNT1; both are removed and must
# then be gone when checked through the opposite mount. Each failing check
# returns its own code (1-4).
117 replay_barrier $SINGLEMDS
121 checkstat $MOUNT2/a || return 1
122 checkstat $MOUNT1/b || return 2
123 rm $MOUNT2/a $MOUNT1/b
124 checkstat $MOUNT1/a && return 3
125 checkstat $MOUNT2/b && return 4
129 run_test 1 "|X| simple create"
# test 2 fragment: "adir" must exist via MOUNT2 at the first check (else
# return 1) and must be gone at the second (else return 2); the steps in
# between are not shown.
133 replay_barrier $SINGLEMDS
137 checkstat $MOUNT2/adir || return 1
139 checkstat $MOUNT2/adir && return 2
142 run_test 2 "|X| mkdir adir"
# test 3 fragment: creates adir/bdir through MOUNT2 after the barrier, checks
# both directories through the two mounts, removes them and verifies they are
# gone. Return codes 1-4 identify the failing check.
145 replay_barrier $SINGLEMDS
147 mkdir $MOUNT2/adir/bdir
150 checkstat $MOUNT2/adir || return 1
151 checkstat $MOUNT1/adir/bdir || return 2
152 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
153 checkstat $MOUNT1/adir && return 3
154 checkstat $MOUNT2/adir/bdir && return 4
157 run_test 3 "|X| mkdir adir, mkdir adir/bdir "
# test 4 fragment: mkdir of adir via MOUNT1 is expected to fail (the test title
# says -EEXIST); success returns 1. bdir is then created via MOUNT2, both
# directories are verified, removed, and verified gone (codes 2-5).
161 replay_barrier $SINGLEMDS
162 mkdir $MOUNT1/adir && return 1
163 mkdir $MOUNT2/adir/bdir
166 checkstat $MOUNT2/adir || return 2
167 checkstat $MOUNT1/adir/bdir || return 3
169 rmdir $MOUNT2/adir/bdir $MOUNT1/adir
170 checkstat $MOUNT1/adir && return 4
171 checkstat $MOUNT2/adir/bdir && return 5
174 run_test 4 "|X| mkdir adir (-EEXIST), mkdir adir/bdir "
# test 5 fragment: a paused background multiop holds MOUNT2/a; after the
# barrier the background process is waited for, and the file must no longer
# exist (return 2 if it does).
178 # multiclient version of replay_single.sh/test_8
180 multiop_bg_pause $MOUNT2/a o_tSc || return 1
183 replay_barrier $SINGLEMDS
185 wait $pid || return 1
188 [ -e $MOUNT2/a ] && return 2
191 run_test 5 "open, unlink |X| close"
# test 6 fragment: two paused background multiops hold "a", one per mount
# point. After the barrier each is waited for in turn, and the file must be
# gone at the end (return 2 otherwise).
196 multiop_bg_pause $MOUNT2/a o_c || return 1
198 multiop_bg_pause $MOUNT1/a o_c || return 1
201 replay_barrier $SINGLEMDS
203 wait $pid1 || return 1
207 wait $pid2 || return 1
208 [ -e $MOUNT2/a ] && return 2
211 run_test 6 "open1, open2, unlink |X| close1 [fail $SINGLEMDS] close2"
# test 8 fragment: creates $tfile via MOUNT1 through drop_reint_reply, then
# requires it to be visible via MOUNT2 and removable via MOUNT1
# (return codes 1-3).
214 replay_barrier $SINGLEMDS
215 drop_reint_reply "mcreate $MOUNT1/$tfile" || return 1
217 checkstat $MOUNT2/$tfile || return 2
218 rm $MOUNT1/$tfile || return 3
222 run_test 8 "replay of resent request"
# test 9 fragment ("resending a replayed create"): one file is created through
# each mount after the barrier, fail_loc 0x80000119 is armed on the MDS and
# later cleared, and both files are finally removed.
225 replay_barrier $SINGLEMDS
226 mcreate $MOUNT1/$tfile-1
227 mcreate $MOUNT2/$tfile-2
228 # drop first reint reply
229 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
231 do_facet $SINGLEMDS lctl set_param fail_loc=0
# Fixed: the bracket expression was "[1,2]", which also matches a literal
# comma; "[12]" matches exactly the two files created above.
233 rm $MOUNT1/$tfile-[12] || return 1
237 run_test 9 "resending a replayed create"
# test 10 fragment: $tfile-1 is created before the barrier and unlinked after
# it, while $tfile-2 is created via MOUNT2. With fail_loc 0x80000119 armed and
# then cleared on the MDS, $tfile-1 must be gone (else 1) and $tfile-2 present
# (else 2).
240 mcreate $MOUNT1/$tfile-1
241 replay_barrier $SINGLEMDS
242 munlink $MOUNT1/$tfile-1
243 mcreate $MOUNT2/$tfile-2
244 # drop first reint reply
245 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000119
247 do_facet $SINGLEMDS lctl set_param fail_loc=0
249 checkstat $MOUNT1/$tfile-1 && return 1
250 checkstat $MOUNT1/$tfile-2 || return 2
255 run_test 10 "resending a replayed unlink"
# test 11 fragment: five files are created alternately through DIR1 and DIR2
# after the barrier. fail_loc 0x0119 is set on the MDS, the MDS is failed
# over, and the test waits for the clients' import to reach FULL before
# clearing fail_loc and removing the files.
258 replay_barrier $SINGLEMDS
259 mcreate $DIR1/$tfile-1
260 mcreate $DIR2/$tfile-2
261 mcreate $DIR1/$tfile-3
262 mcreate $DIR2/$tfile-4
263 mcreate $DIR1/$tfile-5
264 # drop all reint replies for a while
265 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0119
266 # note that with this fail_loc set, facet_failover df will fail
267 facet_failover $SINGLEMDS
# Fall back to this host when no CLIENTS list is configured.
269 local clients=${CLIENTS:-$HOSTNAME}
270 wait_clients_import_state "$clients" $SINGLEMDS FULL
272 do_facet $SINGLEMDS $LCTL set_param fail_loc=0
274 rm $DIR1/$tfile-[1-5] || return 1
278 run_test 11 "both clients timeout during replay"
# test 12 fragment: a paused background multiop holds $DIR/$tfile across an MDS
# failover performed with fail_loc 0x80000302 armed. Once clients are back up,
# the multiop is released with SIGUSR1, waited for, and the file must be a
# regular file.
281 replay_barrier $SINGLEMDS
283 multiop_bg_pause $DIR/$tfile mo_c || return 1
286 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
287 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000302
288 facet_failover $SINGLEMDS
289 do_facet $SINGLEMDS lctl set_param fail_loc=0
290 clients_up || return 1
293 kill -USR1 $MULTIPID || return 3
294 wait $MULTIPID || return 4
295 $CHECKSTAT -t file $DIR/$tfile || return 2
300 run_test 12 "open resend timeout"
# test 13 fragment: the paused multiop is released and reaped after the
# barrier (so its close falls after the barrier), then the MDS is failed over
# with fail_loc 0x80000115 armed; afterwards clients must be up and the file
# must be a regular file.
303 multiop_bg_pause $DIR/$tfile mo_c || return 1
306 replay_barrier $SINGLEMDS
308 kill -USR1 $MULTIPID || return 3
309 wait $MULTIPID || return 4
312 do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115
313 facet_failover $SINGLEMDS
314 do_facet $SINGLEMDS lctl set_param fail_loc=0
315 clients_up || return 1
318 $CHECKSTAT -t file $DIR/$tfile || return 2
323 run_test 13 "close resend timeout"
# test 14b fragment: records used space (3rd column of df -P) before and after
# a sequence of creates through both mounts, an MDS recovery and cleanup, and
# fails if usage grew by more than the fs_log_size margin.
325 # test 14a removed after 18143 because it shouldn't fail anymore and do the same
330 wait_delete_completed
332 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
334 mkdir -p $MOUNT1/$tdir
335 $LFS setstripe -i 0 $MOUNT1/$tdir
336 replay_barrier $SINGLEMDS
337 createmany -o $MOUNT1/$tdir/$tfile- 5
# A 5 MiB file written through MOUNT2, sandwiched between the MOUNT1 creates.
339 $LFS setstripe -i 0 $MOUNT2/$tfile-2
340 dd if=/dev/zero of=$MOUNT2/$tfile-2 bs=1M count=5
341 createmany -o $MOUNT1/$tdir/$tfile-3- 5
345 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
347 # first set of files should have been replayed
348 unlinkmany $MOUNT1/$tdir/$tfile- 5 || error "first unlinks failed"
349 unlinkmany $MOUNT1/$tdir/$tfile-3- 5 || error "second unlinks failed"
351 zconf_mount $HOSTNAME $MOUNT2 || error "mount $MOUNT2 failed"
# The MOUNT2 file is expected to be absent after recovery.
352 [ -f $MOUNT2/$tfile-2 ] && error "$MOUNT2/$tfile-2 exists!"
354 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
355 wait_delete_completed || error "wait_delete_complete failed"
357 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
358 log "before $beforeused, after $afterused"
359 # leave some margin for some files/dirs to be modified (OI, llog, etc)
360 [ $afterused -le $((beforeused + $(fs_log_size))) ] ||
361 error "after $afterused > before $beforeused"
363 run_test 14b "delete ost orphans if gap occured in objids due to VBR"
# test 15a fragment: 25 files are created via MOUNT1 and one via MOUNT2 after
# the barrier. Later the MOUNT1 files must be unlinkable (else return 2), the
# MOUNT2-created file must not exist, and MOUNT2 is mounted again.
365 test_15a() { # was test_15
366 replay_barrier $SINGLEMDS
367 createmany -o $MOUNT1/$tfile- 25
368 createmany -o $MOUNT2/$tfile-2- 1
373 unlinkmany $MOUNT1/$tfile- 25 || return 2
374 [ -e $MOUNT1/$tfile-2-0 ] && error "$tfile-2-0 exists"
376 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
379 run_test 15a "timeout waiting for lost client during replay, 1 client completes"
# test 15c fragment: after the barrier, 2000 small files are written through
# MOUNT2 (any failed write aborts the test); MOUNT2 is mounted again later.
382 replay_barrier $SINGLEMDS
383 for ((i = 0; i < 2000; i++)); do
384 echo "data" > "$MOUNT2/${tfile}-$i" ||
385 error "create ${tfile}-$i failed"
391 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
394 run_test 15c "remove multiple OST orphans"
# test 16 fragment: creates files through both mounts after the barrier, fails
# the MDS over, then requires the MOUNT1 files to be unlinkable (else return 2)
# and mounts MOUNT2 again.
397 replay_barrier $SINGLEMDS
398 createmany -o $MOUNT1/$tfile- 25
399 createmany -o $MOUNT2/$tfile-2- 1
402 facet_failover $SINGLEMDS
406 unlinkmany $MOUNT1/$tfile- 25 || return 2
408 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
411 run_test 16 "fail MDS during recovery (3571)"
# test 17 fragment: skipped when remote_ost_nodsh reports true. Creates files
# through both mounts, and later requires the MOUNT1 files to be unlinkable
# (else return 2) before mounting MOUNT2 again.
414 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
416 createmany -o $MOUNT1/$tfile- 25
417 createmany -o $MOUNT2/$tfile-2- 1
419 # Make sure the disconnect is lost
427 unlinkmany $MOUNT1/$tfile- 25 || return 2
429 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
433 run_test 17 "fail OST during recovery (3571)"
# test 18 fragment: with dlmtrace debugging enabled on the MDS, a background
# statmany runs against MOUNT1 while fail_loc is armed on both the MDS
# (0x8000030b) and the client (0x80000305, with early lock cancel disabled).
# A second open through MOUNT2 follows; the MDS debug log must not contain
# "not entering recovery". fail_loc is cleared on both sides at the end.
435 # cleanup with blocked enqueue fails until timer elapses (MDS busy), wait for it
438 test_18() { # bug 3822 - evicting client with enqueued lock
# Captures the current MDS debug setting (its later use is not shown here).
440 local DLMTRACE=$(do_facet $SINGLEMDS lctl get_param debug)
441 do_facet $SINGLEMDS lctl set_param debug=+dlmtrace
442 mkdir -p $MOUNT1/$tdir || error "mkdir $MOUNT1/$tdir failed"
443 touch $MOUNT1/$tdir/${tfile}0 || error "touch file failed"
444 statmany -s $MOUNT1/$tdir/$tfile 1 500 &
447 #define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b
448 do_facet $SINGLEMDS lctl set_param fail_loc=0x8000030b # hold enqueue
450 #define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305
451 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
452 do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict
454 sleep 0.1 # wait to ensure first client is one that will be evicted
455 openfile -f O_RDONLY $MOUNT2/$tdir/$tfile
457 do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
458 do_facet $SINGLEMDS lctl debug_kernel |
459 grep "not entering recovery" && error "client not evicted"
460 do_facet client "lctl set_param fail_loc=0"
461 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
463 run_test 18 "ldlm_handle_enqueue succeeds on evicted export (3822)"
# test 19 fragment: creates one file via drop_mdt_ldlm_reply after the barrier,
# then requires it to be visible through DIR2 and removable through DIR
# (return codes 1-3).
465 test_19() { # Bug 10991 - resend of open request does not fail assertion.
466 replay_barrier $SINGLEMDS
467 drop_mdt_ldlm_reply "createmany -o $DIR/$tfile 1" || return 1
469 checkstat $DIR2/${tfile}0 || return 2
470 rm $DIR/${tfile}0 || return 3
474 run_test 19 "resend of open request"
# test 20 fragment: times two barrier/remount rounds using $SECONDS and fails
# when the second measurement is not below 6/4 (1.5x) of the first.
# NOTE(review): in the visible lines both tiers are measured from the same
# "before"; a reassignment between the rounds may be among the omitted lines.
477 local before=$SECONDS
478 replay_barrier $SINGLEMDS
484 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
485 local tier1=$((SECONDS - before))
488 replay_barrier $SINGLEMDS
494 zconf_mount $HOSTNAME $DIR2 || error "mount $DIR2 fail"
495 local tier2=$((SECONDS - before))
497 # timeout is more than 1.5x original timeout
498 ((tier2 < tier1 * 6 / 4)) ||
499 error "recovery time $tier2 >= 1.5x original time $tier1"
501 run_test 20 "recovery time is not increasing"
# test 21a fragment: saves the MDS commit_on_sharing setting to a file, enables
# it, and performs a create plus two renames alternating between the mounts.
# After a no-sync barrier and failover, the final name must be unlinkable
# (else return 2). The setting is then set to 0 and restored from the file.
503 # commit on sharing tests
505 local param_file=$TMP/$tfile-params
507 save_lustre_params $SINGLEMDS "mdt.*.commit_on_sharing" > $param_file
508 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1
509 touch $MOUNT1/$tfile-1
510 mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2
511 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
512 replay_barrier_nosync $SINGLEMDS
515 facet_failover $SINGLEMDS
517 # all renames are replayed
518 unlink $MOUNT1/$tfile-3 || return 2
520 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail"
522 do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0
523 rm -rf $MOUNT1/$tfile-*
524 restore_lustre_params < $param_file
528 run_test 21a "commit on sharing"
# test_21b helper fragment (presumably test_21b_sub; its header is not shown):
# CLIENT1 creates a file, CLIENT2 and CLIENT1 rename it in turn, then after a
# no-sync barrier CLIENT2 is shut down. rc is set to 1 when the final name
# cannot be unlinked from CLIENT1; CLIENT2 is mounted again afterwards.
532 do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
535 do_node $CLIENT1 touch $MOUNT1/$tfile-1
536 do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2
537 do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3
539 replay_barrier_nosync $mds
540 shutdown_client $CLIENT2 $MOUNT1
544 # were renames replayed?
546 echo UNLINK $MOUNT1/$tfile-3
547 do_node $CLIENT1 unlink $MOUNT1/$tfile-3 ||
548 { echo "unlink $tfile-3 fail!" && rc=1; }
551 zconf_mount_clients $CLIENT2 $MOUNT1 ||
552 error "mount $CLIENT2 $MOUNT1 fail"
# test 21b fragment: requires at least two clients and is skipped for HARD
# failure mode with mixed MDT devices. It remounts clients on MOUNT1 only,
# picks the MDS facet that holds MOUNT1, and runs test_21b_sub with
# commit_on_sharing set to $COS: once where it must succeed, then repeatedly
# where it is expected to fail (giving up after more than 3 passing attempts).
558 [ $CLIENTCOUNT -lt 2 ] &&
559 { skip "Need 2+ clients, have $CLIENTCOUNT" && return; }
561 if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then
562 skip "Several MDTs on one MDS with FAILURE_MODE=$FAILURE_MODE"
566 zconf_umount_clients $CLIENTS $MOUNT2
567 zconf_mount_clients $CLIENTS $MOUNT1
569 local param_file=$TMP/$tfile-params
# MDT index of MOUNT1 is zero-based; facet names are one-based (mds1, mds2, ...).
571 local mdtidx=$($LFS getstripe -m $MOUNT1)
572 local facet=mds$((mdtidx + 1))
574 save_lustre_params $facet "mdt.*.commit_on_sharing" > $param_file
578 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
580 test_21b_sub $facet || error "Not all renames are replayed. COS=$COS"
582 # there is still a window when transactions may be written to disk
583 # before the mds device is set R/O. To avoid such a rare test failure,
584 # the check is repeated several times.
588 # COS disabled (should fail)
589 do_facet $facet lctl set_param mdt.*.commit_on_sharing=$COS
591 test_21b_sub $facet || break
592 n_attempts=$((n_attempts + 1))
593 [ $n_attempts -gt 3 ] &&
594 error "can't check if COS works: rename replied w/o COS"
596 zconf_mount_clients $CLIENTS $MOUNT2
597 restore_lustre_params < $param_file
601 run_test 21b "commit on sharing, two clients"
# Helper fragment (presumably checkstat_22; its header is not shown): verifies
# the remote directory, its "dir" subdirectory and one created file in each,
# returning 1-4 for the first missing entry. Reads $remote_dir from the caller.
604 checkstat $MOUNT1/$remote_dir || return 1
605 checkstat $MOUNT1/$remote_dir/dir || return 2
606 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 3
607 checkstat $MOUNT1/$remote_dir/dir/$tfile-1 || return 4
# Creates "dir" under $remote_dir from CLIENT2 (return 1 on failure), then two
# files in "dir" from CLIENT1 and two files in $remote_dir from CLIENT2.
# The failure handling of the two createmany calls is on lines not shown here.
611 create_remote_dir_files_22() {
612 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir/dir || return 1
613 do_node $CLIENT1 createmany -o $MOUNT1/$remote_dir/dir/$tfile- 2 ||
615 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 ||
# test 22a fragment: needs at least 2 MDTs, and in HARD failure mode mds1 and
# mds2 on different hosts. A remote directory is created in the background
# with fail_loc 0x119 set on facet mds(MDTIDX+1), which is then failed. After a
# barrier on mds$MDTIDX, files are created and verified with the *_22 helpers.
621 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
622 ([ $FAILURE_MODE == "HARD" ] &&
623 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
624 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
628 local remote_dir=$tdir/remote_dir
630 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
631 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
633 # OBD_FAIL_MDS_REINT_NET_REP 0x119
634 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
635 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
639 fail mds$((MDTIDX + 1))
640 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
642 replay_barrier mds$MDTIDX
643 create_remote_dir_files_22 || error "Remote creation failed $?"
646 checkstat_22 || error "check stat failed $?"
648 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
651 run_test 22a "c1 lfs mkdir -i 1 dir1, M1 drop reply & fail, c2 mkdir dir1/dir"
# test 22b fragment: same flow as the single-facet variant, except that both
# mds$MDTIDX and mds(MDTIDX+1) are failed together before the background
# remote mkdir is reaped.
654 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
656 local remote_dir=$tdir/remote_dir
658 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
659 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
661 # OBD_FAIL_MDS_REINT_NET_REP 0x119
662 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
663 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
667 fail mds${MDTIDX},mds$((MDTIDX + 1))
668 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
670 replay_barrier mds$MDTIDX
671 create_remote_dir_files_22 || error "Remote creation failed $?"
674 checkstat_22 || error "check stat failed $?"
676 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
679 run_test 22b "c1 lfs mkdir -i 1 d1, M1 drop reply & fail M0/M1, c2 mkdir d1/dir"
# test 22c fragment: needs at least 2 MDTs, and in HARD failure mode mds1 and
# mds2 on different hosts. The background remote mkdir runs with fail_loc
# 0x1701 set on mds$MDTIDX, which is cleared before the mkdir is reaped; files
# are then created and verified after a barrier on mds$MDTIDX.
682 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
683 ([ $FAILURE_MODE == "HARD" ] &&
684 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
685 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
688 local remote_dir=$tdir/remote_dir
690 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
691 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
693 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
694 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
695 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
698 do_facet mds$MDTIDX lctl set_param fail_loc=0
701 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
703 replay_barrier mds$MDTIDX
704 create_remote_dir_files_22 || error "Remote creation failed $?"
707 checkstat_22 || error "check stat failed $?"
709 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
712 run_test 22c "c1 lfs mkdir -i 1 d1, M1 drop update & fail M1, c2 mkdir d1/dir"
# test 22d fragment: background remote mkdir with fail_loc 0x1701 on
# mds$MDTIDX; after clearing fail_loc, both mds$MDTIDX and mds(MDTIDX+1) are
# failed together, then files are created and verified after a barrier.
715 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
717 local remote_dir=$tdir/remote_dir
719 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
720 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
722 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
723 do_facet mds$MDTIDX lctl set_param fail_loc=0x1701
724 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir &
727 do_facet mds$MDTIDX lctl set_param fail_loc=0
729 fail mds${MDTIDX},mds$((MDTIDX + 1))
730 wait $CLIENT_PID || error "lfs mkdir -i $MDTIDX failed"
732 replay_barrier mds$MDTIDX
733 create_remote_dir_files_22 || error "Remote creation failed $?"
736 checkstat_22 || error "check stat failed $?"
738 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
741 run_test 22d "c1 lfs mkdir -i 1 d1, M1 drop update & fail M0/M1,c2 mkdir d1/dir"
# Helper fragment (presumably checkstat_23; its header is not shown): verifies
# the remote directory (return 1) and one file created inside it (return 2).
744 checkstat $MOUNT1/$remote_dir || return 1
745 checkstat $MOUNT1/$remote_dir/$tfile-1 || return 2
# From CLIENT2, creates $remote_dir under MOUNT2 (return 1 on failure) and two
# files inside it (return 2 on failure). Reads $remote_dir from the caller.
749 create_remote_dir_files_23() {
750 do_node $CLIENT2 mkdir ${MOUNT2}/$remote_dir || return 1
751 do_node $CLIENT2 createmany -o $MOUNT2/$remote_dir/$tfile- 2 || return 2
# test 23a fragment: needs at least 2 MDTs, and in HARD failure mode mds1 and
# mds2 on different hosts. A remote directory is created and then removed in
# the background with fail_loc 0x119 set on mds(MDTIDX+1); that facet is failed
# after fail_loc is cleared. The directory is recreated with files from
# CLIENT2 and verified after a barrier on mds$MDTIDX.
756 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
757 ([ $FAILURE_MODE == "HARD" ] &&
758 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
759 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
762 local remote_dir=$tdir/remote_dir
764 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
765 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
766 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
767 error "lfs mkdir -i $MDTIDX failed"
768 # OBD_FAIL_MDS_REINT_NET_REP 0x119
769 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
770 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
773 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
775 fail mds$((MDTIDX + 1))
776 wait $CLIENT_PID || error "rmdir remote dir failed"
778 replay_barrier mds${MDTIDX}
779 create_remote_dir_files_23 || error "Remote creation failed $?"
782 checkstat_23 || error "check stat failed $?"
784 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
787 run_test 23a "c1 rmdir d1, M1 drop reply and fail, client2 mkdir d1"
# test 23b fragment ("c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"):
# a remote directory is created and then removed in the background with
# fail_loc 0x119 set on mds(MDTIDX+1); after clearing fail_loc both MDS facets
# are failed together. The directory is recreated with files from CLIENT2 and
# verified after a barrier on mds$MDTIDX.
790 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
792 local remote_dir=$tdir/remote_dir
# Fixed: the message named $MOUNT while the command operates on $MOUNT1.
794 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
795 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
796 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
797 error "lfs mkdir -i $MDTIDX failed"
799 # OBD_FAIL_MDS_REINT_NET_REP 0x119
800 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
801 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
804 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
806 fail mds${MDTIDX},mds$((MDTIDX + 1))
807 wait $CLIENT_PID || error "rmdir remote dir failed"
809 replay_barrier mds${MDTIDX}
810 create_remote_dir_files_23 || error "Remote creation failed $?"
813 checkstat_23 || error "check stat failed $?"
815 rm -rf $MOUNT1/$tdir || error "rmdir remote_dir failed"
818 run_test 23b "c1 rmdir d1, M1 drop reply and fail M0/M1, c2 mkdir d1"
# test 23c fragment: needs at least 2 MDTs, and in HARD failure mode mds1 and
# mds2 on different hosts. The background rmdir of the remote directory runs
# with fail_loc 0x1701 set on mds$MDTIDX, cleared before the rmdir is reaped.
# The directory is then recreated and verified; the final cleanup failure
# returns 6.
821 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
823 ([ $FAILURE_MODE == "HARD" ] &&
824 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
825 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
828 local remote_dir=$tdir/remote_dir
830 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
831 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
832 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
833 error "lfs mkdir -i $MDTIDX failed"
835 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
836 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
837 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
840 do_facet mds${MDTIDX} lctl set_param fail_loc=0
843 wait $CLIENT_PID || error "rmdir remote dir failed"
845 replay_barrier mds${MDTIDX}
846 create_remote_dir_files_23 || error "Remote creation failed $?"
849 checkstat_23 || error "check stat failed $?"
851 rm -rf $MOUNT1/$tdir || return 6
854 run_test 23c "c1 rmdir d1, M0 drop update reply and fail M0, c2 mkdir d1"
# test 23d fragment: after creating the remote directory, a forced sync is
# requested on all MDT nodes. The background rmdir runs with fail_loc 0x1701 on
# mds$MDTIDX; once fail_loc is cleared both MDS facets are failed together,
# then the directory is recreated and verified.
857 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
859 local remote_dir=$tdir/remote_dir
861 do_node $CLIENT1 $LFS mkdir -i 0 $MOUNT1/$tdir ||
862 error "lfs mkdir -i 0 $MOUNT1/$tdir failed"
863 do_node $CLIENT1 $LFS mkdir -i $MDTIDX $MOUNT1/$remote_dir ||
864 error "lfs mkdir -i $MDTIDX failed"
866 # let previous transactions to complete
867 # (distributed llog cancels, etc)
868 do_nodes $(comma_list $(mdts_nodes)) \
869 "$LCTL set_param -n osd*.*MDT*.force_sync=1"
# NOTE(review): other places in this file label fail_loc 0x1701 as
# OBD_FAIL_OUT_UPDATE_NET_REP; the name below differs - confirm which is current.
872 # OBD_FAIL_UPDATE_OBJ_NET 0x1701
873 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
874 do_node $CLIENT1 rmdir $MOUNT1/$remote_dir &
877 do_facet mds${MDTIDX} lctl set_param fail_loc=0
879 fail mds${MDTIDX},mds$((MDTIDX + 1))
880 wait $CLIENT_PID || error "rmdir remote dir failed"
882 replay_barrier mds${MDTIDX}
883 create_remote_dir_files_23 || error "Remote creation failed $?"
886 checkstat_23 || error "check stat failed $?"
888 rm -rf $MOUNT1/$tdir || return 6
891 run_test 23d "c1 rmdir d1, M0 drop update reply and fail M0/M1, c2 mkdir d1"
# test 24 fragment: requires an MDS newer than 2.5.2. With fail_loc 0x119 set
# on the MDS, a truncate of $MOUNT/$tfile to 100 bytes runs in the background;
# fail_loc is then cleared and a forced sync is requested on all MDT nodes.
894 [[ "$MDS1_VERSION" -gt $(version_code 2.5.2) ]] ||
895 skip "Need MDS version newer than 2.5.2"
# Output and errors of this stat are discarded; its exit status is not checked.
898 stat $MOUNT/$tfile >&/dev/null
899 # OBD_FAIL_MDS_REINT_NET_REP
900 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x119
901 $TRUNCATE $MOUNT/$tfile 100 &
904 do_facet $SINGLEMDS lctl set_param fail_loc=0
905 # sync to release rep-ack lock quickly
906 do_nodes $(comma_list $(mdts_nodes)) \
907 "lctl set_param -n osd*.*MDT*.force_sync 1"
911 run_test 24 "reconstruct on non-existing object"
# test 25 fragment: a single-stripe file on OST index 0 is written by the
# first client, then a second writer is started in the background through
# drop_ldlm_cancel. The fail_loc armed on ost1 depends on the OST version:
# 0x80000325 for 2.6.90 and later, otherwise 0x80000213 (presumably the else
# branch; the "else" line itself is not shown).
913 # end commit on sharing tests
918 $LFS setstripe -i 0 -c 1 $DIR/$tfile
920 # get lock for the 1st client
921 dd if=/dev/zero of=$DIR/$tfile count=1 >/dev/null ||
922 error "failed to write data"
924 # get waiting locks for the 2nd client
925 drop_ldlm_cancel "multiop $DIR2/$tfile Ow512" &
928 # failover, replay and resend replayed waiting locks
929 if [ "$OST1_VERSION" -ge $(version_code 2.6.90) ]; then
930 #define OBD_FAIL_LDLM_SRV_CP_AST 0x325
931 do_facet ost1 lctl set_param fail_loc=0x80000325
933 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213
934 do_facet ost1 lctl set_param fail_loc=0x80000213
939 # multiop does not finish because CP AST is skipped;
940 # it is ok to kill it in the test, because CP AST is already re-sent
941 # and it does not hung forever in real life
945 run_test 25 "replay|resend"
# Cleanup fragment for test 26 (presumably cleanup_26; its header is not
# shown): each recorded PID that is still alive is killed and reaped, with
# failures ignored; each recorded directory that still exists is then handled
# by lines not shown here.
953 for pid_26 in "${pids_26[@]}"; do
954 if [[ -n "$pid_26" ]]; then
955 kill -0 "$pid_26" && kill "$pid_26" && \
956 wait "$pid_26" || true
960 for dir_26 in "${dirs_26[@]}"; do
961 if [[ -n "$dir_26" && -d "$dir_26" ]]; then
# test 26 fragment: runs two background load loops - a tar copy of /etc and a
# dbench run - while repeatedly placing a replay barrier on successive MDS
# facets until "duration" seconds have elapsed. Both loops must still be alive
# at every iteration and at the end. A load exit status of 1..128 is reported
# as an error; higher values (presumably death by signal) are tolerated.
968 local clients=${CLIENTS:-$HOSTNAME}
970 zconf_mount_clients $clients $MOUNT
# Duration overrides: 200s when SLOW=no, 900s for HARD failure mode (applied
# last, so it wins when both conditions hold).
973 [[ "$SLOW" == "no" ]] && duration=200
974 # set duration to 900 because it takes some time to boot node
975 [[ "$FAILURE_MODE" == HARD ]] && duration=900
977 local start_ts=$SECONDS
980 stack_trap cleanup_26
982 local tar_dir=$DIR/$tdir/run_tar
986 stack_trap 'set +e; jobs -p | xargs -r kill; wait; exit' \
990 test_mkdir -p -c$MDSCOUNT $tar_dir || break
992 if (( MDSCOUNT >= 2 )); then
993 $LFS setdirstripe -D -c$MDSCOUNT $tar_dir ||
994 error "set default dirstripe failed"
998 tar -C / -cf - etc | tar -xf - &
1003 wait $tar_pid || tar_rc=$?
1005 if (( tar_rc > 0 && tar_rc <= 128 )); then
1006 error "tar failed with rc $tar_rc"
1009 cd $DIR/$tdir || break
1010 rm -rf $tar_dir || break
1016 echo "Started tar loop with pid $tar_26_pid"
1017 pids_26+=($tar_26_pid)
1019 local dbench_dir=$DIR2/$tdir/run_dbench
1021 dirs_26+=($dbench_dir)
1023 stack_trap 'set +e; jobs -p | xargs -r kill; wait; exit' \
1027 test_mkdir -p -c$MDSCOUNT $dbench_dir || break
1029 if (( MDSCOUNT >= 2 )); then
1030 $LFS setdirstripe -D -c$MDSCOUNT $dbench_dir ||
1031 error "set default dirstripe failed"
1034 cd $dbench_dir || break
1035 bash rundbench 1 -D $dbench_dir -t 100 &
1040 wait $dbench_pid || dbench_rc=$?
1042 if (( dbench_rc > 0 && dbench_rc <= 128 )); then
1043 error "dbench failed with rc $dbench_rc"
1046 cd $DIR/$tdir || break
1047 rm -rf $dbench_dir || break
1051 local dbench_26_pid=$!
1053 echo "Started dbench loop with $dbench_26_pid"
1054 pids_26+=($dbench_26_pid)
1056 local num_failovers=0
1059 while (( (SECONDS - start_ts) < duration )); do
1060 kill -0 $tar_26_pid || error "tar $tar_26_pid missing"
1061 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid missing"
1063 replay_barrier mds$fail_index
1064 sleep 2 # give clients a time to do operations
1065 # Increment the number of failovers
1066 num_failovers=$((num_failovers + 1))
1067 log "$TESTNAME fail mds$fail_index $num_failovers times"
# Advance to the next MDS facet while below MDSCOUNT (the wrap-around branch
# is not shown here).
1069 if (( fail_index < MDSCOUNT )); then
1070 fail_index=$((fail_index + 1))
1076 # stop the client loads
1077 kill -0 $tar_26_pid || error "tar $tar_26_pid stopped"
1078 kill -0 $dbench_26_pid || error "dbench $dbench_26_pid stopped"
1082 run_test 26 "dbench and tar with mds failover"
# test 28 fragment: writes one 4 KiB block through DIR2 to a single-stripe file
# on OST index 0, arms fail_loc 0x80000324 on ost1, and starts a second write
# through DIR in the background. fail_loc 0x32a is then set on ost1, the
# OST0000-osc LRU locks are cancelled, and the background dd must succeed.
1085 $LFS setstripe -i 0 -c 1 $DIR2/$tfile
1086 dd if=/dev/zero of=$DIR2/$tfile bs=4096 count=1
1088 #define OBD_FAIL_LDLM_SRV_BL_AST 0x324
1089 do_facet ost1 $LCTL set_param fail_loc=0x80000324
1091 dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 &
1095 #define OBD_FAIL_LDLM_GRANT_CHECK 0x32a
1096 do_facet ost1 $LCTL set_param fail_loc=0x32a
1101 cancel_lru_locks OST0000-osc
1102 wait $pid || error "dd failed"
1104 run_test 28 "lock replay should be ordered: waiting after granted"
# test 29 fragment: requires at least 2 MDTs and 2 clients on different nodes.
# Two parent directories are placed on MDT0 and MDT1. With fail_loc 0x530 /
# fail_val 36 set locally and 0x8000015a on mds2, a remote directory is created
# in the background; CLIENT2 then sets the same 0x530 values and creates a
# directory under the MDT1 parent.
1107 local dir0=$DIR/$tdir/d0
1108 local dir1=$DIR/$tdir/d1
1110 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
1111 [ $CLIENTCOUNT -lt 2 ] && skip "needs >= 2 clients" && return 0
1112 [ "$CLIENT1" == "$CLIENT2" ] &&
1113 skip "clients must be on different nodes" && return 0
1116 $LFS mkdir -i0 $dir0
1117 $LFS mkdir -i1 $dir1
1121 # create a remote dir, drop reply
1122 #define OBD_FAIL_PTLRPC_ROUND_XID 0x530
1123 $LCTL set_param fail_loc=0x530 fail_val=36
1124 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
1125 do_facet mds2 $LCTL set_param fail_loc=0x8000015a
1126 echo make remote dir d0 for $dir0
1127 $LFS mkdir -i1 -c1 $dir0/d3 &
1130 echo make local dir d1 for $dir1
1131 do_node $CLIENT2 $LCTL set_param fail_loc=0x530 fail_val=36
1132 do_node $CLIENT2 mkdir $dir1/d4
1136 run_test 29 "replay vs update with the same xid"
# test 30 fragment: creates a composite-layout file whose first 1 MiB component
# uses the "mdt" layout, writes two 40 KiB ranges, sets fail_loc 0x32e with
# fail_val 4 locally, and reads the file through DIR2 in the background; that
# read must succeed.
1139 $LFS setstripe -E 1m -L mdt -E -1 $DIR/$tfile
1140 #first write to have no problems with grants
1141 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 ||
1142 error "dd on client failed"
1143 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10 seek=10 ||
1144 error "dd on client failed"
1146 #define OBD_FAIL_LDLM_REPLAY_PAUSE 0x32e
1147 lctl set_param fail_loc=0x32e fail_val=4
1148 dd of=/dev/null if=$DIR2/$tfile &
1154 wait $pid || error "dd on client failed"
1156 run_test 30 "layout lock replay is not blocked on IO"
# test 31 fragment: prepares ten subdirectories plus an "mdtdir" using the mdt
# layout, starts an ost1 failover in the background with a delay of
# timeout/2 + 5 seconds, and consumes precreated objects. It then launches
# multiops on the same mdtdir file through both mounts and one "parallel"
# create per max_mod_rpcs_in_flight slot; every collected multiop must exit
# successfully.
1159 mkdir_on_mdt0 $DIR1/$tdir
1160 $LFS setstripe -c 1 -i 0 $DIR1/$tdir
1161 for (( i=0; i < 10; i++ )) ; do
1162 mkdir -p $DIR1/$tdir/d.${i}
1164 mkdir $DIR1/$tdir/mdtdir
1165 $LFS setstripe -E 1M -L mdt $DIR1/$tdir/mdtdir
1167 # failover has to take longer than blocking timeout involved
1168 # by second multiop below which is set to obd_timeout/2 by
1170 local timeout=$(do_facet mds1 $LCTL get_param -n timeout)
1172 timeout=$((timeout / 2 + 5))
1173 fail ost1 $timeout &
1178 # consume preallocated objects, precreate thread will be awakened
1179 consume_precreations $DIR1/$tdir mds1 0 1
1181 # disable AT so that blocking timeout gets set to obd_timeout/2
1182 local amm=$(at_max_get mds1)
# Restore the saved at_max value on mds1 when the test unwinds.
1185 stack_trap "at_max_set $amm mds1"
1189 #define OBD_FAIL_LLITE_XATTR_PAUSE 0x1420
1190 $LCTL set_param fail_loc=0x80001420
1191 $MULTIOP $DIR1/$tdir/mdtdir/$tfile Osw4096c &
1194 $MULTIOP $DIR2/$tdir/mdtdir/$tfile oO_WRONLY:w4096c &
# Use the last reported max_mod_rpcs_in_flight value as the number of
# parallel creates.
1197 local mmrif=$($LCTL get_param -n \
1198 mdc.$FSNAME-MDT0000-mdc-*.max_mod_rpcs_in_flight | tail -1)
1199 # these are blocked by precreation until ost failover is in progress
1200 for (( i=0; i < $mmrif; i++ )) ; do
1201 $MULTIOP $DIR1/$tdir/d.${i}/parallel Oc &
1207 for pid in "${multiops[@]}"; do
1208 wait $pid || ((failed++))
1210 ((failed == 0)) || error "$failed multiops failed"
1212 run_test 31 "deadlock on file_remove_privs and occupied mod rpc slots"
# test 32 fragment: needs at least 2 MDTs. With fail_loc 0x131d / fail_val 10
# set on the MDS, twenty directories are created on MDT index 1; fail_loc 0x726
# is then set locally and later cleared. Finally the dmesg output since this
# test's marker is searched for a client eviction message, which is an error.
1215 (( $MDSCOUNT < 2 )) && skip_env "needs >= 2 MDTs"
1217 # inject a gap with 10th transaction
1218 #define OBD_FAIL_LLOG_ADD_GAP 0x131d
1219 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x0000131d fail_val=10
1220 for ((i=0; i < 20; i++)); do
1221 $LFS setdirstripe -i1 $DIR/$tdir-$i ||
1222 error "can't mkdir $DIR/$tdir-$i"
1225 # prevent update llog cancellation, so next boot MDS has
1226 # process the update llog with gap injected
1227 #define OBD_FAIL_TGT_TXN_NO_CANCEL 0x726
1228 $LCTL set_param fail_loc=0x726
1233 $LCTL set_param fail_loc=0
# TESTNAME with underscores turned into spaces is the marker searched for in
# dmesg; tac + sed keep only the lines logged after its most recent occurrence.
1240 local testid=$(echo $TESTNAME | tr '_' ' ')
1241 dmesg | tac | sed "/$testid/,$ d" | grep "This client was evicted" &&
1242 error "client got evicted due to aborted recovery"
1245 run_test 32 "gap in update llog shouldn't break recovery"
# Reads the feature_incompat value reported by $LR_READER for the facet's
# device and checks it against a flag mask.
# Arguments: $2 - flag mask to test ($1 is presumably the facet name; its
#            assignment is on a line not shown here)
# Returns:   0 when every bit of the mask is set in feature_incompat, else 1
1247 last_rcvd_check_incompat_flag() {
1249 local flag2check="$2"
1250 local dev=$(facet_device $facet)
1253 incompat=$(do_facet $facet $LR_READER $dev |
1254 awk '/feature_incompat:/ {print $2}')
1255 echo "last_rcvd in $dev: incompat = $incompat"
# The arithmetic comparison yields 1 on mismatch and 0 on match, which maps
# directly onto the shell's failure/success return codes.
1257 return $(( (incompat & flag2check) != flag2check ))
# test 33 fragment: runs only on ldiskfs MDTs and on MDS versions >= 2.15.52.86,
# or in the range [2.15.2, 2.15.50). It checks that flag 0x400
# (OBD_INCOMPAT_MULTI_RPCS) is present in last_rcvd after a first failover,
# and again after a second failover whose recovery is aborted.
1261 test_33() { # LU-15935
1262 (( $MDS1_VERSION >= $(version_code 2.15.52.86) )) ||
1263 (( $MDS1_VERSION >= $(version_code 2.15.2) &&
1264 $MDS1_VERSION < $(version_code 2.15.50) )) ||
1265 skip "Need MDS version at least 2.15.52.86 or 2.15.2"
1267 [[ "$mds1_FSTYPE" == "ldiskfs" ]] || skip "ldiskfs only test"
1272 # check for OBD_INCOMPAT_MULTI_RPCS (0x400) in last_rcvd
1273 last_rcvd_check_incompat_flag mds1 0x400 ||
1274 error "1st failover: OBD_INCOMPAT_MULTI_RPCS is not set on MDT0000 last_rcvd"
1276 # lose 1 client while the MDT failover
# Wait until the import is in a replay state, abort recovery on mds1, then
# wait for the import to become FULL again.
1280 wait_clients_import_state "$HOSTNAME" mds1 "\(REPLAY_WAIT\|REPLAY_LOCKS\)"
1282 do_facet mds1 $LCTL --device $(convert_facet2label mds1) abort_recovery
1283 wait_clients_import_state "$HOSTNAME" mds1 "FULL"
# Fixed: the message read "2sd failover".
1286 last_rcvd_check_incompat_flag mds1 0x400 ||
1287 error "2nd failover: OBD_INCOMPAT_MULTI_RPCS is not set on MDT0000 last_rcvd"
1290 zconf_mount $HOSTNAME $MOUNT2
1291 wait_clients_import_state "$HOSTNAME" mds1 "FULL"
1293 run_test 33 "Check for OBD_INCOMPAT_MULTI_RPCS in last_rcvd after abort_recovery"
# Suite teardown: compute the seconds elapsed since NOW and, when that is less
# than TIMEOUT, sleep for that many seconds; unmount MOUNT2 if MOUNTED2 is
# "yes" (failure ignored); then run the common cleanup.
# NOTE(review): NOW and MOUNTED2 are assigned on lines not shown here.
1296 SLEEP=$((SECONDS - $NOW))
1297 [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP
1298 [ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true
1299 check_and_cleanup_lustre