7 # This test needs to be run on the client
10 export MULTIOP=${MULTIOP:-multiop}
11 LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
14 . $LUSTRE/tests/test-framework.sh
16 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
18 CHECK_GRANT=${CHECK_GRANT:-"yes"}
19 GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
21 require_dsh_mds || exit 0
24 # bug number for skipped tests:
25 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT "
26 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
28 # time in minutes: 7.5
29 [ "$SLOW" = "no" ] && EXCEPT_SLOW="44b"
31 if [ $(facet_fstype $SINGLEMDS) = "zfs" ]; then
32 # bug number for skipped test: LU-11388
36 # bug number for skipped tests: LU-9795 (all below)
37 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 0b 0c 0d 34 45"
38 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 47 58b 58c 71a 85a"
39 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 85b 86 88 89 90"
40 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 93a 100a 100b 120"
45 check_and_setup_lustre
50 rm -rf $DIR/[df][0-9]* $DIR/f.$TESTSUITE.*
52 # LU-482 Avert LVM and VM inability to flush caches in pre-2.6.33 kernels
53 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
55 do_facet $SINGLEMDS sync
58 test_0a() { # was test_0
59 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
60 replay_barrier $SINGLEMDS
64 run_test 0a "empty replay"
67 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
69 # this test attempts to trigger a race in the precreation code,
70 # and must run before any other objects are created on the filesystem
72 createmany -o $DIR/$tfile 20 || error "createmany -o $DIR/$tfile failed"
73 unlinkmany $DIR/$tfile 20 || error "unlinkmany $DIR/$tfile failed"
75 run_test 0b "ensure object created after recover exists. (3284)"
78 replay_barrier $SINGLEMDS
81 facet_failover $SINGLEMDS
82 zconf_mount $(hostname) $MOUNT || error "mount fails"
83 client_up || error "post-failover df failed"
84 # file shouldn't exist if replay-barrier works as expected
85 rm $DIR/$tfile && error "File exists and it shouldn't"
88 run_test 0c "check replay-barrier"
91 replay_barrier $SINGLEMDS
93 facet_failover $SINGLEMDS
94 zconf_mount $(hostname) $MOUNT || error "mount fails"
95 client_up || error "post-failover df failed"
97 run_test 0d "expired recovery with no clients"
100 replay_barrier $SINGLEMDS
103 $CHECKSTAT -t file $DIR/$tfile ||
104 error "$CHECKSTAT $DIR/$tfile attribute check failed"
107 run_test 1 "simple create"
110 replay_barrier $SINGLEMDS
113 $CHECKSTAT -t file $DIR/$tfile ||
114 error "$CHECKSTAT $DIR/$tfile attribute check failed"
120 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
121 replay_barrier $SINGLEMDS
124 $CHECKSTAT -t file $DIR/$tfile ||
125 error "$CHECKSTAT $DIR/$tfile attribute check failed"
131 replay_barrier $SINGLEMDS
132 $LFS setstripe -c $OSTCOUNT $DIR/$tfile
134 $CHECKSTAT -t file $DIR/$tfile ||
135 error "$CHECKSTAT $DIR/$tfile check failed"
137 run_test 2c "setstripe replay"
140 [[ $mds1_FSTYPE = "zfs" ]] &&
141 [[ $MDS1_VERSION -lt $(version_code 2.12.51) ]] &&
142 skip "requires LU-10143 fix on MDS"
143 replay_barrier $SINGLEMDS
144 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir
146 $CHECKSTAT -t dir $DIR/$tdir ||
147 error "$CHECKSTAT $DIR/$tdir check failed"
149 run_test 2d "setdirstripe replay"
152 local file=$DIR/$tfile
153 replay_barrier $SINGLEMDS
155 openfile -f O_DIRECTORY $file
157 $CHECKSTAT -t file $file ||
158 error "$CHECKSTAT $file attribute check failed"
161 run_test 3a "replay failed open(O_DIRECTORY)"
164 replay_barrier $SINGLEMDS
165 #define OBD_FAIL_MDS_OPEN_PACK | OBD_FAIL_ONCE
166 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000114"
168 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
170 $CHECKSTAT -t file $DIR/$tfile &&
171 error "$CHECKSTAT $DIR/$tfile attribute check should fail"
174 run_test 3b "replay failed open -ENOMEM"
177 replay_barrier $SINGLEMDS
178 #define OBD_FAIL_MDS_ALLOC_OBDO | OBD_FAIL_ONCE
179 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000128"
181 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
184 $CHECKSTAT -t file $DIR/$tfile &&
185 error "$CHECKSTAT $DIR/$tfile attribute check should fail"
188 run_test 3c "replay failed open -ENOMEM"
190 test_4a() { # was test_4
191 replay_barrier $SINGLEMDS
192 for i in $(seq 10); do
193 echo "tag-$i" > $DIR/$tfile-$i
196 for i in $(seq 10); do
197 grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
200 run_test 4a "|x| 10 open(O_CREAT)s"
203 for i in $(seq 10); do
204 echo "tag-$i" > $DIR/$tfile-$i
206 replay_barrier $SINGLEMDS
209 $CHECKSTAT -t file $DIR/$tfile-* &&
210 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
213 run_test 4b "|x| rm 10 files"
215 # The idea is to get past the first block of precreated files on both
216 # osts, and then replay.
218 replay_barrier $SINGLEMDS
219 for i in $(seq 220); do
220 echo "tag-$i" > $DIR/$tfile-$i
223 for i in $(seq 220); do
224 grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
228 # waiting for commitment of removal
230 run_test 5 "|x| 220 open(O_CREAT)"
232 test_6a() { # was test_6
233 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
234 replay_barrier $SINGLEMDS
235 mcreate $DIR/$tdir/$tfile
237 $CHECKSTAT -t dir $DIR/$tdir ||
238 error "$CHECKSTAT $DIR/$tdir attribute check failed"
239 $CHECKSTAT -t file $DIR/$tdir/$tfile ||
240 error "$CHECKSTAT $DIR/$tdir/$tfile attribute check failed"
242 # waiting for log process thread
244 run_test 6a "mkdir + contained create"
247 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
248 replay_barrier $SINGLEMDS
251 $CHECKSTAT -t dir $DIR/$tdir &&
252 error "$CHECKSTAT $DIR/$tdir attribute check should fail" ||
255 run_test 6b "|X| rmdir"
258 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
259 replay_barrier $SINGLEMDS
260 mcreate $DIR/$tdir/$tfile
262 $CHECKSTAT -t dir $DIR/$tdir ||
263 error "$CHECKSTAT $DIR/$tdir attribute check failed"
264 $CHECKSTAT -t file $DIR/$tdir/$tfile ||
265 error "$CHECKSTAT $DIR/$tdir/$tfile attribute check failed"
268 run_test 7 "mkdir |X| contained create"
271 replay_barrier $SINGLEMDS
272 multiop_bg_pause $DIR/$tfile mo_c ||
273 error "multiop mknod $DIR/$tfile failed"
277 $CHECKSTAT -t file $DIR/$tfile ||
278 error "$CHECKSTAT $DIR/$tfile attribute check failed"
279 kill -USR1 $MULTIPID || error "multiop mknod $MULTIPID not running"
280 wait $MULTIPID || error "multiop mknod $MULTIPID failed"
283 run_test 8 "creat open |X| close"
286 replay_barrier $SINGLEMDS
288 local old_inum=$(ls -i $DIR/$tfile | awk '{print $1}')
290 local new_inum=$(ls -i $DIR/$tfile | awk '{print $1}')
292 echo " old_inum == $old_inum, new_inum == $new_inum"
293 if [ $old_inum -eq $new_inum ] ;
295 echo "old_inum and new_inum match"
297 echo " old_inum and new_inum do not match"
298 error "old index($old_inum) does not match new index($new_inum)"
302 run_test 9 "|X| create (same inum/gen)"
305 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
306 replay_barrier $SINGLEMDS
307 mv $DIR/$tfile $DIR/$tfile-2
310 $CHECKSTAT $DIR/$tfile &&
311 error "$CHECKSTAT $DIR/$tfile attribute check should fail"
312 $CHECKSTAT $DIR/$tfile-2 ||
313 error "$CHECKSTAT $DIR/$tfile-2 attribute check failed"
317 run_test 10 "create |X| rename unlink"
320 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
321 echo "old" > $DIR/$tfile
322 mv $DIR/$tfile $DIR/$tfile-2
323 replay_barrier $SINGLEMDS
324 echo "new" > $DIR/$tfile
326 grep old $DIR/$tfile-2
328 grep new $DIR/$tfile || error "grep $DIR/$tfile failed"
329 grep old $DIR/$tfile-2 || error "grep $DIR/$tfile-2 failed"
331 run_test 11 "create open write rename |X| create-old-name read"
334 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
335 multiop_bg_pause $DIR/$tfile o_tSc ||
336 error "multiop_bg_pause $DIR/$tfile failed"
339 replay_barrier $SINGLEMDS
340 kill -USR1 $pid || error "multiop $pid not running"
341 wait $pid || error "multiop $pid failed"
344 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
347 run_test 12 "open, unlink |X| close"
349 # 1777 - replay open after committed chmod that would make
350 # a regular open a failure
352 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
353 multiop_bg_pause $DIR/$tfile O_wc ||
354 error "multiop_bg_pause $DIR/$tfile failed"
357 $CHECKSTAT -p 0 $DIR/$tfile ||
358 error "$CHECKSTAT $DIR/$tfile attribute check failed"
359 replay_barrier $SINGLEMDS
361 kill -USR1 $pid || error "multiop $pid not running"
362 wait $pid || error "multiop $pid failed"
364 $CHECKSTAT -s 1 -p 0 $DIR/$tfile ||
365 error "second $CHECKSTAT $DIR/$tfile attribute check failed"
366 rm $DIR/$tfile || error "rm $DIR/$tfile failed"
369 run_test 13 "open chmod 0 |x| write close"
372 multiop_bg_pause $DIR/$tfile O_tSc ||
373 error "multiop_bg_pause $DIR/$tfile failed"
376 replay_barrier $SINGLEMDS
377 kill -USR1 $pid || error "multiop $pid not running"
378 wait $pid || error "multiop $pid failed"
381 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
384 run_test 14 "open(O_CREAT), unlink |X| close"
387 multiop_bg_pause $DIR/$tfile O_tSc ||
388 error "multiop_bg_pause $DIR/$tfile failed"
391 replay_barrier $SINGLEMDS
392 touch $DIR/$tfile-1 || error "touch $DIR/$tfile-1 failed"
393 kill -USR1 $pid || error "multiop $pid not running"
394 wait $pid || error "multiop $pid failed"
397 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
398 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
401 run_test 15 "open(O_CREAT), unlink |X| touch new, close"
404 replay_barrier $SINGLEMDS
407 mcreate $DIR/$tfile-2
409 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
410 [ -e $DIR/$tfile-2 ] || error "file $DIR/$tfile-2 does not exist"
411 munlink $DIR/$tfile-2 || error "munlink $DIR/$tfile-2 failed"
413 run_test 16 "|X| open(O_CREAT), unlink, touch new, unlink new"
416 replay_barrier $SINGLEMDS
417 multiop_bg_pause $DIR/$tfile O_c ||
418 error "multiop_bg_pause $DIR/$tfile failed"
421 kill -USR1 $pid || error "multiop $pid not running"
422 wait $pid || error "multiop $pid failed"
423 $CHECKSTAT -t file $DIR/$tfile ||
424 error "$CHECKSTAT $DIR/$tfile attribute check failed"
427 run_test 17 "|X| open(O_CREAT), |replay| close"
430 replay_barrier $SINGLEMDS
431 multiop_bg_pause $DIR/$tfile O_tSc ||
432 error "multiop_bg_pause $DIR/$tfile failed"
435 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
436 echo "pid: $pid will close"
437 kill -USR1 $pid || error "multiop $pid not running"
438 wait $pid || error "multiop $pid failed"
441 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
442 [ -e $DIR/$tfile-2 ] || error "file $DIR/$tfile-2 does not exist"
443 # this touch frequently fails
444 touch $DIR/$tfile-3 || error "touch $DIR/$tfile-3 failed"
445 munlink $DIR/$tfile-2 || error "munlink $DIR/$tfile-2 failed"
446 munlink $DIR/$tfile-3 || error "munlink $DIR/$tfile-3 failed"
449 run_test 18 "open(O_CREAT), unlink, touch new, close, touch, unlink"
451 # bug 1855 (a simpler form of test_11 above)
453 replay_barrier $SINGLEMDS
455 echo "old" > $DIR/$tfile
456 mv $DIR/$tfile $DIR/$tfile-2
457 grep old $DIR/$tfile-2
459 grep old $DIR/$tfile-2 || error "grep $DIR/$tfile-2 failed"
461 run_test 19 "mcreate, open, write, rename "
463 test_20a() { # was test_20
464 replay_barrier $SINGLEMDS
465 multiop_bg_pause $DIR/$tfile O_tSc ||
466 error "multiop_bg_pause $DIR/$tfile failed"
471 kill -USR1 $pid || error "multiop $pid not running"
472 wait $pid || error "multiop $pid failed"
473 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
476 run_test 20a "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
478 test_20b() { # bug 10480
479 local wait_timeout=$((TIMEOUT * 4))
480 local extra=$(fs_log_size)
484 $LFS setstripe -i 0 -c 1 $DIR
486 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
488 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 &
489 while [ ! -e $DIR/$tfile ] ; do
490 usleep 60 # give dd a chance to start
493 $LFS getstripe $DIR/$tfile || error "$LFS getstripe $DIR/$tfile failed"
495 rm -f $DIR/$tfile || error "rm -f $DIR/$tfile failed"
497 client_up || client_up || true # reconnect
499 do_facet $SINGLEMDS "lctl set_param -n osd*.*MDT*.force_sync=1"
501 fail $SINGLEMDS # start orphan recovery
502 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
503 wait_delete_completed $wait_timeout || error "delete did not finish"
507 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
508 log "before $beforeused, after $afterused"
510 (( $beforeused + $extra >= $afterused )) && break
511 n_attempts=$((n_attempts + 1))
512 [ $n_attempts -gt 3 ] &&
513 error "after $afterused > before $beforeused + $extra"
515 wait_zfs_commit $SINGLEMDS 5
520 run_test 20b "write, unlink, eviction, replay (test mds_cleanup_orphans)"
522 test_20c() { # bug 10480
523 multiop_bg_pause $DIR/$tfile Ow_c ||
524 error "multiop_bg_pause $DIR/$tfile failed"
530 client_up || client_up || true # reconnect
532 kill -USR1 $pid || error "multiop $pid not running"
533 wait $pid || error "multiop $pid failed"
534 [ -s $DIR/$tfile ] || error "File was truncated"
538 run_test 20c "check that client eviction does not affect file content"
541 replay_barrier $SINGLEMDS
542 multiop_bg_pause $DIR/$tfile O_tSc ||
543 error "multiop_bg_pause $DIR/$tfile failed"
546 touch $DIR/$tfile-1 || error "touch $DIR/$tfile-1 failed"
549 kill -USR1 $pid || error "multiop $pid not running"
550 wait $pid || error "multiop $pid failed"
551 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
552 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
555 run_test 21 "|X| open(O_CREAT), unlink touch new, replay, close (test mds_cleanup_orphans)"
558 multiop_bg_pause $DIR/$tfile O_tSc ||
559 error "multiop_bg_pause $DIR/$tfile failed"
562 replay_barrier $SINGLEMDS
566 kill -USR1 $pid || error "multiop $pid not running"
567 wait $pid || error "multiop $pid failed"
568 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
571 run_test 22 "open(O_CREAT), |X| unlink, replay, close (test mds_cleanup_orphans)"
574 multiop_bg_pause $DIR/$tfile O_tSc ||
575 error "multiop_bg_pause $DIR/$tfile failed"
578 replay_barrier $SINGLEMDS
580 touch $DIR/$tfile-1 || error "touch $DIR/$tfile-1 failed"
583 kill -USR1 $pid || error "multiop $pid not running"
584 wait $pid || error "multiop $pid failed"
585 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
586 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
589 run_test 23 "open(O_CREAT), |X| unlink touch new, replay, close (test mds_cleanup_orphans)"
592 multiop_bg_pause $DIR/$tfile O_tSc ||
593 error "multiop_bg_pause $DIR/$tfile failed"
596 replay_barrier $SINGLEMDS
599 kill -USR1 $pid || error "multiop $pid not running"
600 wait $pid || error "multiop $pid failed"
601 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
604 run_test 24 "open(O_CREAT), replay, unlink, close (test mds_cleanup_orphans)"
607 multiop_bg_pause $DIR/$tfile O_tSc ||
608 error "multiop_bg_pause $DIR/$tfile failed"
612 replay_barrier $SINGLEMDS
614 kill -USR1 $pid || error "multiop $pid not running"
615 wait $pid || error "multiop $pid failed"
616 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
619 run_test 25 "open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
622 replay_barrier $SINGLEMDS
623 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
624 error "multiop_bg_pause $DIR/$tfile-1 failed"
626 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
627 error "multiop_bg_pause $DIR/$tfile-2 failed"
631 kill -USR1 $pid2 || error "second multiop $pid2 not running"
632 wait $pid2 || error "second multiop $pid2 failed"
635 kill -USR1 $pid1 || error "multiop $pid1 not running"
636 wait $pid1 || error "multiop $pid1 failed"
637 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
638 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
641 run_test 26 "|X| open(O_CREAT), unlink two, close one, replay, close one (test mds_cleanup_orphans)"
644 replay_barrier $SINGLEMDS
645 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
646 error "multiop_bg_pause $DIR/$tfile-1 failed"
648 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
649 error "multiop_bg_pause $DIR/$tfile-2 failed"
655 kill -USR1 $pid1 || error "multiop $pid1 not running"
656 wait $pid1 || error "multiop $pid1 failed"
657 kill -USR1 $pid2 || error "second multiop $pid2 not running"
658 wait $pid2 || error "second multiop $pid2 failed"
659 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
660 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
663 run_test 27 "|X| open(O_CREAT), unlink two, replay, close two (test mds_cleanup_orphans)"
666 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
667 error "multiop_bg_pause $DIR/$tfile-1 failed"
669 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
670 error "multiop_bg_pause $DIR/$tfile-2 failed"
672 replay_barrier $SINGLEMDS
675 kill -USR1 $pid2 || error "second multiop $pid2 not running"
676 wait $pid2 || error "second multiop $pid2 failed"
679 kill -USR1 $pid1 || error "multiop $pid1 not running"
680 wait $pid1 || error "multiop $pid1 failed"
681 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
682 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
685 run_test 28 "open(O_CREAT), |X| unlink two, close one, replay, close one (test mds_cleanup_orphans)"
688 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
689 error "multiop_bg_pause $DIR/$tfile-1 failed"
691 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
692 error "multiop_bg_pause $DIR/$tfile-2 failed"
694 replay_barrier $SINGLEMDS
699 kill -USR1 $pid1 || error "multiop $pid1 not running"
700 wait $pid1 || error "multiop $pid1 failed"
701 kill -USR1 $pid2 || error "second multiop $pid2 not running"
702 wait $pid2 || error "second multiop $pid2 failed"
703 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
704 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
707 run_test 29 "open(O_CREAT), |X| unlink two, replay, close two (test mds_cleanup_orphans)"
710 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
711 error "multiop_bg_pause $DIR/$tfile-1 failed"
713 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
714 error "multiop_bg_pause $DIR/$tfile-2 failed"
719 replay_barrier $SINGLEMDS
721 kill -USR1 $pid1 || error "multiop $pid1 not running"
722 wait $pid1 || error "multiop $pid1 failed"
723 kill -USR1 $pid2 || error "second multiop $pid2 not running"
724 wait $pid2 || error "second multiop $pid2 failed"
725 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
726 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
729 run_test 30 "open(O_CREAT) two, unlink two, replay, close two (test mds_cleanup_orphans)"
732 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
733 error "multiop_bg_pause $DIR/$tfile-1 failed"
735 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
736 error "multiop_bg_pause $DIR/$tfile-2 failed"
740 replay_barrier $SINGLEMDS
743 kill -USR1 $pid1 || error "multiop $pid1 not running"
744 wait $pid1 || error "multiop $pid1 failed"
745 kill -USR1 $pid2 || error "second multiop $pid2 not running"
746 wait $pid2 || error "second multiop $pid2 failed"
747 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
748 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
751 run_test 31 "open(O_CREAT) two, unlink one, |X| unlink one, close two (test mds_cleanup_orphans)"
753 # tests for bug 2104; completion without crashing is success. The close is
754 # stale, but we always return 0 for close, so the app never sees it.
756 multiop_bg_pause $DIR/$tfile O_c ||
757 error "multiop_bg_pause $DIR/$tfile failed"
759 multiop_bg_pause $DIR/$tfile O_c ||
760 error "second multiop_bg_pause $DIR/$tfile failed"
763 client_up || client_up || error "client_up failed"
764 kill -USR1 $pid1 || error "multiop $pid1 not running"
765 kill -USR1 $pid2 || error "second multiop $pid2 not running"
766 wait $pid1 || error "multiop $pid1 failed"
767 wait $pid2 || error "second multiop $pid2 failed"
770 run_test 32 "close() notices client eviction; close() after client eviction"
773 createmany -o $DIR/$tfile-%d 10 ||
774 error "createmany create $DIR/$tfile failed"
775 replay_barrier_nosync $SINGLEMDS
776 fail_abort $SINGLEMDS
777 # recreate shouldn't fail
778 createmany -o $DIR/$tfile--%d 10 ||
779 error "createmany recreate $DIR/$tfile failed"
783 run_test 33a "fid seq shouldn't be reused after abort recovery"
786 #define OBD_FAIL_SEQ_ALLOC 0x1311
787 do_facet $SINGLEMDS "lctl set_param fail_loc=0x1311"
789 createmany -o $DIR/$tfile-%d 10
790 replay_barrier_nosync $SINGLEMDS
791 fail_abort $SINGLEMDS
792 # recreate shouldn't fail
793 createmany -o $DIR/$tfile--%d 10 ||
794 error "createmany recreate $DIR/$tfile failed"
798 run_test 33b "test fid seq allocation"
801 multiop_bg_pause $DIR/$tfile O_c ||
802 error "multiop_bg_pause $DIR/$tfile failed"
806 replay_barrier $SINGLEMDS
807 fail_abort $SINGLEMDS
808 kill -USR1 $pid || error "multiop $pid not running"
809 wait $pid || error "multiop $pid failed"
810 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
814 run_test 34 "abort recovery before client does replay (test mds_cleanup_orphans)"
816 # bug 2278 - generate one orphan on OST, then destroy it during recovery from llog
818 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
820 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
821 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000119"
826 # give a chance to remove from MDS
827 fail_abort $SINGLEMDS
828 $CHECKSTAT -t file $DIR/$tfile &&
829 error "$CHECKSTAT $DIR/$tfile attribute check should fail" ||
832 run_test 35 "test recovery from llog for unlink op"
834 # b=2432 resent cancel after replay uses wrong cookie,
835 # so don't resend cancels
837 replay_barrier $SINGLEMDS
839 checkstat $DIR/$tfile
840 facet_failover $SINGLEMDS
842 if dmesg | grep "unknown lock cookie"; then
843 error "cancel after replay failed"
846 run_test 36 "don't resend cancel"
849 # directory orphans can't be unlinked from PENDING directory
851 rmdir $DIR/$tfile 2>/dev/null
852 multiop_bg_pause $DIR/$tfile dD_c ||
853 error "multiop_bg_pause $DIR/$tfile failed"
857 replay_barrier $SINGLEMDS
858 # clear the dmesg buffer so we only see errors from this recovery
859 do_facet $SINGLEMDS dmesg -c >/dev/null
860 fail_abort $SINGLEMDS
861 kill -USR1 $pid || error "multiop $pid not running"
862 do_facet $SINGLEMDS dmesg | grep "error .* unlinking .* from PENDING" &&
863 error "error unlinking files"
864 wait $pid || error "multiop $pid failed"
868 run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
871 createmany -o $DIR/$tfile-%d 800 ||
872 error "createmany -o $DIR/$tfile failed"
873 unlinkmany $DIR/$tfile-%d 0 400 || error "unlinkmany $DIR/$tfile failed"
874 replay_barrier $SINGLEMDS
876 unlinkmany $DIR/$tfile-%d 400 400 ||
877 error "unlinkmany $DIR/$tfile 400 failed"
879 $CHECKSTAT -t file $DIR/$tfile-* &&
880 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
883 run_test 38 "test recovery from unlink llog (test llog_gen_rec) "
885 test_39() { # bug 4176
886 createmany -o $DIR/$tfile-%d 800 ||
887 error "createmany -o $DIR/$tfile failed"
888 replay_barrier $SINGLEMDS
889 unlinkmany $DIR/$tfile-%d 0 400
891 unlinkmany $DIR/$tfile-%d 400 400 ||
892 error "unlinkmany $DIR/$tfile 400 failed"
894 $CHECKSTAT -t file $DIR/$tfile-* &&
895 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
898 run_test 39 "test recovery from unlink llog (test llog_gen_rec) "
901 lctl get_param -n osc.*.stats | awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }'
906 # always need connection to MDS to verify layout during IO. LU-2628.
907 lctl get_param mdc.*.connect_flags | grep -q layout_lock &&
908 skip "layout_lock needs MDS connection for IO" && return 0
910 $LCTL mark multiop $MOUNT/$tfile OS_c
911 multiop $MOUNT/$tfile OS_c &
913 writeme -s $MOUNT/${tfile}-2 &
916 facet_failover $SINGLEMDS
917 #define OBD_FAIL_MDS_CONNECT_NET 0x117
918 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000117"
920 stat1=$(count_ost_writes)
922 stat2=$(count_ost_writes)
923 echo "$stat1, $stat2"
924 if [ $stat1 -lt $stat2 ]; then
925 echo "writes continuing during recovery"
928 echo "writes not continuing during recovery, bug 2477"
931 echo "waiting for writeme $WRITE_PID"
935 echo "waiting for multiop $PID"
936 wait $PID || error "multiop $PID failed"
937 do_facet client munlink $MOUNT/$tfile ||
938 error "munlink $MOUNT/$tfile failed"
939 do_facet client munlink $MOUNT/${tfile}-2 ||
940 error "munlink $MOUNT/$tfile-2 failed"
943 run_test 40 "cause recovery in ptlrpc, ensure IO continues"
946 # make sure that a read to one osc doesn't try to double-unlock its page just
947 # because another osc is invalid. trigger_group_io used to mistakenly return
948 # an error if any oscs were invalid even after having successfully put rpcs
949 # on valid oscs. This was fatal if the caller was ll_readpage who unlocked
950 # the page, guaranteeing that the unlock from the RPC completion would
951 # assert on trying to unlock the unlocked page.
953 [ $OSTCOUNT -lt 2 ] && skip_env "needs >= 2 OSTs" && return
955 local f=$MOUNT/$tfile
956 # make sure the start of the file is ost1
957 $LFS setstripe -S $((128 * 1024)) -i 0 $f
958 do_facet client dd if=/dev/zero of=$f bs=4k count=1 ||
959 error "dd on client failed"
961 # fail ost2 and read from ost1
962 local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $ost2_svc)
963 local osc2dev=$(do_facet $SINGLEMDS "lctl get_param -n devices" |
964 grep $mdtosc | awk '{print $1}')
965 [ -z "$osc2dev" ] && echo "OST: $ost2_svc" &&
966 lctl get_param -n devices &&
967 error "OST 2 $osc2dev does not exist"
968 do_facet $SINGLEMDS $LCTL --device $osc2dev deactivate ||
969 error "deactive device on $SINGLEMDS failed"
970 do_facet client dd if=$f of=/dev/null bs=4k count=1 ||
971 error "second dd on client failed"
972 do_facet $SINGLEMDS $LCTL --device $osc2dev activate ||
973 error "active device on $SINGLEMDS failed"
976 run_test 41 "read from a valid osc while other oscs are invalid"
978 # test MDS recovery after ost failure
980 blocks=$(df -P $MOUNT | tail -n 1 | awk '{ print $2 }')
981 createmany -o $DIR/$tfile-%d 800 ||
982 error "createmany -o $DIR/$tfile failed"
984 unlinkmany $DIR/$tfile-%d 0 400
986 lctl set_param debug=-1
989 # osc is evicted, fs is smaller (but only with failout OSTs, bug 7287)
990 #blocks_after=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
991 #[ $blocks_after -lt $blocks ] || return 1
992 echo "wait for MDS to timeout and recover"
993 sleep $((TIMEOUT * 2))
995 unlinkmany $DIR/$tfile-%d 400 400 ||
996 error "unlinkmany $DIR/$tfile 400 failed"
997 $CHECKSTAT -t file $DIR/$tfile-* &&
998 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
1001 run_test 42 "recovery after ost failure"
1003 # timeout in MDS/OST recovery RPC will LBUG MDS
1004 test_43() { # bug 2530
1005 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1007 replay_barrier $SINGLEMDS
1009 # OBD_FAIL_OST_CREATE_NET 0x204
1010 do_facet ost1 "lctl set_param fail_loc=0x80000204"
1016 run_test 43 "mds osc import failure during recovery; don't LBUG"
1018 test_44a() { # was test_44
1019 local at_max_saved=0
1021 local mdcdev=$($LCTL dl |
1022 awk "/${FSNAME}-MDT0000-mdc-/ {if (\$2 == \"UP\") {print \$1}}")
1023 [ "$mdcdev" ] || error "${FSNAME}-MDT0000-mdc- not UP"
1024 [ $(echo $mdcdev | wc -w) -eq 1 ] ||
1025 { $LCTL dl; error "looking for mdcdev=$mdcdev"; }
1027 # adaptive timeouts slow this way down
1028 if at_is_enabled; then
1029 at_max_saved=$(at_max_get mds)
1033 for i in $(seq 1 10); do
1034 echo "$i of 10 ($(date +%s))"
1035 do_facet $SINGLEMDS \
1036 "lctl get_param -n md[ts].*.mdt.timeouts | grep service"
1037 #define OBD_FAIL_TGT_CONN_RACE 0x701
1038 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
1039 # the lctl call below may fail; that is a valid case
1040 $LCTL --device $mdcdev recover
1043 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1044 [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
1047 run_test 44a "race in target handle connect"
1050 local mdcdev=$($LCTL dl |
1051 awk "/${FSNAME}-MDT0000-mdc-/ {if (\$2 == \"UP\") {print \$1}}")
1052 [ "$mdcdev" ] || error "${FSNAME}-MDT0000-mdc not up"
1053 [ $(echo $mdcdev | wc -w) -eq 1 ] ||
1054 { echo mdcdev=$mdcdev; $LCTL dl;
1055 error "more than one ${FSNAME}-MDT0000-mdc"; }
1057 for i in $(seq 1 10); do
1058 echo "$i of 10 ($(date +%s))"
1059 do_facet $SINGLEMDS \
1060 "lctl get_param -n md[ts].*.mdt.timeouts | grep service"
1061 #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
1062 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
1063 # the lctl call below may fail; that is a valid case
1064 $LCTL --device $mdcdev recover
1069 run_test 44b "race in target handle connect"
1072 replay_barrier $SINGLEMDS
1073 createmany -m $DIR/$tfile-%d 100 || error "failed to create directories"
1074 #define OBD_FAIL_TGT_RCVG_FLAG 0x712
1075 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000712"
1076 fail_abort $SINGLEMDS
1077 unlinkmany $DIR/$tfile-%d 100 && error "unliked after fail abort"
1079 unlinkmany $DIR/$tfile-%d 100 && error "unliked after fail"
1082 run_test 44c "race in target handle connect"
1084 # Handle failed close
1086 local mdcdev=$($LCTL get_param -n devices |
1087 awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}")
1088 [ "$mdcdev" ] || error "${FSNAME}-MDT0000-mdc not up"
1089 [ $(echo $mdcdev | wc -w) -eq 1 ] ||
1090 { echo mdcdev=$mdcdev; $LCTL dl;
1091 error "more than one ${FSNAME}-MDT0000-mdc"; }
1093 $LCTL --device $mdcdev recover ||
1094 error "$LCTL --device $mdcdev recover failed"
1096 multiop_bg_pause $DIR/$tfile O_c ||
1097 error "multiop_bg_pause $DIR/$tfile failed"
1100 # This will cause the CLOSE to fail before even
1101 # allocating a reply buffer
1102 $LCTL --device $mdcdev deactivate ||
1103 error "$LCTL --device $mdcdev deactivate failed"
1106 kill -USR1 $pid || error "multiop $pid not running"
1107 wait $pid || error "multiop $pid failed"
1109 $LCTL --device $mdcdev activate ||
1110 error "$LCTL --device $mdcdev activate failed"
1113 $CHECKSTAT -t file $DIR/$tfile ||
1114 error "$CHECKSTAT $DIR/$tfile attribute check failed"
1117 run_test 45 "Handle failed close"
1121 drop_reply "touch $DIR/$tfile"
1123 # ironically, the previous test, 45, will cause a real forced close,
1124 # so just look for one for this test
1125 dmesg | grep -i "force closing client file handle for $tfile" &&
1126 error "found force closing in dmesg"
1129 run_test 46 "Don't leak file handle after open resend (3325)"
1131 test_47() { # bug 2824
1132 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1134 # create some files to make sure precreate has been done on all
1135 # OSTs. (just in case this test is run independently)
1136 createmany -o $DIR/$tfile 20 ||
1137 error "createmany create $DIR/$tfile failed"
1139 # OBD_FAIL_OST_CREATE_NET 0x204
1141 do_facet ost1 "lctl set_param fail_loc=0x80000204"
1142 client_up || error "client_up failed"
1144 # let the MDS discover the OST failure, attempt to recover, fail
1145 # and recover again.
1146 sleep $((3 * TIMEOUT))
1148 # Without 2824, this createmany would hang
1149 createmany -o $DIR/$tfile 20 ||
1150 error "createmany recraete $DIR/$tfile failed"
1151 unlinkmany $DIR/$tfile 20 || error "unlinkmany $DIR/$tfile failed"
1155 run_test 47 "MDS->OSC failure during precreate cleanup (2824)"
1158 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1159 [ "$OSTCOUNT" -lt "2" ] && skip_env "needs >= 2 OSTs" && return
1161 replay_barrier $SINGLEMDS
1162 createmany -o $DIR/$tfile 20 ||
1163 error "createmany -o $DIR/$tfile failed"
1164 # OBD_FAIL_OST_EROFS 0x216
1165 facet_failover $SINGLEMDS
1166 do_facet ost1 "lctl set_param fail_loc=0x80000216"
1167 client_up || error "client_up failed"
1169 # let the MDS discover the OST failure, attempt to recover, fail
1170 # and recover again.
1171 sleep $((3 * TIMEOUT))
1173 createmany -o $DIR/$tfile 20 20 ||
1174 error "createmany recraete $DIR/$tfile failed"
1175 unlinkmany $DIR/$tfile 40 || error "unlinkmany $DIR/$tfile failed"
1178 run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
1181 local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $ost1_svc)
1182 local oscdev=$(do_facet $SINGLEMDS "lctl get_param -n devices" |
1183 grep $mdtosc | awk '{print $1}')
1184 [ "$oscdev" ] || error "could not find OSC device on MDS"
1185 do_facet $SINGLEMDS $LCTL --device $oscdev recover ||
1186 error "OSC device $oscdev recovery failed"
1187 do_facet $SINGLEMDS $LCTL --device $oscdev recover ||
1188 error "second OSC device $oscdev recovery failed"
1189 # give the mds_lov_sync threads a chance to run
1192 run_test 50 "Double OSC recovery, don't LASSERT (3812)"
1194 # b3764 timed out lock replay
1196 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.6.90) ] &&
1197 skip "MDS prior to 2.6.90 handle LDLM_REPLY_NET incorrectly" &&
1200 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
1201 cancel_lru_locks mdc
1203 multiop_bg_pause $DIR/$tfile s_s || error "multiop $DIR/$tfile failed"
1206 #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
1207 lctl set_param -n ldlm.cancel_unused_locks_before_replay "0"
1208 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000157"
1210 fail $SINGLEMDS || error "fail $SINGLEMDS failed"
1212 wait $mpid || error "multiop_bg_pause pid failed"
1214 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1215 lctl set_param fail_loc=0x0
1216 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
1219 run_test 52 "time out lock replay (3764)"
1221 # bug 3462 - simultaneous MDC requests
1223 [[ $(lctl get_param mdc.*.import |
1224 grep "connect_flags:.*multi_mod_rpc") ]] ||
1225 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
1227 cancel_lru_locks mdc # cleanup locks from former test cases
1228 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1229 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1230 multiop $DIR/${tdir}-1/f O_c &
1232 # give multiop a chance to open
1235 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1236 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
1237 kill -USR1 $close_pid
1238 cancel_lru_locks mdc # force the close
1239 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1241 mcreate $DIR/${tdir}-2/f || error "mcreate $DIR/${tdir}-2/f failed"
1243 # close should still be here
1244 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1246 replay_barrier_nodf $SINGLEMDS
1248 wait $close_pid || error "close_pid $close_pid failed"
1250 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1251 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1252 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1253 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1254 rm -rf $DIR/${tdir}-*
1256 run_test 53a "|X| close request while two MDC requests in flight"
1259 cancel_lru_locks mdc # cleanup locks from former test cases
1261 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1262 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1263 multiop_bg_pause $DIR/${tdir}-1/f O_c ||
1264 error "multiop_bg_pause $DIR/${tdir}-1/f failed"
1267 #define OBD_FAIL_MDS_REINT_NET 0x107
1268 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000107"
1269 mcreate $DIR/${tdir}-2/f &
1273 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1274 kill -USR1 $close_pid
1275 cancel_lru_locks mdc # force the close
1276 wait $close_pid || error "close_pid $close_pid failed"
1277 # open should still be here
1278 [ -d /proc/$open_pid ] || error "open_pid doesn't exist"
1280 replay_barrier_nodf $SINGLEMDS
1282 wait $open_pid || error "open_pid failed"
1284 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1285 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1286 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1287 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1288 rm -rf $DIR/${tdir}-*
1290 run_test 53b "|X| open request while two MDC requests in flight"
1293 cancel_lru_locks mdc # cleanup locks from former test cases
1295 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1296 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1297 multiop $DIR/${tdir}-1/f O_c &
1300 #define OBD_FAIL_MDS_REINT_NET 0x107
1301 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000107"
1302 mcreate $DIR/${tdir}-2/f &
1306 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1307 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
1308 kill -USR1 $close_pid
1309 cancel_lru_locks mdc # force the close
1311 #bz20647: make sure all pids exist before failover
1312 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1313 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1314 replay_barrier_nodf $SINGLEMDS
1315 fail_nodf $SINGLEMDS
1316 wait $open_pid || error "open_pid failed"
1318 # close should be gone
1319 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1320 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1322 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1323 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1324 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1325 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1326 rm -rf $DIR/${tdir}-*
1328 run_test 53c "|X| open request and close request while two MDC requests in flight"
1331 [[ $(lctl get_param mdc.*.import |
1332 grep "connect_flags:.*multi_mod_rpc") ]] ||
1333 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
1335 cancel_lru_locks mdc # cleanup locks from former test cases
1337 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1338 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1339 multiop $DIR/${tdir}-1/f O_c &
1341 # give multiop a chance to open
1344 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1345 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013b"
1346 kill -USR1 $close_pid
1347 cancel_lru_locks mdc # force the close
1348 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1349 mcreate $DIR/${tdir}-2/f || error "mcreate $DIR/${tdir}-2/f failed"
1351 # close should still be here
1352 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1354 wait $close_pid || error "close_pid failed"
1356 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1357 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1358 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1359 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1360 rm -rf $DIR/${tdir}-*
1362 run_test 53d "close reply while two MDC requests in flight"
1365 cancel_lru_locks mdc # cleanup locks from former test cases
1367 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1368 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1369 multiop $DIR/${tdir}-1/f O_c &
1372 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1373 do_facet $SINGLEMDS "lctl set_param fail_loc=0x119"
1374 mcreate $DIR/${tdir}-2/f &
1378 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1379 kill -USR1 $close_pid
1380 cancel_lru_locks mdc # force the close
1381 wait $close_pid || error "close_pid failed"
1382 # open should still be here
1383 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1385 replay_barrier_nodf $SINGLEMDS
1387 wait $open_pid || error "open_pid failed"
1389 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1390 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1391 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1392 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1393 rm -rf $DIR/${tdir}-*
1395 run_test 53e "|X| open reply while two MDC requests in flight"
1398 cancel_lru_locks mdc # cleanup locks from former test cases
1400 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1401 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1402 multiop $DIR/${tdir}-1/f O_c &
1405 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1406 do_facet $SINGLEMDS "lctl set_param fail_loc=0x119"
1407 mcreate $DIR/${tdir}-2/f &
1411 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1412 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013b"
1413 kill -USR1 $close_pid
1414 cancel_lru_locks mdc # force the close
1416 #bz20647: make sure all pids exist before failover
1417 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1418 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1419 replay_barrier_nodf $SINGLEMDS
1420 fail_nodf $SINGLEMDS
1421 wait $open_pid || error "open_pid failed"
1423 # close should be gone
1424 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1425 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1427 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1428 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1429 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1430 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1431 rm -rf $DIR/${tdir}-*
1433 run_test 53f "|X| open reply and close reply while two MDC requests in flight"
1436 cancel_lru_locks mdc # cleanup locks from former test cases
1438 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1439 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1440 multiop $DIR/${tdir}-1/f O_c &
1443 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1444 do_facet $SINGLEMDS "lctl set_param fail_loc=0x119"
1445 mcreate $DIR/${tdir}-2/f &
1449 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1450 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
1451 kill -USR1 $close_pid
1452 cancel_lru_locks mdc # force the close
1453 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1455 #bz20647: make sure all pids exist before failover
1456 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1457 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1458 replay_barrier_nodf $SINGLEMDS
1459 fail_nodf $SINGLEMDS
1460 wait $open_pid || error "open_pid failed"
1462 # close should be gone
1463 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1465 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1466 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1467 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1468 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1469 rm -rf $DIR/${tdir}-*
1471 run_test 53g "|X| drop open reply and close request while close and open are both in flight"
1474 cancel_lru_locks mdc # cleanup locks from former test cases
1476 mkdir $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1477 mkdir $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1478 multiop $DIR/${tdir}-1/f O_c &
1481 #define OBD_FAIL_MDS_REINT_NET 0x107
1482 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000107"
1483 mcreate $DIR/${tdir}-2/f &
1487 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1488 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013b"
1489 kill -USR1 $close_pid
1490 cancel_lru_locks mdc # force the close
1493 #bz20647: make sure all pids exist before failover
1494 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1495 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1496 replay_barrier_nodf $SINGLEMDS
1497 fail_nodf $SINGLEMDS
1498 wait $open_pid || error "open_pid failed"
1500 # close should be gone
1501 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1502 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1504 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1505 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1506 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1507 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1508 rm -rf $DIR/${tdir}-*
1510 run_test 53h "open request and close reply while two MDC requests in flight"
1512 #b3761 ASSERTION(hash != 0) failed
1514 # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
1515 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012b"
1517 # give touch a chance to run
1519 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1523 run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
1525 #b3440 ASSERTION(rec->ur_fid2->id) failed
1527 ln -s foo $DIR/$tfile
1528 replay_barrier $SINGLEMDS
1529 #drop_reply "cat $DIR/$tfile"
1533 run_test 56 "don't replay a symlink open request (3440)"
1535 # recover one mds-ost setattr from llog
1537 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1538 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012c"
1539 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
1540 replay_barrier $SINGLEMDS
1542 wait_recovery_complete $SINGLEMDS || error "MDS recovery is not done"
1543 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
1544 $CHECKSTAT -t file $DIR/$tfile ||
1545 error "$CHECKSTAT $DIR/$tfile attribute check failed"
1546 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1549 run_test 57 "test recovery from llog for setattr op"
1552 zconf_umount $(hostname) $MOUNT2
1556 # recover many mds-ost setattrs from llog
1558 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1559 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1560 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012c"
1561 createmany -o $DIR/$tdir/$tfile-%d 2500
1562 replay_barrier $SINGLEMDS
1565 $CHECKSTAT -t file $DIR/$tdir/$tfile-* >/dev/null ||
1566 error "$CHECKSTAT $DIR/$tfile-* attribute check failed"
1567 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1568 unlinkmany $DIR/$tdir/$tfile-%d 2500 ||
1569 error "unlinkmany $DIR/$tfile failed"
1572 run_test 58a "test recovery from llog for setattr op (test llog_gen_rec)"
1578 trap cleanup_58 EXIT
1580 large_xattr_enabled &&
1581 orig="$(generate_string $(max_xattr_size))" || orig="bar"
1582 # Original extended attribute can be long. Print a small version of
1583 # attribute if an error occurs
1584 local sm_msg=$(printf "%.9s" $orig)
1586 mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed"
1587 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1588 touch $DIR/$tdir/$tfile || error "touch $DIR/$tdir/$tfile failed"
1589 replay_barrier $SINGLEMDS
1590 setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile
1592 new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile)
1593 [[ "$new" = "$orig" ]] ||
1594 error "xattr set ($sm_msg...) differs from xattr get ($new)"
1595 rm -f $DIR/$tdir/$tfile
1598 wait_clients_import_state ${CLIENTS:-$HOSTNAME} "mgs" FULL
1600 run_test 58b "test replay of setxattr op"
1602 test_58c() { # bug 16570
1607 trap cleanup_58 EXIT
1609 if large_xattr_enabled; then
1610 local xattr_size=$(max_xattr_size)
1611 orig="$(generate_string $((xattr_size / 2)))"
1612 orig1="$(generate_string $xattr_size)"
1618 # PING_INTERVAL max(obd_timeout / 4, 1U)
1619 sleep $((TIMEOUT / 4))
1621 # Original extended attribute can be long. Print a small version of
1622 # attribute if an error occurs
1623 local sm_msg=$(printf "%.9s" $orig)
1624 local sm_msg1=$(printf "%.9s" $orig1)
1626 mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed"
1627 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1628 touch $DIR/$tdir/$tfile || error "touch $DIR/$tdir/$tfile failed"
1629 drop_request "setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile" ||
1630 error "drop_request for setfattr failed"
1631 new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile)
1632 [[ "$new" = "$orig" ]] ||
1633 error "xattr set ($sm_msg...) differs from xattr get ($new)"
1634 drop_reint_reply "setfattr -n trusted.foo1 \
1635 -v $orig1 $DIR/$tdir/$tfile" ||
1636 error "drop_reint_reply for setfattr failed"
1637 new=$(get_xattr_value trusted.foo1 $MOUNT2/$tdir/$tfile)
1638 [[ "$new" = "$orig1" ]] ||
1639 error "second xattr set ($sm_msg1...) differs xattr get ($new)"
1640 rm -f $DIR/$tdir/$tfile
1644 run_test 58c "resend/reconstruct setxattr op"
1646 # log_commit_thread vs filter_destroy race used to lead to import use after free
1649 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1651 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1652 createmany -o $DIR/$tdir/$tfile-%d 200 ||
1653 error "createmany create files failed"
1655 unlinkmany $DIR/$tdir/$tfile-%d 200 ||
1656 error "unlinkmany $DIR/$tdir/$tfile failed"
1657 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
1658 do_facet ost1 "lctl set_param fail_loc=0x507"
1661 do_facet ost1 "lctl set_param fail_loc=0x0"
1665 run_test 59 "test log_commit_thread vs filter_destroy race"
1667 # race between add unlink llog vs cat log init in post_recovery (only for b1_6)
1668 # bug 12086: there should be no oops and no "No ctxt" error for this test
1670 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1671 createmany -o $DIR/$tdir/$tfile-%d 200 ||
1672 error "createmany create files failed"
1673 replay_barrier $SINGLEMDS
1674 unlinkmany $DIR/$tdir/$tfile-%d 0 100
1676 unlinkmany $DIR/$tdir/$tfile-%d 100 100
1677 local no_ctxt=$(dmesg | grep "No ctxt")
1678 [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
1680 run_test 60 "test llog post recovery init vs llog unlink"
1682 #test race llog recovery thread vs llog cleanup
1683 test_61a() { # was test_61
1684 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1686 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1687 createmany -o $DIR/$tdir/$tfile-%d 800 ||
1688 error "createmany create files failed"
1690 unlinkmany $DIR/$tdir/$tfile-%d 800
1691 # OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
1692 set_nodes_failloc "$(osts_nodes)" 0x80000221
1697 set_nodes_failloc "$(osts_nodes)" 0x0
1699 $CHECKSTAT -t file $DIR/$tdir/$tfile-* &&
1700 error "$CHECKSTAT $DIR/$tdir/$tfile attribute check should fail"
1703 run_test 61a "test race llog recovery vs llog cleanup"
1705 #test race mds llog sync vs llog cleanup
1707 # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
1708 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013a"
1709 facet_failover $SINGLEMDS
1712 do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 ||
1715 run_test 61b "test race mds llog sync vs llog cleanup"
1717 #test race cancel cookie cb vs llog cleanup
1719 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1721 # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
1722 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
1723 set_nodes_failloc "$(osts_nodes)" 0x80000222
1727 set_nodes_failloc "$(osts_nodes)" 0x0
1729 run_test 61c "test race cancel cookie cb vs llog cleanup"
1731 test_61d() { # bug 16002 # bug 17466 # bug 22137
1732 # OBD_FAIL_OBD_LLOG_SETUP 0x605
1734 do_facet mgs "lctl set_param fail_loc=0x80000605"
1735 start mgs $(mgsdevname) $MGS_MOUNT_OPTS &&
1736 error "mgs start should have failed"
1737 do_facet mgs "lctl set_param fail_loc=0"
1738 start mgs $(mgsdevname) $MGS_MOUNT_OPTS || error "cannot restart mgs"
1740 run_test 61d "error in llog_setup should cleanup the llog context correctly"
1742 test_62() { # Bug 15756 - don't mis-drop resent replay
1743 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1744 replay_barrier $SINGLEMDS
1745 createmany -o $DIR/$tdir/$tfile- 25 ||
1746 error "createmany create files failed"
1747 #define OBD_FAIL_TGT_REPLAY_DROP 0x707
1748 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
1750 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1751 unlinkmany $DIR/$tdir/$tfile- 25 ||
1752 error "unlinkmany $DIR/$tdir/$tfile failed"
1755 run_test 62 "don't mis-drop resent replay"
1757 #Adaptive Timeouts (bug 3055)
1765 echo "Cleaning up AT ..."
1766 if [ -n "$ATOLDBASE" ]; then
1767 local at_history=$($LCTL get_param -n at_history)
1768 do_facet $SINGLEMDS "lctl set_param at_history=$at_history" || true
1769 do_facet ost1 "lctl set_param at_history=$at_history" || true
1772 if [ $AT_MAX_SET -ne 0 ]; then
1773 for facet in mds client ost; do
1774 var=AT_MAX_SAVE_${facet}
1775 echo restore AT on $facet to saved value ${!var}
1776 at_max_set ${!var} $facet
1777 at_new=$(at_max_get $facet)
1778 echo Restored AT value on $facet $at_new
1779 [ $at_new -eq ${!var} ] ||
1780 error "AT value not restored SAVED ${!var} NEW $at_new"
1787 local at_max_new=600
1789 # Save at_max original values
1791 if [ $AT_MAX_SET -eq 0 ]; then
1792 # Assume that all OSTs have the same at_max
1793 for facet in mds client ost; do
1794 eval AT_MAX_SAVE_${facet}=$(at_max_get $facet)
1798 for facet in mds client ost; do
1799 at_max=$(at_max_get $facet)
1800 if [ $at_max -ne $at_max_new ]; then
1801 echo "AT value on $facet is $at_max, set it by force temporarily to $at_max_new"
1802 at_max_set $at_max_new $facet
1807 if [ -z "$ATOLDBASE" ]; then
1808 ATOLDBASE=$(do_facet $SINGLEMDS "lctl get_param -n at_history")
1809 # speed up the timebase so we can check decreasing AT
1810 do_facet $SINGLEMDS "lctl set_param at_history=8" || true
1811 do_facet ost1 "lctl set_param at_history=8" || true
1813 # sleep for a while to cool down, should be > 8s and also allow
1814 # at least one ping to be sent. simply use TIMEOUT to be safe.
1819 test_65a() #bug 3055
1821 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1823 at_start || return 0
1824 $LCTL dk > /dev/null
1826 $LCTL set_param debug="other"
1827 # Slow down a request to the current service time, this is critical
1828 # because previous tests may have caused this value to increase.
1829 REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts |
1830 awk '/portal 12/ {print $5}'`
1831 REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
1833 do_facet $SINGLEMDS lctl set_param fail_val=$((${REQ_DELAY} * 1000))
1834 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1835 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x8000050a
1836 createmany -o $DIR/$tfile 10 > /dev/null
1837 unlinkmany $DIR/$tfile 10 > /dev/null
1838 # check for log message
1839 $LCTL dk | grep "Early reply #" || error "No early reply"
1841 # client should show REQ_DELAY estimates
1842 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal
1844 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal
1846 run_test 65a "AT: verify early replies"
1848 test_65b() #bug 3055
1850 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1852 at_start || return 0
1855 $LCTL set_param debug="other trace"
1856 $LCTL dk > /dev/null
1857 # Slow down a request to the current service time, this is critical
1858 # because previous tests may have caused this value to increase.
1859 $LFS setstripe --stripe-index=0 --stripe-count=1 $DIR/$tfile ||
1860 error "$LFS setstripe failed for $DIR/$tfile"
1862 multiop $DIR/$tfile Ow1yc
1863 REQ_DELAY=`lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts |
1864 awk '/portal 6/ {print $5}'`
1865 REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
1867 do_facet ost1 lctl set_param fail_val=${REQ_DELAY}
1868 #define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
1869 do_facet ost1 $LCTL set_param fail_loc=0x224
1872 $LFS setstripe --stripe-index=0 --stripe-count=1 $DIR/$tfile ||
1873 error "$LFS setstripe failed"
1874 # force some real bulk transfer
1875 multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
1877 do_facet ost1 $LCTL set_param fail_loc=0
1878 # check for log message
1879 $LCTL dk | grep "Early reply #" || error "No early reply"
1881 # client should show REQ_DELAY estimates
1882 lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts | grep portal
1884 run_test 65b "AT: verify early replies on packed reply / bulk"
1886 test_66a() #bug 3055
1888 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1890 at_start || return 0
1891 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1892 # adjust 5s at a time so no early reply is sent (within deadline)
1893 do_facet $SINGLEMDS "$LCTL set_param fail_val=5000"
1894 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1895 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000050a"
1896 createmany -o $DIR/$tfile 20 > /dev/null
1897 unlinkmany $DIR/$tfile 20 > /dev/null
1898 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1899 do_facet $SINGLEMDS "$LCTL set_param fail_val=10000"
1900 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000050a"
1901 createmany -o $DIR/$tfile 20 > /dev/null
1902 unlinkmany $DIR/$tfile 20 > /dev/null
1903 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1904 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0"
1906 createmany -o $DIR/$tfile 20 > /dev/null
1907 unlinkmany $DIR/$tfile 20 > /dev/null
1908 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1909 CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $5}')
1910 WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $7}')
1911 echo "Current MDT timeout $CUR, worst $WORST"
1912 [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
1914 run_test 66a "AT: verify MDT service time adjusts with no early replies"
1916 test_66b() #bug 3055
1918 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1920 at_start || return 0
1921 ORIG=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
1922 awk '/network/ {print $4}')
1923 $LCTL set_param fail_val=$(($ORIG + 5))
1924 #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
1925 $LCTL set_param fail_loc=0x50c
1926 touch $DIR/$tfile > /dev/null 2>&1
1927 $LCTL set_param fail_loc=0
1928 CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
1929 awk '/network/ {print $4}')
1930 WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
1931 awk '/network/ {print $6}')
1932 echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
1933 [ $WORST -gt $ORIG ] ||
1934 error "Worst $WORST should be worse than orig $ORIG"
1936 run_test 66b "AT: verify net latency adjusts"
1938 test_67a() #bug 3055
1940 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1942 at_start || return 0
1943 CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1944 # sleeping threads may drive values above this
1945 do_facet ost1 "$LCTL set_param fail_val=400"
1946 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1947 do_facet ost1 "$LCTL set_param fail_loc=0x50a"
1948 createmany -o $DIR/$tfile 20 > /dev/null
1949 unlinkmany $DIR/$tfile 20 > /dev/null
1950 do_facet ost1 "$LCTL set_param fail_loc=0"
1951 CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1952 ATTEMPTS=$(($CONN2 - $CONN1))
1953 echo "$ATTEMPTS osc reconnect attempts on gradual slow"
1954 [ $ATTEMPTS -gt 0 ] &&
1955 error_ignore bz13721 "AT should have prevented reconnect"
1958 run_test 67a "AT: verify slow request processing doesn't induce reconnects"
1960 test_67b() #bug 3055
1962 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1964 at_start || return 0
1965 CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1967 # exhaust precreations on ost1
1968 local OST=$(ostname_from_index 0)
1969 local mdtosc=$(get_mdtosc_proc_path mds $OST)
1970 local last_id=$(do_facet $SINGLEMDS lctl get_param -n \
1971 osc.$mdtosc.prealloc_last_id)
1972 local next_id=$(do_facet $SINGLEMDS lctl get_param -n \
1973 osc.$mdtosc.prealloc_next_id)
1975 mkdir -p $DIR/$tdir/${OST} || error "mkdir $DIR/$tdir/${OST} failed"
1976 $LFS setstripe -i 0 -c 1 $DIR/$tdir/${OST} ||
1977 error "$LFS setstripe failed"
1978 echo "Creating to objid $last_id on ost $OST..."
1979 #define OBD_FAIL_OST_PAUSE_CREATE 0x223
1980 do_facet ost1 "$LCTL set_param fail_val=20000"
1981 do_facet ost1 "$LCTL set_param fail_loc=0x80000223"
1982 createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2))
1985 do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
1987 CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1988 ATTEMPTS=$(($CONN2 - $CONN1))
1989 echo "$ATTEMPTS osc reconnect attempts on instant slow"
1990 # do it again; should not timeout
1991 do_facet ost1 "$LCTL set_param fail_loc=0x80000223"
1992 cp /etc/profile $DIR/$tfile || error "cp failed"
1993 do_facet ost1 "$LCTL set_param fail_loc=0"
1995 do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
1996 CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1997 ATTEMPTS=$(($CONN3 - $CONN2))
1998 echo "$ATTEMPTS osc reconnect attempts on 2nd slow"
1999 [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
2002 run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
2004 test_68 () #bug 13813
2006 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
2008 at_start || return 0
2009 local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
2010 [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
2011 local ldlm_enqueue_min_r=$(do_facet ost1 "find /sys -name ldlm_enqueue_min")
2012 [ -z "$ldlm_enqueue_min_r" ] && skip "missing /sys/.../ldlm_enqueue_min in the ost1" && return 0
2013 local ENQ_MIN=$(cat $ldlm_enqueue_min)
2014 local ENQ_MIN_R=$(do_facet ost1 "cat $ldlm_enqueue_min_r")
2015 echo $TIMEOUT >> $ldlm_enqueue_min
2016 do_facet ost1 "echo $TIMEOUT >> $ldlm_enqueue_min_r"
2018 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2019 $LFS setstripe --stripe-index=0 -c 1 $DIR/$tdir ||
2020 error "$LFS setstripe failed for $DIR/$tdir"
2021 #define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
2022 $LCTL set_param fail_val=$(($TIMEOUT - 1))
2023 $LCTL set_param fail_loc=0x80000312
2024 cp /etc/profile $DIR/$tdir/${tfile}_1 || error "1st cp failed $?"
2025 $LCTL set_param fail_val=$((TIMEOUT * 5 / 4))
2026 $LCTL set_param fail_loc=0x80000312
2027 cp /etc/profile $DIR/$tdir/${tfile}_2 || error "2nd cp failed $?"
2028 $LCTL set_param fail_loc=0
2030 echo $ENQ_MIN >> $ldlm_enqueue_min
2031 do_facet ost1 "echo $ENQ_MIN_R >> $ldlm_enqueue_min_r"
2035 run_test 68 "AT: verify slowing locks"
2038 # end of AT tests (all AT tests are above this line)
2040 # start multi-client tests
2042 [ -z "$CLIENTS" ] &&
2043 { skip "Need two or more clients." && return; }
2044 [ $CLIENTCOUNT -lt 2 ] &&
2045 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
2047 echo "mount clients $CLIENTS ..."
2048 zconf_mount_clients $CLIENTS $MOUNT
2050 local clients=${CLIENTS//,/ }
2051 echo "Write/read files on $DIR ; clients $CLIENTS ... "
2052 for CLIENT in $clients; do
2053 do_node $CLIENT dd bs=1M count=10 if=/dev/zero \
2054 of=$DIR/${tfile}_${CLIENT} 2>/dev/null ||
2055 error "dd failed on $CLIENT"
2058 local prev_client=$(echo $clients | sed 's/^.* \(.\+\)$/\1/')
2059 for C in ${CLIENTS//,/ }; do
2060 do_node $prev_client dd if=$DIR/${tfile}_${C} \
2061 of=/dev/null 2>/dev/null ||
2062 error "dd if=$DIR/${tfile}_${C} failed on $prev_client"
2068 run_test 70a "check multi client t-f"
2070 check_for_process () {
2075 killall_process $clients "$prog" -0
2079 local clients=${CLIENTS:-$HOSTNAME}
2081 zconf_mount_clients $clients $MOUNT
2084 [ "$SLOW" = "no" ] && duration=120
2085 # set duration to 900 because it takes some time to boot node
2086 [ "$FAILURE_MODE" = HARD ] && duration=900
2089 local start_ts=$(date +%s)
2090 local cmd="rundbench 1 -t $duration"
2092 if [ $MDSCOUNT -ge 2 ]; then
2093 test_mkdir -p -c$MDSCOUNT $DIR/$tdir
2094 $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir
2096 do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
2097 PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
2098 DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \
2099 MOUNT=$MOUNT DIR=$DIR/$tdir/\\\$(hostname) LCTL=$LCTL $cmd" &
2102 #LU-1897 wait for all dbench copies to start
2103 while ! check_for_process $clients dbench; do
2104 elapsed=$(($(date +%s) - start_ts))
2105 if [ $elapsed -gt $duration ]; then
2106 killall_process $clients dbench
2107 error "dbench failed to start on $clients!"
2112 log "Started rundbench load pid=$pid ..."
2114 elapsed=$(($(date +%s) - start_ts))
2115 local num_failovers=0
2117 while [ $elapsed -lt $duration ]; do
2118 if ! check_for_process $clients dbench; then
2119 error_noexit "dbench stopped on some of $clients!"
2120 killall_process $clients dbench
2124 replay_barrier mds$fail_index
2125 sleep 1 # give clients a time to do operations
2126 # Increment the number of failovers
2127 num_failovers=$((num_failovers+1))
2128 log "$TESTNAME fail mds$fail_index $num_failovers times"
2130 elapsed=$(($(date +%s) - start_ts))
2131 if [ $fail_index -ge $MDSCOUNT ]; then
2134 fail_index=$((fail_index+1))
2138 wait $pid || error "rundbench load on $clients failed!"
2140 run_test 70b "dbench ${MDSCOUNT}mdts recovery; $CLIENTCOUNT clients"
2141 # end multi-client tests
2146 local monitor_pid=$3
2148 local start_ts=$(date +%s)
2149 local num_failovers=0
2152 elapsed=$(($(date +%s) - start_ts))
2153 while [ $elapsed -lt $duration ]; do
2154 fail_index=$((RANDOM%max_index+1))
2155 kill -0 $monitor_pid ||
2156 error "$monitor_pid stopped"
2158 replay_barrier mds$fail_index
2160 # Increment the number of failovers
2161 num_failovers=$((num_failovers+1))
2162 log "$TESTNAME fail mds$fail_index $num_failovers times"
2164 elapsed=$(($(date +%s) - start_ts))
2170 rm -f $DIR/replay-single.70c.lck
2175 local clients=${CLIENTS:-$HOSTNAME}
2178 zconf_mount_clients $clients $MOUNT
2181 [ "$SLOW" = "no" ] && duration=180
2182 # set duration to 600 because it takes some time to boot node
2183 [ "$FAILURE_MODE" = HARD ] && duration=600
2186 local start_ts=$(date +%s)
2188 trap cleanup_70c EXIT
2190 while [ ! -e $DIR/replay-single.70c.lck ]; do
2191 test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break
2192 if [ $MDSCOUNT -ge 2 ]; then
2193 $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir ||
2194 error "set default dirstripe failed"
2196 cd $DIR/$tdir || break
2197 tar cf - /etc | tar xf - || error "tar failed in loop"
2201 echo "Started tar $tar_70c_pid"
2203 random_fail_mdt $MDSCOUNT $duration $tar_70c_pid
2204 kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped"
2206 touch $DIR/replay-single.70c.lck
2207 wait $tar_70c_pid || error "$?: tar failed"
2212 run_test 70c "tar ${MDSCOUNT}mdts recovery"
2216 kill -9 $mkdir_70d_pid
2220 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2221 local clients=${CLIENTS:-$HOSTNAME}
2224 zconf_mount_clients $clients $MOUNT
2227 [ "$SLOW" = "no" ] && duration=180
2228 # set duration to 900 because it takes some time to boot node
2229 [ "$FAILURE_MODE" = HARD ] && duration=900
2234 local start_ts=$(date +%s)
2236 trap cleanup_70d EXIT
2239 $LFS mkdir -i0 -c2 $DIR/$tdir/test || {
2243 $LFS mkdir -i1 -c2 $DIR/$tdir/test1 || {
2248 touch $DIR/$tdir/test/a || {
2252 mkdir $DIR/$tdir/test/b || {
2256 rm -rf $DIR/$tdir/test || {
2262 touch $DIR/$tdir/test1/a || {
2266 mkdir $DIR/$tdir/test1/b || {
2271 rm -rf $DIR/$tdir/test1 || {
2273 ls -lR $DIR/$tdir/test1
2279 echo "Started $mkdir_70d_pid"
2281 random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid
2282 kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped"
2287 run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery"
2290 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2291 local clients=${CLIENTS:-$HOSTNAME}
2294 lctl set_param debug=+ha
2295 zconf_mount_clients $clients $MOUNT
2298 [ "$SLOW" = "no" ] && duration=180
2299 # set duration to 900 because it takes some time to boot node
2300 [ "$FAILURE_MODE" = HARD ] && duration=900
2303 $LFS mkdir -i0 $DIR/$tdir/test_0
2304 $LFS mkdir -i0 $DIR/$tdir/test_1
2305 touch $DIR/$tdir/test_0/a
2306 touch $DIR/$tdir/test_1/b
2309 mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > /dev/null || {
2314 checkstat $DIR/$tdir/test_0/a && {
2315 echo "a still exists"
2319 checkstat $DIR/$tdir/test_1/b || {
2320 echo "b still exists"
2324 touch $DIR/$tdir/test_0/a || {
2325 echo "touch a fails"
2329 mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > /dev/null || {
2336 stack_trap "kill -9 $rename_70e_pid" EXIT
2337 echo "Started PID=$rename_70e_pid"
2339 random_fail_mdt 2 $duration $rename_70e_pid
2340 kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped"
2342 run_test 70e "rename cross-MDT with random fails"
2344 test_70f_write_and_read(){
2349 echo "Write/read files in: '$DIR/$tdir', clients: '$CLIENTS' ..."
2350 for client in ${CLIENTS//,/ }; do
2351 [ -f $stopflag ] || return
2353 local tgtfile=$DIR/$tdir/$tfile.$client
2354 do_node $client dd $DD_OPTS bs=1M count=10 if=$srcfile \
2355 of=$tgtfile 2>/dev/null ||
2356 error "dd $DD_OPTS bs=1M count=10 if=$srcfile " \
2357 "of=$tgtfile failed on $client, rc=$?"
2360 local prev_client=$(echo ${CLIENTS//,/ } | awk '{ print $NF }')
2363 for client in ${CLIENTS//,/ }; do
2364 [ -f $stopflag ] || return
2366 # flush client cache in case test is running on only one client
2367 # do_node $client cancel_lru_locks osc
2368 do_node $client $LCTL set_param ldlm.namespaces.*.lru_size=clear
2370 tgtfile=$DIR/$tdir/$tfile.$client
2371 local md5=$(do_node $prev_client "md5sum $tgtfile")
2372 [ ${checksum[$index]// */} = ${md5// */} ] ||
2373 error "$tgtfile: checksum doesn't match on $prev_client"
2374 index=$((index + 1))
2384 mkdir -p $DIR/$tdir || error "cannot create $DIR/$tdir directory"
2385 $LFS setstripe -c -1 $DIR/$tdir ||
2386 error "cannot $LFS setstripe $DIR/$tdir"
2389 while [ -f $stopflag ]; do
2390 test_70f_write_and_read $srcfile $stopflag
2391 # alternate between direct IO and buffered IO on each loop iteration
2392 [ -n "$DD_OPTS" ] && DD_OPTS="" || DD_OPTS="oflag=direct"
2396 test_70f_cleanup() {
2398 rm -f $TMP/$tfile.stop
2399 do_nodes $CLIENTS rm -f $TMP/$tfile
2400 rm -f $DIR/$tdir/$tfile.*
2404 # [ x$ost1failover_HOST = x$ost_HOST ] &&
2405 # { skip "Failover host not defined" && return; }
2406 # [ -z "$CLIENTS" ] &&
2407 # { skip "CLIENTS are not specified." && return; }
2408 # [ $CLIENTCOUNT -lt 2 ] &&
2409 # { skip "Need 2 or more clients, have $CLIENTCOUNT" && return; }
2411 [[ $(lustre_version_code ost1) -lt $(version_code 2.9.53) ]] &&
2412 skip "Need server version at least 2.9.53" && return
2414 echo "mount clients $CLIENTS ..."
2415 zconf_mount_clients $CLIENTS $MOUNT
2417 local srcfile=$TMP/$tfile
2421 trap test_70f_cleanup EXIT
2422 # create a different source file local to each client node so we can
2423 # detect if the file wasn't written out properly after failover
2424 do_nodes $CLIENTS dd bs=1M count=10 if=/dev/urandom of=$srcfile \
2425 2>/dev/null || error "can't create $srcfile on $CLIENTS"
2426 for client in ${CLIENTS//,/ }; do
2427 checksum[$index]=$(do_node $client "md5sum $srcfile")
2428 index=$((index + 1))
2432 [ "$SLOW" = "no" ] && duration=60
2433 # set duration to 900 because it takes some time to boot node
2434 [ "$FAILURE_MODE" = HARD ] && duration=900
2436 local stopflag=$TMP/$tfile.stop
2437 test_70f_loop $srcfile $stopflag &
2441 local num_failovers=0
2442 local start_ts=$SECONDS
2443 while [ $elapsed -lt $duration ]; do
2447 num_failovers=$((num_failovers + 1))
2448 log "$TESTNAME failing OST $num_failovers times"
2451 elapsed=$((SECONDS - start_ts))
2458 run_test 70f "OSS O_DIRECT recovery with $CLIENTCOUNT clients"
2462 kill -9 $mkdir_71a_pid
2465 random_double_fail_mdt() {
2468 local monitor_pid=$3
2470 local start_ts=$(date +%s)
2471 local num_failovers=0
2475 elapsed=$(($(date +%s) - start_ts))
2476 while [ $elapsed -lt $duration ]; do
2477 fail_index=$((RANDOM%max_index + 1))
2478 if [ $fail_index -eq $max_index ]; then
2481 second_index=$((fail_index + 1))
2483 kill -0 $monitor_pid ||
2484 error "$monitor_pid stopped"
2486 replay_barrier mds$fail_index
2487 replay_barrier mds$second_index
2489 # Increment the number of failovers
2490 num_failovers=$((num_failovers+1))
2491 log "fail mds$fail_index mds$second_index $num_failovers times"
2492 fail mds${fail_index},mds${second_index}
2493 elapsed=$(($(date +%s) - start_ts))
2498 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2499 local clients=${CLIENTS:-$HOSTNAME}
2502 zconf_mount_clients $clients $MOUNT
2505 [ "$SLOW" = "no" ] && duration=180
2506 # set duration to 900 because it takes some time to boot node
2507 [ "$FAILURE_MODE" = HARD ] && duration=900
2512 local start_ts=$(date +%s)
2514 trap cleanup_71a EXIT
2517 $LFS mkdir -i0 -c2 $DIR/$tdir/test
2518 rmdir $DIR/$tdir/test
2522 echo "Started $mkdir_71a_pid"
2524 random_double_fail_mdt 2 $duration $mkdir_71a_pid
2525 kill -0 $mkdir_71a_pid || error "mkdir/rmdir $mkdir_71a_pid stopped"
2530 run_test 71a "mkdir/rmdir striped dir with 2 mdts recovery"
2533 multiop_bg_pause $DIR/$tfile O_tSc ||
2534 error "multiop_bg_pause $DIR/$tfile failed"
2538 replay_barrier $SINGLEMDS
2539 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
2540 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000302"
2543 wait $pid || error "multiop pid failed"
2544 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
2547 run_test 73a "open(O_CREAT), unlink, replay, reconnect before open replay, close"
2550 multiop_bg_pause $DIR/$tfile O_tSc ||
2551 error "multiop_bg_pause $DIR/$tfile failed"
2555 replay_barrier $SINGLEMDS
2556 #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
2557 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000157"
2560 wait $pid || error "multiop pid failed"
2561 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
2564 run_test 73b "open(O_CREAT), unlink, replay, reconnect at open_replay reply, close"
2568 local clients=${CLIENTS:-$HOSTNAME}
2570 zconf_umount_clients $clients $MOUNT
2572 facet_failover $SINGLEMDS
2573 zconf_mount_clients $clients $MOUNT
2575 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
2576 rm $DIR/$tfile || error "rm $DIR/$tfile failed"
2577 clients_up || error "client evicted: $?"
2580 run_test 74 "Ensure applications don't fail waiting for OST recovery"
2582 remote_dir_check_80() {
2587 diridx=$($LFS getstripe -m $remote_dir) ||
2588 error "$LFS getstripe -m $remote_dir failed"
2589 [ $diridx -eq $mdtidx ] || error "$diridx != $mdtidx"
2591 createmany -o $remote_dir/f-%d 20 || error "creation failed"
2592 fileidx=$($LFS getstripe -m $remote_dir/f-1) ||
2593 error "$LFS getstripe -m $remote_dir/f-1 failed"
2594 [ $fileidx -eq $mdtidx ] || error "$fileidx != $mdtidx"
2600 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2601 ([ $FAILURE_MODE == "HARD" ] &&
2602 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2603 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2607 local remote_dir=$DIR/$tdir/remote_dir
2609 mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2610 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2611 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2612 $LFS mkdir -i $MDTIDX $remote_dir &
2618 wait $CLIENT_PID || error "remote creation failed"
2620 remote_dir_check_80 || error "remote dir check failed"
2621 rm -rf $DIR/$tdir || error "rmdir failed"
2625 run_test 80a "DNE: create remote dir, drop update rep from MDT0, fail MDT0"
2628 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2629 ([ $FAILURE_MODE == "HARD" ] &&
2630 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2631 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2635 local remote_dir=$DIR/$tdir/remote_dir
2637 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2638 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2639 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2640 $LFS mkdir -i $MDTIDX $remote_dir &
2645 fail mds$((MDTIDX + 1))
2647 wait $CLIENT_PID || error "remote creation failed"
2649 remote_dir_check_80 || error "remote dir check failed"
2650 rm -rf $DIR/$tdir || error "rmdir failed"
2654 run_test 80b "DNE: create remote dir, drop update rep from MDT0, fail MDT1"
2657 [[ $mds1_FSTYPE = "zfs" ]] &&
2658 [[ $MDS1_VERSION -lt $(version_code 2.12.51) ]] &&
2659 skip "requires LU-10143 fix on MDS"
2660 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2661 ([ $FAILURE_MODE == "HARD" ] &&
2662 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2663 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2667 local remote_dir=$DIR/$tdir/remote_dir
2669 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2670 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2671 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2672 $LFS mkdir -i $MDTIDX $remote_dir &
2678 fail mds$((MDTIDX + 1))
2680 wait $CLIENT_PID || error "remote creation failed"
2682 remote_dir_check_80 || error "remote dir check failed"
2683 rm -rf $DIR/$tdir || error "rmdir failed"
2687 run_test 80c "DNE: create remote dir, drop update rep from MDT1, fail MDT[0,1]"
2690 [[ $mds1_FSTYPE = "zfs" ]] &&
2691 [[ $MDS1_VERSION -lt $(version_code 2.12.51) ]] &&
2692 skip "requires LU-10143 fix on MDS"
2693 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2695 local remote_dir=$DIR/$tdir/remote_dir
2697 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2698 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2699 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2700 $LFS mkdir -i $MDTIDX $remote_dir &
2703 # sleep 3 seconds to make sure the MDTs are failed only after
2704 # lfs mkdir -i has finished on all MDTs.
2709 fail mds${MDTIDX},mds$((MDTIDX + 1))
2711 wait $CLIENT_PID || error "remote creation failed"
2713 remote_dir_check_80 || error "remote dir check failed"
2714 rm -rf $DIR/$tdir || error "rmdir failed"
2718 run_test 80d "DNE: create remote dir, drop update rep from MDT1, fail 2 MDTs"
2721 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2722 ([ $FAILURE_MODE == "HARD" ] &&
2723 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2724 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2728 local remote_dir=$DIR/$tdir/remote_dir
2730 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2731 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2732 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2733 $LFS mkdir -i $MDTIDX $remote_dir &
2736 # sleep 3 seconds to make sure the MDTs are failed only after
2737 # lfs mkdir -i has finished on all MDTs.
2743 wait $CLIENT_PID || error "remote creation failed"
2745 remote_dir_check_80 || error "remote dir check failed"
2746 rm -rf $DIR/$tdir || error "rmdir failed"
2750 run_test 80e "DNE: create remote dir, drop MDT1 rep, fail MDT0"
2753 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2754 ([ $FAILURE_MODE == "HARD" ] &&
2755 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2756 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2759 local remote_dir=$DIR/$tdir/remote_dir
2761 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2762 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2763 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2764 $LFS mkdir -i $MDTIDX $remote_dir &
2768 fail mds$((MDTIDX + 1))
2770 wait $CLIENT_PID || error "remote creation failed"
2772 remote_dir_check_80 || error "remote dir check failed"
2773 rm -rf $DIR/$tdir || error "rmdir failed"
2777 run_test 80f "DNE: create remote dir, drop MDT1 rep, fail MDT1"
2780 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2781 ([ $FAILURE_MODE == "HARD" ] &&
2782 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2783 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2787 local remote_dir=$DIR/$tdir/remote_dir
2789 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2790 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2791 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2792 $LFS mkdir -i $MDTIDX $remote_dir &
2795 # sleep 3 seconds to make sure the MDTs are failed only after
2796 # lfs mkdir -i has finished on all MDTs.
2802 fail mds$((MDTIDX + 1))
2804 wait $CLIENT_PID || error "remote creation failed"
2806 remote_dir_check_80 || error "remote dir check failed"
2807 rm -rf $DIR/$tdir || error "rmdir failed"
2811 run_test 80g "DNE: create remote dir, drop MDT1 rep, fail MDT0, then MDT1"
2814 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2816 local remote_dir=$DIR/$tdir/remote_dir
2818 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2819 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2820 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2821 $LFS mkdir -i $MDTIDX $remote_dir &
2824 # sleep 3 seconds to make sure the MDTs are failed only after
2825 # lfs mkdir -i has finished on all MDTs.
2830 fail mds${MDTIDX},mds$((MDTIDX + 1))
2832 wait $CLIENT_PID || error "remote dir creation failed"
2834 remote_dir_check_80 || error "remote dir check failed"
2835 rm -rf $DIR/$tdir || error "rmdir failed"
2839 run_test 80h "DNE: create remote dir, drop MDT1 rep, fail 2 MDTs"
2842 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2843 ([ $FAILURE_MODE == "HARD" ] &&
2844 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2845 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2849 local remote_dir=$DIR/$tdir/remote_dir
2851 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2852 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2854 touch $remote_dir || error "touch $remote_dir failed"
2855 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2856 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2861 fail mds$((MDTIDX + 1))
2863 wait $CLIENT_PID || error "rm remote dir failed"
2865 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2867 rm -rf $DIR/$tdir || error "rmdir failed"
2871 run_test 81a "DNE: unlink remote dir, drop MDT0 update rep, fail MDT1"
2874 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2875 ([ $FAILURE_MODE == "HARD" ] &&
2876 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2877 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2880 local remote_dir=$DIR/$tdir/remote_dir
2882 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2883 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2885 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2886 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2893 wait $CLIENT_PID || error "rm remote dir failed"
2895 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2897 rm -rf $DIR/$tdir || error "rmdir failed"
2901 run_test 81b "DNE: unlink remote dir, drop MDT0 update reply, fail MDT0"
2904 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2905 ([ $FAILURE_MODE == "HARD" ] &&
2906 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2907 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2911 local remote_dir=$DIR/$tdir/remote_dir
2913 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2914 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2916 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2917 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2924 fail mds$((MDTIDX + 1))
2926 wait $CLIENT_PID || error "rm remote dir failed"
2928 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2930 rm -rf $DIR/$tdir || error "rmdir failed"
2934 run_test 81c "DNE: unlink remote dir, drop MDT0 update reply, fail MDT0,MDT1"
2937 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2939 local remote_dir=$DIR/$tdir/remote_dir
2941 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2942 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2944 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2945 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2951 fail mds${MDTIDX},mds$((MDTIDX + 1))
2953 wait $CLIENT_PID || error "rm remote dir failed"
2955 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2957 rm -rf $DIR/$tdir || error "rmdir failed"
2961 run_test 81d "DNE: unlink remote dir, drop MDT0 update reply, fail 2 MDTs"
2964 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2965 ([ $FAILURE_MODE == "HARD" ] &&
2966 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2967 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2971 local remote_dir=$DIR/$tdir/remote_dir
2973 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2974 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2976 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2977 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2980 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
2985 wait $CLIENT_PID || error "rm remote dir failed"
2987 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2989 rm -rf $DIR/$tdir || error "rmdir failed"
2993 run_test 81e "DNE: unlink remote dir, drop MDT1 req reply, fail MDT0"
2996 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2997 ([ $FAILURE_MODE == "HARD" ] &&
2998 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2999 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3003 local remote_dir=$DIR/$tdir/remote_dir
3005 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3006 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
3008 # OBD_FAIL_MDS_REINT_NET_REP 0x119
3009 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
3014 fail mds$((MDTIDX + 1))
3016 wait $CLIENT_PID || error "rm remote dir failed"
3018 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
3020 rm -rf $DIR/$tdir || error "rmdir failed"
3024 run_test 81f "DNE: unlink remote dir, drop MDT1 req reply, fail MDT1"
3027 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3028 ([ $FAILURE_MODE == "HARD" ] &&
3029 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3030 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3034 local remote_dir=$DIR/$tdir/remote_dir
3036 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3037 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
3039 # OBD_FAIL_MDS_REINT_NET_REP 0x119
3040 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
3047 fail mds$((MDTIDX + 1))
3049 wait $CLIENT_PID || error "rm remote dir failed"
3051 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
3053 rm -rf $DIR/$tdir || error "rmdir failed"
3057 run_test 81g "DNE: unlink remote dir, drop req reply, fail M0, then M1"
3060 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3062 local remote_dir=$DIR/$tdir/remote_dir
3064 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3065 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
3067 # OBD_FAIL_MDS_REINT_NET_REP 0x119
3068 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
3074 fail mds${MDTIDX},mds$((MDTIDX + 1))
3076 wait $CLIENT_PID || error "rm remote dir failed"
3078 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
3080 rm -rf $DIR/$tdir || error "rmdir failed"
3084 run_test 81h "DNE: unlink remote dir, drop request reply, fail 2 MDTs"
3087 #define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144
3088 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144"
3089 createmany -o $DIR/$tfile- 1 &
3093 client_up || client_up || true # reconnect
3095 run_test 84a "stale open during export disconnect"
3097 test_85a() { #bug 16774
3098 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
3100 for i in $(seq 100); do
3101 echo "tag-$i" > $DIR/$tfile-$i
3102 grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i"
3105 lov_id=$(lctl dl | grep "clilov")
3106 addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}')
3107 count=$(lctl get_param -n \
3108 ldlm.namespaces.*MDT0000*$addr.lock_unused_count)
3109 echo "before recovery: unused locks count = $count"
3113 count2=$(lctl get_param -n \
3114 ldlm.namespaces.*MDT0000*$addr.lock_unused_count)
3115 echo "after recovery: unused locks count = $count2"
3117 if [ $count2 -ge $count ]; then
3118 error "unused locks are not canceled"
3121 run_test 85a "check the cancellation of unused locks during recovery(IBITS)"
3123 test_85b() { #bug 16774
3127 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
3129 if ! combined_mgs_mds ; then
3133 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3135 for i in $(seq 100); do
3136 dd if=/dev/urandom of=$DIR/$tdir/$tfile-$i bs=4096 \
3137 count=32 >/dev/null 2>&1
3140 cancel_lru_locks osc
3142 for i in $(seq 100); do
3143 dd if=$DIR/$tdir/$tfile-$i of=/dev/null bs=4096 \
3144 count=32 >/dev/null 2>&1
3147 lov_id=$(lctl dl | grep "clilov")
3148 addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}')
3149 count=$(lctl get_param -n \
3150 ldlm.namespaces.*OST0000*$addr.lock_unused_count)
3151 echo "before recovery: unused locks count = $count"
3152 [ $count -ne 0 ] || error "unused locks ($count) should be zero"
3156 count2=$(lctl get_param \
3157 -n ldlm.namespaces.*OST0000*$addr.lock_unused_count)
3158 echo "after recovery: unused locks count = $count2"
3160 if ! combined_mgs_mds ; then
3164 if [ $count2 -ge $count ]; then
3165 error "unused locks are not canceled"
3170 run_test 85b "check the cancellation of unused locks during recovery(EXTENT)"
3173 local clients=${CLIENTS:-$HOSTNAME}
3175 zconf_umount_clients $clients $MOUNT
3176 do_facet $SINGLEMDS lctl set_param mdt.${FSNAME}-MDT*.exports.clear=0
3177 remount_facet $SINGLEMDS
3178 zconf_mount_clients $clients $MOUNT
3180 run_test 86 "umount server after clear nid_stats should not hit LBUG"
3183 do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"
3186 $LFS setstripe -i 0 -c 1 $DIR/$tfile
3187 dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 ||
3188 error "dd to $DIR/$tfile failed"
3189 cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
3190 cancel_lru_locks osc
3192 dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
3193 cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
3194 if [ $cksum != $cksum2 ] ; then
3195 error "New checksum $cksum2 does not match original $cksum"
3198 run_test 87a "write replay"
3201 do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"
3204 $LFS setstripe -i 0 -c 1 $DIR/$tfile
3205 dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 ||
3206 error "dd to $DIR/$tfile failed"
3207 sleep 1 # Give it a chance to flush dirty data
3208 echo TESTTEST | dd of=$DIR/$tfile bs=1 count=8 seek=64
3209 cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
3210 cancel_lru_locks osc
3212 dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
3213 cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
3214 if [ $cksum != $cksum2 ] ; then
3215 error "New checksum $cksum2 does not match original $cksum"
3218 run_test 87b "write replay with changed data (checksum resend)"
3220 test_88() { #bug 17485
3221 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3222 mkdir -p $TMP/$tdir || error "mkdir $TMP/$tdir failed"
3224 $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "$LFS setstripe failed"
3227 replay_barrier $SINGLEMDS
3229 # exhaust precreations on ost1
3230 local OST=$(ostname_from_index 0)
3231 local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $OST)
3232 local last_id=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_last_id)
3233 local next_id=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
3234 echo "before test: last_id = $last_id, next_id = $next_id"
3236 echo "Creating to objid $last_id on ost $OST..."
3237 createmany -o $DIR/$tdir/f-%d $next_id $((last_id - next_id + 2)) ||
3238 error "createmany create files to last_id failed"
3240 #create some files to use some uncommitted objids
3241 last_id=$(($last_id + 1))
3242 createmany -o $DIR/$tdir/f-%d $last_id 8 ||
3243 error "createmany create files with uncommitted objids failed"
3245 last_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_last_id)
3246 next_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
3247 echo "before recovery: last_id = $last_id2, next_id = $next_id2"
3249 # if a test uses shutdown_facet && reboot_facet instead of facet_failover(),
3250 # it has to take care of the affected facets itself (bug 20407)
3251 local affected_mds1=$(affected_facets mds1)
3252 local affected_ost1=$(affected_facets ost1)
3254 shutdown_facet $SINGLEMDS
3257 reboot_facet $SINGLEMDS
3258 change_active $affected_mds1
3259 wait_for_facet $affected_mds1
3260 mount_facets $affected_mds1 || error "Restart of mds failed"
3263 change_active $affected_ost1
3264 wait_for_facet $affected_ost1
3265 mount_facets $affected_ost1 || error "Restart of ost1 failed"
3269 last_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_last_id)
3270 next_id2=$(do_facet $SINGLEMDS lctl get_param -n osc.$mdtosc.prealloc_next_id)
3271 echo "after recovery: last_id = $last_id2, next_id = $next_id2"
3273 # create new files, which should use new objids, and ensure the orphan
3274 # cleanup phase for ost1 is completed at the same time
3275 for i in $(seq 8); do
3276 file_id=$(($last_id + 10 + $i))
3277 dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128
3280 # if the objids were not recreated, then "ls" will fail with -ENOENT
3281 ls -l $DIR/$tdir/* || error "can't get the status of precreated files"
3284 # write into previously created files
3285 for i in $(seq 8); do
3286 file_id=$(($last_id + $i))
3287 dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128
3288 cp -f $DIR/$tdir/f-$file_id $TMP/$tdir/
3291 # compare the content
3292 for i in $(seq 8); do
3293 file_id=$(($last_id + $i))
3294 cmp $TMP/$tdir/f-$file_id $DIR/$tdir/f-$file_id ||
3295 error "the content of file is modified!"
3300 run_test 88 "MDS should not assign same objid to different files "
3302 function calc_osc_kbytes_used() {
3303 local kbtotal=$(calc_osc_kbytes kbytestotal)
3304 local kbfree=$(calc_osc_kbytes kbytesfree)
3305 echo $((kbtotal-kbfree))
3309 cancel_lru_locks osc
3310 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3311 rm -f $DIR/$tdir/$tfile
3312 wait_mds_ost_sync || error "initial MDS-OST sync timed out"
3313 wait_delete_completed || error "initial wait delete timed out"
3314 local blocks1=$(calc_osc_kbytes_used)
3315 local write_size=$(fs_log_size)
3317 $LFS setstripe -i 0 -c 1 $DIR/$tdir/$tfile
3318 [ $write_size -lt 1024 ] && write_size=1024
3319 dd if=/dev/zero bs=${write_size}k count=10 of=$DIR/$tdir/$tfile
3322 facet_failover $SINGLEMDS
3323 rm $DIR/$tdir/$tfile
3326 zconf_mount $(hostname) $MOUNT || error "mount fails"
3327 client_up || error "client_up failed"
3329 # wait for the remounted client to connect to ost1
3330 local target=$(get_osc_import_name client ost1)
3331 wait_import_state "FULL" "osc.${target}.ost_server_uuid" \
3332 $(max_recovery_time)
3334 wait_mds_ost_sync || error "MDS-OST sync timed out"
3335 wait_delete_completed || error "wait delete timed out"
3336 local blocks2=$(calc_osc_kbytes_used)
3338 [ $((blocks2 - blocks1)) -le $(fs_log_size) ] ||
3339 error $((blocks2 - blocks1)) blocks leaked
3341 run_test 89 "no disk space leak on late ost connection"
3348 change_active $facet
3349 wait_for_facet $facet
3350 mount_facet $facet || error "Restart of $facet failed"
3354 test_90() { # bug 19494
3355 local dir=$DIR/$tdir
3356 local ostfail=$(get_random_entry $(get_facets OST))
3358 if [[ $FAILURE_MODE = HARD ]]; then
3359 local affected=$(affected_facets $ostfail);
3360 if [[ "$affected" != $ostfail ]]; then
3361 skip not functional with FAILURE_MODE=$FAILURE_MODE, affected: $affected
3365 # ensure all OSTs are active to allow allocations
3368 mkdir $dir || error "mkdir $dir failed"
3370 echo "Create the files"
3372 # file "f${index}" striped over 1 OST
3373 # file "all" striped over all OSTs
3375 $LFS setstripe -c $OSTCOUNT $dir/all ||
3376 error "setstripe failed to create $dir/all"
3378 for ((i = 0; i < $OSTCOUNT; i++)); do
3381 $LFS setstripe -i $i -c 1 $f ||
3382 error "$LFS setstripe failed to create $f"
3384 # confirm setstripe actually created stripe on requested OST
3385 local uuid=$(ostuuid_from_index $i)
3387 for file in f$i all; do
3388 local found=$($LFS find --obd $uuid --name $file $dir)
3390 if [[ $dir/$file != $found ]]; then
3391 $LFS getstripe $dir/$file
3392 error "wrong stripe: $file, uuid: $uuid"
3397 # Before failing an OST, get its obd name and index
3398 local varsvc=${ostfail}_svc
3399 local obd=$(do_facet $ostfail lctl get_param \
3400 -n obdfilter.${!varsvc}.uuid)
3401 local index=$(($(facet_number $ostfail) - 1))
3403 echo "Fail $ostfail $obd, display the list of affected files"
3404 shutdown_facet $ostfail || error "shutdown_facet $ostfail failed"
3406 trap "cleanup_90 $ostfail" EXIT INT
3407 echo "General Query: lfs find $dir"
3408 local list=$($LFS find $dir)
3410 for (( i=0; i<$OSTCOUNT; i++ )); do
3411 list_member "$list" $dir/f$i ||
3412 error_noexit "lfs find $dir: no file f$i"
3414 list_member "$list" $dir/all ||
3415 error_noexit "lfs find $dir: no file all"
3417 # focus on the missing OST:
3418 # we expect to see only two files affected: "f${index}" and "all"
3420 echo "Querying files on shutdown $ostfail: lfs find --obd $obd"
3421 list=$($LFS find --obd $obd $dir)
3423 for file in all f$index; do
3424 list_member "$list" $dir/$file ||
3425 error_noexit "lfs find does not report the affected $obd for $file"
3428 [[ $(echo $list | wc -w) -eq 2 ]] ||
3429 error_noexit "lfs find reports the wrong list of affected files ${#list[@]}"
3431 echo "Check getstripe: $LFS getstripe -r --obd $obd"
3432 list=$($LFS getstripe -r --obd $obd $dir)
3434 for file in all f$index; do
3435 echo "$list" | grep $dir/$file ||
3436 error_noexit "lfs getsripe does not report the affected $obd for $file"
3441 run_test 90 "lfs find identifies the missing striped file segments"
3444 local server_version=$(lustre_version_code $SINGLEMDS)
3445 [[ $server_version -ge $(version_code 2.6.90) ]] ||
3446 [[ $server_version -ge $(version_code 2.5.4) &&
3447 $server_version -lt $(version_code 2.5.50) ]] ||
3448 { skip "Need MDS version 2.5.4+ or 2.6.90+"; return; }
3450 cancel_lru_locks osc
3452 $LFS setstripe -i 0 -c 1 $DIR/$tfile ||
3453 error "$LFS setstripe $DIR/$tfile failed"
3454 dd if=/dev/zero of=$DIR/$tfile bs=1024 count=1 ||
3455 error "dd to $DIR/$tfile failed"
3456 #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715
3457 # We need to emulate a state in which the OST is still waiting for other
3458 # clients that have not completed recovery. The final ping is queued, but
3459 # its reply is only sent once recovery completes. This is done by sleeping
3460 # before the final pings are processed.
3461 do_facet ost1 "$LCTL set_param fail_val=40"
3462 do_facet ost1 "$LCTL set_param fail_loc=0x715"
3465 run_test 93a "replay + reconnect"
3468 local server_version=$(lustre_version_code $SINGLEMDS)
3469 [[ $server_version -ge $(version_code 2.7.90) ]] ||
3470 { skip "Need MDS version 2.7.90+"; return; }
3472 cancel_lru_locks mdc
3474 createmany -o $DIR/$tfile 20 ||
3475 error "createmany -o $DIR/$tfile failed"
3477 #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715
3478 # We need to emulate a state in which the MDT is still waiting for other
3479 # clients that have not completed recovery. The final ping is queued, but
3480 # its reply is only sent once recovery completes. This is done by sleeping
3481 # before the final pings are processed.
3482 do_facet mds1 "$LCTL set_param fail_val=80"
3483 do_facet mds1 "$LCTL set_param fail_loc=0x715"
3486 run_test 93b "replay + reconnect on mds"
# Check the striped directory $DIR/$tdir/striped_dir: print its layout,
# require a stripe count of exactly 2, and create 20 files (f-0 .. f-19
# name pattern "f-%d") underneath it.  Failures are reported via error().
striped_dir_check_100() {
	local striped_dir=$DIR/$tdir/striped_dir
	local stripe_count=$($LFS getdirstripe -c $striped_dir)

	# print the complete layout before validating the count
	$LFS getdirstripe $striped_dir

	# "! [ ... -eq 2 ]" (rather than "-ne") so that a malformed or empty
	# count is still reported as a failure
	if ! [ $stripe_count -eq 2 ]; then
		error "$stripe_count != 2"
	fi

	if ! createmany -o $striped_dir/f-%d 20; then
		error "creation failed under striped dir"
	fi
}
3500 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3501 ([ $FAILURE_MODE == "HARD" ] &&
3502 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3503 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3506 local striped_dir=$DIR/$tdir/striped_dir
3509 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3511 #To make sure MDT1 and MDT0 are connected
3512 #otherwise it may create single stripe dir here
3513 $LFS setdirstripe -i1 $DIR/$tdir/remote_dir
3515 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
3516 do_facet mds$((MDTIDX+1)) lctl set_param fail_loc=0x1701
3517 $LFS setdirstripe -i0 -c2 $striped_dir &
3520 fail mds$((MDTIDX + 1))
3522 wait $CLIENT_PID || error "striped dir creation failed"
3524 striped_dir_check_100 || error "striped dir check failed"
3525 rm -rf $DIR/$tdir || error "rmdir failed"
3527 run_test 100a "DNE: create striped dir, drop update rep from MDT1, fail MDT1"
3530 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3531 ([ $FAILURE_MODE == "HARD" ] &&
3532 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3533 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3536 local striped_dir=$DIR/$tdir/striped_dir
3539 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3541 #To make sure MDT1 and MDT0 are connected
3542 #otherwise it may create single stripe dir here
3543 $LFS setdirstripe -i1 $DIR/$tdir/remote_dir
3545 # OBD_FAIL_MDS_REINT_NET_REP 0x119
3546 do_facet mds$MDTIDX lctl set_param fail_loc=0x119
3547 $LFS mkdir -i0 -c2 $striped_dir &
3552 wait $CLIENT_PID || error "striped dir creation failed"
3554 striped_dir_check_100 || error "striped dir check failed"
3555 rm -rf $DIR/$tdir || error "rmdir failed"
3557 run_test 100b "DNE: create striped dir, fail MDT0"
3559 test_101() { #LU-5648
3560 mkdir -p $DIR/$tdir/d1
3561 mkdir -p $DIR/$tdir/d2
3562 touch $DIR/$tdir/file0
3565 replay_barrier $SINGLEMDS
3566 for i in $(seq $num) ; do
3567 echo test$i > $DIR/$tdir/d1/file$i
3570 fail_abort $SINGLEMDS
3571 for i in $(seq $num) ; do
3572 touch $DIR/$tdir/d2/file$i
3573 test -s $DIR/$tdir/d2/file$i &&
3574 ls -al $DIR/$tdir/d2/file$i && error "file$i's size > 0"
3579 run_test 101 "Shouldn't reassign precreated objs to other files after recovery"
3588 [[ $(lctl get_param mdc.*.import |
3589 grep "connect_flags:.*multi_mod_rpc") ]] ||
3590 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3592 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3593 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3594 facet="mds$((0x$idx + 1))"
3596 # get current value of max_mod_rpcs_in_flight
3597 num=$($LCTL get_param -n \
3598 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3599 # set default value if client does not support multi mod RPCs
3600 [ -z "$num" ] && num=1
3602 echo "creating $num files ..."
3604 for i in $(seq $num); do
3605 touch $DIR/$tdir/file-$i
3608 # drop request on MDT to force resend
3609 #define OBD_FAIL_MDS_REINT_MULTI_NET 0x159
3610 do_facet $facet "$LCTL set_param fail_loc=0x159"
3611 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3612 for i in $(seq $num); do
3613 chmod 0600 $DIR/$tdir/file-$i &
3617 do_facet $facet "$LCTL set_param fail_loc=0"
3618 for pid in $pids; do
3619 wait $pid || error "chmod failed"
3621 echo "done ($(date +%H:%M:%S))"
3623 # check chmod succeed
3624 for i in $(seq $num); do
3625 checkstat -vp 0600 $DIR/$tdir/file-$i
3630 run_test 102a "check resend (request lost) with multiple modify RPCs in flight"
3639 [[ $(lctl get_param mdc.*.import |
3640 grep "connect_flags:.*multi_mod_rpc") ]] ||
3641 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3643 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3644 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3645 facet="mds$((0x$idx + 1))"
3647 # get current value of max_mod_rpcs_in_flight
3648 num=$($LCTL get_param -n \
3649 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3650 # set default value if client does not support multi mod RPCs
3651 [ -z "$num" ] && num=1
3653 echo "creating $num files ..."
3655 for i in $(seq $num); do
3656 touch $DIR/$tdir/file-$i
3659 # drop reply on MDT to force reconstruction
3660 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
3661 do_facet $facet "$LCTL set_param fail_loc=0x15a"
3662 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3663 for i in $(seq $num); do
3664 chmod 0600 $DIR/$tdir/file-$i &
3668 do_facet $facet "$LCTL set_param fail_loc=0"
3669 for pid in $pids; do
3670 wait $pid || error "chmod failed"
3672 echo "done ($(date +%H:%M:%S))"
3674 # check chmod succeed
3675 for i in $(seq $num); do
3676 checkstat -vp 0600 $DIR/$tdir/file-$i
3681 run_test 102b "check resend (reply lost) with multiple modify RPCs in flight"
3690 [[ $(lctl get_param mdc.*.import |
3691 grep "connect_flags:.*multi_mod_rpc") ]] ||
3692 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3694 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3695 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3696 facet="mds$((0x$idx + 1))"
3698 # get current value of max_mod_rpcs_in_flight
3699 num=$($LCTL get_param -n \
3700 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3701 # set default value if client does not support multi mod RPCs
3702 [ -z "$num" ] && num=1
3704 echo "creating $num files ..."
3706 for i in $(seq $num); do
3707 touch $DIR/$tdir/file-$i
3710 replay_barrier $facet
3713 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
3714 do_facet $facet "$LCTL set_param fail_loc=0x15a"
3715 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3716 for i in $(seq $num); do
3717 chmod 0600 $DIR/$tdir/file-$i &
3721 do_facet $facet "$LCTL set_param fail_loc=0"
3726 for pid in $pids; do
3727 wait $pid || error "chmod failed"
3729 echo "done ($(date +%H:%M:%S))"
3731 # check chmod succeed
3732 for i in $(seq $num); do
3733 checkstat -vp 0600 $DIR/$tdir/file-$i
3738 run_test 102c "check replay w/o reconstruction with multiple mod RPCs in flight"
3747 [[ $(lctl get_param mdc.*.import |
3748 grep "connect_flags:.*multi_mod_rpc") ]] ||
3749 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3751 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3752 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3753 facet="mds$((0x$idx + 1))"
3755 # get current value of max_mod_rpcs_in_flight
3756 num=$($LCTL get_param -n \
3757 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3758 # set default value if client does not support multi mod RPCs
3759 [ -z "$num" ] && num=1
3761 echo "creating $num files ..."
3763 for i in $(seq $num); do
3764 touch $DIR/$tdir/file-$i
3768 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
3769 do_facet $facet "$LCTL set_param fail_loc=0x15a"
3770 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3771 for i in $(seq $num); do
3772 chmod 0600 $DIR/$tdir/file-$i &
3777 # write MDT transactions to disk
3778 do_facet $facet "sync; sync; sync"
3780 do_facet $facet "$LCTL set_param fail_loc=0"
3785 for pid in $pids; do
3786 wait $pid || error "chmod failed"
3788 echo "done ($(date +%H:%M:%S))"
3790 # check chmod succeed
3791 for i in $(seq $num); do
3792 checkstat -vp 0600 $DIR/$tdir/file-$i
3797 run_test 102d "check replay & reconstruction with multiple mod RPCs in flight"
3800 remote_mds_nodsh && skip "remote MDS with nodsh" && return
3801 local mds_version=$(lustre_version_code $SINGLEMDS)
3802 [[ $mds_version -gt $(version_code 2.8.54) ]] ||
3803 { skip "Need MDS version 2.8.54+"; return; }
3805 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162
3806 do_facet mds1 $LCTL set_param fail_loc=0x80000162
3809 createmany -o $DIR/$tdir/t- 30 ||
3810 error "create files on remote directory failed"
3812 rm -rf $DIR/$tdir/t-*
3814 #MDS should crash with tr->otr_next_id overflow
3817 run_test 103 "Check otr_next_id overflow"
# Verify that $DIR/$tdir/striped_dir exists as a directory and that its
# stripe count equals $MDSCOUNT.  Each failed check is reported through
# error().
# Globals read: CHECKSTAT, LFS, DIR, tdir, MDSCOUNT
check_striped_dir_110()
{
	$CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
		error "create striped dir failed"
	local stripe_count=$($LFS getdirstripe -c $DIR/$tdir/striped_dir)
	# report the value actually compared against ($MDSCOUNT), not a
	# hard-coded 2, so the message is right when MDSCOUNT > 2
	[ $stripe_count -eq $MDSCOUNT ] ||
		error "$stripe_count != $MDSCOUNT after recovery"
}
3830 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3831 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3832 skip "Need MDS version at least 2.7.56"
3834 ([ $FAILURE_MODE == "HARD" ] &&
3835 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3836 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3841 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3844 check_striped_dir_110 || error "check striped_dir failed"
3845 rm -rf $DIR/$tdir || error "rmdir failed"
3849 run_test 110a "DNE: create striped dir, fail MDT1"
3852 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3853 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3854 skip "Need MDS version at least 2.7.56"
3856 ([ $FAILURE_MODE == "HARD" ] &&
3857 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3858 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3863 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3866 zconf_mount $(hostname) $MOUNT
3867 client_up || return 1
3869 check_striped_dir_110 || error "check striped_dir failed"
3871 rm -rf $DIR/$tdir || error "rmdir failed"
3875 run_test 110b "DNE: create striped dir, fail MDT1 and client"
3878 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3879 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3880 skip "Need MDS version at least 2.7.56"
3882 ([ $FAILURE_MODE == "HARD" ] &&
3883 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3884 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3889 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3892 check_striped_dir_110 || error "check striped_dir failed"
3894 rm -rf $DIR/$tdir || error "rmdir failed"
3898 run_test 110c "DNE: create striped dir, fail MDT2"
3901 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3902 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3903 skip "Need MDS version at least 2.7.56"
3905 ([ $FAILURE_MODE == "HARD" ] &&
3906 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3907 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3912 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3915 zconf_mount $(hostname) $MOUNT
3916 client_up || return 1
3918 check_striped_dir_110 || error "check striped_dir failed"
3920 rm -rf $DIR/$tdir || error "rmdir failed"
3924 run_test 110d "DNE: create striped dir, fail MDT2 and client"
3927 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3928 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3929 skip "Need MDS version at least 2.7.56"
3931 ([ $FAILURE_MODE == "HARD" ] &&
3932 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3933 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3938 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3942 zconf_mount $(hostname) $MOUNT
3943 client_up || return 1
3945 check_striped_dir_110 || error "check striped_dir failed"
3947 rm -rf $DIR/$tdir || error "rmdir failed"
3951 run_test 110e "DNE: create striped dir, uncommit on MDT2, fail client/MDT1/MDT2"
3954 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3955 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3956 skip "Need MDS version at least 2.7.56"
3958 ([ $FAILURE_MODE == "HARD" ] &&
3959 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3960 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3966 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3969 check_striped_dir_110 || error "check striped_dir failed"
3971 rm -rf $DIR/$tdir || error "rmdir failed"
3975 run_test 110f "DNE: create striped dir, fail MDT1/MDT2"
3978 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3979 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
3980 skip "Need MDS version at least 2.7.56"
3982 ([ $FAILURE_MODE == "HARD" ] &&
3983 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3984 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3989 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3993 zconf_mount $(hostname) $MOUNT
3994 client_up || return 1
3996 check_striped_dir_110 || error "check striped_dir failed"
3998 rm -rf $DIR/$tdir || error "rmdir failed"
4002 run_test 110g "DNE: create striped dir, uncommit on MDT1, fail client/MDT1/MDT2"
4005 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4006 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4007 skip "Need MDS version at least 2.7.56"
4009 ([ $FAILURE_MODE == "HARD" ] &&
4010 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4011 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4015 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4017 rm -rf $DIR/$tdir/striped_dir
4020 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4021 error "striped dir still exists"
4024 run_test 111a "DNE: unlink striped dir, fail MDT1"
4027 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4028 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4029 skip "Need MDS version at least 2.7.56"
4031 ([ $FAILURE_MODE == "HARD" ] &&
4032 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4033 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4037 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4039 rm -rf $DIR/$tdir/striped_dir
4042 zconf_mount $(hostname) $MOUNT
4043 client_up || return 1
4045 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4046 error "striped dir still exists"
4049 run_test 111b "DNE: unlink striped dir, fail MDT2"
4052 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4053 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4054 skip "Need MDS version at least 2.7.56"
4056 ([ $FAILURE_MODE == "HARD" ] &&
4057 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4058 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4062 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4064 rm -rf $DIR/$tdir/striped_dir
4068 zconf_mount $(hostname) $MOUNT
4069 client_up || return 1
4070 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4071 error "striped dir still exists"
4074 run_test 111c "DNE: unlink striped dir, uncommit on MDT1, fail client/MDT1/MDT2"
4077 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4078 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4079 skip "Need MDS version at least 2.7.56"
4081 ([ $FAILURE_MODE == "HARD" ] &&
4082 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4083 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4087 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4089 rm -rf $DIR/$tdir/striped_dir
4093 zconf_mount $(hostname) $MOUNT
4094 client_up || return 1
4095 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4096 error "striped dir still exists"
4100 run_test 111d "DNE: unlink striped dir, uncommit on MDT2, fail client/MDT1/MDT2"
4103 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4104 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4105 skip "Need MDS version at least 2.7.56"
4107 ([ $FAILURE_MODE == "HARD" ] &&
4108 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4109 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4113 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4115 rm -rf $DIR/$tdir/striped_dir
4118 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4119 error "striped dir still exists"
4122 run_test 111e "DNE: unlink striped dir, uncommit on MDT2, fail MDT1/MDT2"
4125 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4126 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4127 skip "Need MDS version at least 2.7.56"
4129 ([ $FAILURE_MODE == "HARD" ] &&
4130 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4131 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4135 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4137 rm -rf $DIR/$tdir/striped_dir
4140 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4141 error "striped dir still exists"
4144 run_test 111f "DNE: unlink striped dir, uncommit on MDT1, fail MDT1/MDT2"
4147 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4148 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4149 skip "Need MDS version at least 2.7.56"
4151 ([ $FAILURE_MODE == "HARD" ] &&
4152 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4153 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4157 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4160 rm -rf $DIR/$tdir/striped_dir
4162 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4163 error "striped dir still exists"
4166 run_test 111g "DNE: unlink striped dir, fail MDT1/MDT2"
# Build the tree used by the test_112* cross-MDT rename cases under
# $DIR/$tdir:
#   src_dir/src_child    created with "$LFS mkdir -i 1", holds file "a"
#   tgt_dir              created with "$LFS mkdir -i 2"
#   tgt_dir/tgt_child    created with "$LFS mkdir -i 3"
# A failed $LFS mkdir is reported via error().
test_112_rename_prepare() {
	local src_parent=$DIR/$tdir/src_dir
	local tgt_parent=$DIR/$tdir/tgt_dir

	mkdir -p $src_parent
	if ! $LFS mkdir -i 1 $src_parent/src_child; then
		error "create remote source failed"
	fi

	touch $src_parent/src_child/a

	if ! $LFS mkdir -i 2 $tgt_parent; then
		error "create remote target dir failed"
	fi

	if ! $LFS mkdir -i 3 $tgt_parent/tgt_child; then
		error "create remote target child failed"
	fi
}
4184 $CHECKSTAT -t dir $DIR/$tdir/src_dir/src_child &&
4185 error "src_child still exists after rename"
4187 $CHECKSTAT -t file $DIR/$tdir/tgt_dir/tgt_child/a ||
4188 error "missing file(a) after rename"
4192 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4193 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4194 skip "Need MDS version at least 2.7.56"
4196 ([ $FAILURE_MODE == "HARD" ] &&
4197 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4198 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4201 test_112_rename_prepare
4204 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4205 error "rename dir cross MDT failed!"
4209 rm -rf $DIR/$tdir || error "rmdir failed"
4211 run_test 112a "DNE: cross MDT rename, fail MDT1"
4214 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4215 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4216 skip "Need MDS version at least 2.7.56"
4218 ([ $FAILURE_MODE == "HARD" ] &&
4219 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4220 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4223 test_112_rename_prepare
4226 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4227 error "rename dir cross MDT failed!"
4232 rm -rf $DIR/$tdir || error "rmdir failed"
4234 run_test 112b "DNE: cross MDT rename, fail MDT2"
4237 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4238 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4239 skip "Need MDS version at least 2.7.56"
4241 ([ $FAILURE_MODE == "HARD" ] &&
4242 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4243 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4246 test_112_rename_prepare
4249 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4250 error "rename dir cross MDT failed!"
4255 rm -rf $DIR/$tdir || error "rmdir failed"
4257 run_test 112c "DNE: cross MDT rename, fail MDT3"
4260 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4261 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4262 skip "Need MDS version at least 2.7.56"
4264 ([ $FAILURE_MODE == "HARD" ] &&
4265 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4266 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4269 test_112_rename_prepare
4272 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4273 error "rename dir cross MDT failed!"
4278 rm -rf $DIR/$tdir || error "rmdir failed"
4280 run_test 112d "DNE: cross MDT rename, fail MDT4"
4283 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4284 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4285 skip "Need MDS version at least 2.7.56"
4287 ([ $FAILURE_MODE == "HARD" ] &&
4288 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4289 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4292 test_112_rename_prepare
4296 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4297 error "rename dir cross MDT failed!"
4302 rm -rf $DIR/$tdir || error "rmdir failed"
4304 run_test 112e "DNE: cross MDT rename, fail MDT1 and MDT2"
4307 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4308 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4309 skip "Need MDS version at least 2.7.56"
4311 ([ $FAILURE_MODE == "HARD" ] &&
4312 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4313 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4316 test_112_rename_prepare
4320 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4321 error "rename dir cross MDT failed!"
4326 rm -rf $DIR/$tdir || error "rmdir failed"
4328 run_test 112f "DNE: cross MDT rename, fail MDT1 and MDT3"
4331 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4332 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4333 skip "Need MDS version at least 2.7.56"
4335 ([ $FAILURE_MODE == "HARD" ] &&
4336 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4337 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4340 test_112_rename_prepare
4344 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4345 error "rename dir cross MDT failed!"
4350 rm -rf $DIR/$tdir || error "rmdir failed"
4352 run_test 112g "DNE: cross MDT rename, fail MDT1 and MDT4"
4355 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4356 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4357 skip "Need MDS version at least 2.7.56"
4359 ([ $FAILURE_MODE == "HARD" ] &&
4360 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4361 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4364 test_112_rename_prepare
4368 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4369 error "rename dir cross MDT failed!"
4374 rm -rf $DIR/$tdir || error "rmdir failed"
4376 run_test 112h "DNE: cross MDT rename, fail MDT2 and MDT3"
4379 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4380 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4381 skip "Need MDS version at least 2.7.56"
4383 ([ $FAILURE_MODE == "HARD" ] &&
4384 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4385 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4388 test_112_rename_prepare
4392 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4393 error "rename dir cross MDT failed!"
4398 rm -rf $DIR/$tdir || error "rmdir failed"
4400 run_test 112i "DNE: cross MDT rename, fail MDT2 and MDT4"
4403 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4404 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4405 skip "Need MDS version at least 2.7.56"
4407 ([ $FAILURE_MODE == "HARD" ] &&
4408 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4409 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4412 test_112_rename_prepare
4416 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4417 error "rename dir cross MDT failed!"
4422 rm -rf $DIR/$tdir || error "rmdir failed"
4424 run_test 112j "DNE: cross MDT rename, fail MDT3 and MDT4"
4427 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4428 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4429 skip "Need MDS version at least 2.7.56"
4431 ([ $FAILURE_MODE == "HARD" ] &&
4432 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4433 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4436 test_112_rename_prepare
4441 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4442 error "rename dir cross MDT failed!"
4447 rm -rf $DIR/$tdir || error "rmdir failed"
4449 run_test 112k "DNE: cross MDT rename, fail MDT1,MDT2,MDT3"
4452 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4453 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4454 skip "Need MDS version at least 2.7.56"
4456 ([ $FAILURE_MODE == "HARD" ] &&
4457 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4458 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4461 test_112_rename_prepare
4466 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4467 error "rename dir cross MDT failed!"
4472 rm -rf $DIR/$tdir || error "rmdir failed"
4474 run_test 112l "DNE: cross MDT rename, fail MDT1,MDT2,MDT4"
4477 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4478 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4479 skip "Need MDS version at least 2.7.56"
4481 ([ $FAILURE_MODE == "HARD" ] &&
4482 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4483 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4486 test_112_rename_prepare
4491 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4492 error "rename dir cross MDT failed!"
4497 rm -rf $DIR/$tdir || error "rmdir failed"
4499 run_test 112m "DNE: cross MDT rename, fail MDT1,MDT3,MDT4"
4502 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4503 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4504 skip "Need MDS version at least 2.7.56"
4506 ([ $FAILURE_MODE == "HARD" ] &&
4507 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4508 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4511 test_112_rename_prepare
4516 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4517 error "rename dir cross MDT failed!"
4522 rm -rf $DIR/$tdir || error "rmdir failed"
4524 run_test 112n "DNE: cross MDT rename, fail MDT2,MDT3,MDT4"
4527 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4528 [[ $(lustre_version_code $SINGLEMDS) -ge $(version_code 2.7.56) ]] ||
4529 skip "Need MDS version at least 2.7.56"
4531 ([ $FAILURE_MODE == "HARD" ] &&
4532 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4533 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4541 for ((j=0;j<$((MDSCOUNT));j++)); do
4542 fail_index=$((fail_index+1))
4543 index=$((fail_index % MDSCOUNT))
4544 replay_barrier mds$((index + 1))
4545 for ((i=0;i<5;i++)); do
4546 test_mkdir -i$index -c$MDSCOUNT $DIR/$tdir/test_$i ||
4547 error "create striped dir $DIR/$tdir/test_$i"
4550 fail mds$((index + 1))
4551 for ((i=0;i<5;i++)); do
4552 checkstat -t dir $DIR/$tdir/test_$i ||
4553 error "$DIR/$tdir/test_$i does not exist!"
4555 rm -rf $DIR/$tdir/test_* ||
4559 run_test 115 "failover for create/unlink striped directory"
4562 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4563 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
4564 skip "Do not support large update log before 2.7.55" &&
4566 ([ $FAILURE_MODE == "HARD" ] &&
4567 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4568 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4575 # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
4576 do_facet mds1 "lctl set_param fail_loc=0x80001702"
4577 $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/striped_dir
4580 $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
4581 error "stried_dir does not exists"
4583 run_test 116a "large update log master MDT recovery"
4586 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4587 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.55) ] &&
4588 skip "Do not support large update log before 2.7.55" &&
4591 ([ $FAILURE_MODE == "HARD" ] &&
4592 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4593 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4600 # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
4601 do_facet mds2 "lctl set_param fail_loc=0x80001702"
4602 $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/striped_dir
4605 $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
4606 error "stried_dir does not exists"
4608 run_test 116b "large update log slave MDT recovery"
4611 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4612 ([ $FAILURE_MODE == "HARD" ] &&
4613 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4614 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4620 $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
4621 $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
4624 # Set rdonly on all MDTs, so that the client sends replay
4625 # requests to all MDTs and they replay these requests at
4626 # the same time. This test verifies that recovery does
4627 # not deadlock in this case, LU-7531.
4628 for ((index = 0; index < $((MDSCOUNT)); index++)); do
4629 replay_barrier mds$((index + 1))
4630 if [ -z $mds_indexs ]; then
4631 mds_indexs="${mds_indexs}mds$((index+1))"
4633 mds_indexs="${mds_indexs},mds$((index+1))"
4637 rm -rf $DIR/$tdir/remote_dir
4638 rm -rf $DIR/$tdir/remote_dir_1
4642 rm -rf $DIR/$tdir || error "rmdir failed"
4644 run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
4647 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4648 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
4649 skip "Do not support large update log before 2.7.64" &&
4654 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
4655 error "setdirstripe fails"
4656 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
4657 error "setdirstripe fails 1"
4658 rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
4660 # OBD_FAIL_INVALIDATE_UPDATE 0x1705
4661 do_facet mds1 "lctl set_param fail_loc=0x1705"
4662 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
4663 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
4664 do_facet mds1 "lctl set_param fail_loc=0x0"
4667 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
4668 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
4673 run_test 118 "invalidate osp update will not cause update log corruption"
# Body of test 119, registered below as "timeout of normal replay does not
# cause DNE replay fails ". The function header, loop terminators and
# several listing lines (e.g. 4695-4700, 4706, 4708) are not shown.
#
# Visible flow:
#   - skip with fewer than 2 MDTs, or when the MDS version is below 2.7.64;
#   - save mds1's current mdt.$FSNAME-MDT0000.recovery_time_hard;
#   - create and remove a scratch directory, create dir_1, then create 20
#     directories stripe_dir-0..19 with "setdirstripe -i0 -c2";
#   - on mds1 set fail_loc=0x80000714 and fail_val=time_min+5, then mount
#     mds1 with recovery_time_hard=$time_min;
#   - wait for the clients' mds1 import to become FULL and run clients_up
#     (retried once);
#   - restore the saved recovery_time_hard and require that every
#     stripe_dir-$i reports "getdirstripe -c" equal to 2.
4676 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4677 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
4678 skip "Do not support large update log before 2.7.64" &&
# Saved here so it can be written back after the failover (see 4714-4715).
4681 local hard_timeout=$(do_facet mds1 \
4682 "lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
# Falls back to this node's $HOSTNAME when CLIENTS is unset or empty.
4684 local clients=${CLIENTS:-$HOSTNAME}
4685 local time_min=$(recovery_time_min)
4688 mkdir $DIR/$tdir/tmp
4689 rmdir $DIR/$tdir/tmp
4692 mkdir $DIR/$tdir/dir_1
# NOTE(review): no "local" declaration for i or stripe_count appears in the
# visible lines.
4693 for ((i = 0; i < 20; i++)); do
4694 $LFS setdirstripe -i0 -c2 $DIR/$tdir/stripe_dir-$i
4701 #define OBD_FAIL_TGT_REPLAY_DELAY 0x714
4702 do_facet mds1 $LCTL set_param fail_loc=0x80000714
4703 #sleep (timeout + 5), so mds will evict the client exports,
4704 #but DNE update recovery will keep going.
4705 do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
4707 mount_facet mds1 "-o recovery_time_hard=$time_min"
4709 wait_clients_import_state "$clients" mds1 FULL
4711 clients_up || clients_up || error "failover df: $?"
4713 #revert back the hard timeout
4714 do_facet mds1 $LCTL set_param \
4715 mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
# Each directory created before the failover must still report 2 stripes.
4717 for ((i = 0; i < 20; i++)); do
4718 stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
4719 [ $stripe_count == 2 ] || {
4720 error "stripe_dir-$i creation replay fails"
4725 run_test 119 "timeout of normal replay does not cause DNE replay fails "
# Body of test 120, registered below as "DNE fail abort should stop both
# normal and DNE replay". The function header, brace/loop terminators and
# listing lines 4743-4747 (between the create loop and the check loop) are
# not shown.
#
# Visible flow:
#   - skip with fewer than 2 MDTs, or when the MDS version is below 2.7.64;
#   - call replay_barrier_nosync on mds1;
#   - create dir-0..19 with mkdir and stripe_dir-0..19 with
#     "setdirstripe -i0 -c2", failing the test if any creation fails;
#   - afterwards require that none of those 40 entries exists.
4728 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4729 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.7.64) ] &&
4730 skip "Do not support large update log before 2.7.64" &&
4734 replay_barrier_nosync mds1
# NOTE(review): no "local" declaration for i appears in the visible lines.
4735 for ((i = 0; i < 20; i++)); do
4736 mkdir $DIR/$tdir/dir-$i || {
4737 error "create dir-$i fails"
4740 $LFS setdirstripe -i0 -c2 $DIR/$tdir/stripe_dir-$i || {
4741 error "create stripe_dir-$i fails"
# Everything created after the barrier is expected to be gone at this point.
4748 for ((i = 0; i < 20; i++)); do
4749 [ ! -e "$DIR/$tdir/dir-$i" ] || {
4750 error "dir-$i still exists"
4753 [ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
4754 error "stripe_dir-$i still exists"
4759 run_test 120 "DNE fail abort should stop both normal and DNE replay"
# Body of test 121, registered below as "lock replay timed out and race".
# The function header and many listing lines (e.g. 4764-4765, 4772-4773,
# 4775-4779, 4782-4784) are not shown.
#
# Visible flow:
#   - skip when the MDS version is below 2.10.90;
#   - save the "mds" at_max value, create $DIR/$tfile, cancel the client's
#     mdc LRU locks and start a paused background multiop ("s_s") on it;
#   - set ldlm.cancel_unused_locks_before_replay to "0" on this node;
#   - set fail_loc=0x721 fail_val=0 on $SINGLEMDS;
#   - wait for the clients' mds1 import to become FULL, run clients_up
#     (retried once) and wait for the multiop process;
#   - reset fail_loc, set cancel_unused_locks_before_replay to "1" and
#     restore the saved at_max.
4762 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.90) ] &&
4763 skip "Don't support it before 2.11" &&
4766 local at_max_saved=$(at_max_get mds)
4768 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
4769 cancel_lru_locks mdc
4771 multiop_bg_pause $DIR/$tfile s_s || error "multiop $DIR/$tfile failed"
4774 lctl set_param -n ldlm.cancel_unused_locks_before_replay "0"
4780 #define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721
4781 do_facet $SINGLEMDS "lctl set_param fail_loc=0x721 fail_val=0"
# NOTE(review): $clients is not assigned anywhere in the visible lines of
# this test; confirm it is set in the missing lines and not left empty.
4785 wait_clients_import_state "$clients" mds1 FULL
4786 clients_up || clients_up || error "failover df: $?"
# NOTE(review): the assignment of $mpid is not visible here (presumably
# taken from the multiop started at 4771 - confirm).
4789 wait $mpid || error "multiop_bg_pause pid failed"
4791 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
# Restores a literal "1" rather than a value saved before line 4774.
4792 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
4793 at_max_set $at_max_saved mds
4796 run_test 121 "lock replay timed out and race"
# Body of test 130a, registered below as "DoM file create (setstripe)
# replay". The function header and listing lines 4804-4805 (between the
# setstripe and the verification) are not shown.
#
# Visible flow: skip when the MDS version is below 2.10.90; call
# replay_barrier on $SINGLEMDS; create $DIR/$tfile with a two-component
# layout whose first component is "-E 1M -L mdt"; afterwards require that
# "getstripe -L" prints "mdt" for the file.
# NOTE(review): unlike the skip chains of tests 117-121, the skip line here
# has no trailing "&&", so no "return 0" follows it in the visible lines.
4799 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.90) ] &&
4800 skip "Do not support Data-on-MDT before 2.11"
4802 replay_barrier $SINGLEMDS
4803 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
# The command substitution is unquoted: empty output turns the test into a
# syntax error, which still ends up in the error branch.
4806 [ $($LFS getstripe -L $DIR/$tfile) == "mdt" ] ||
4807 error "Fail to replay DoM file creation"
4809 run_test 130a "DoM file create (setstripe) replay"
# Body of test 130b, registered below as "DoM file create (inherited)
# replay". The function header and listing lines 4814-4815 and 4819-4820
# are not shown.
#
# Visible flow: skip when the MDS version is below 2.10.90; apply a
# two-component layout with a "-L mdt" first component to the directory
# $DIR/$tdir; call replay_barrier on $SINGLEMDS; create a file inside that
# directory with touch; afterwards require that "getstripe -L" prints
# "mdt" for the new file.
4812 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.90) ] &&
4813 skip "Do not support Data-on-MDT before 2.11"
4816 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tdir
# The barrier is placed after the directory layout is set, so only the file
# creation below happens after it.
4817 replay_barrier $SINGLEMDS
4818 touch $DIR/$tdir/$tfile
4821 [ $($LFS getstripe -L $DIR/$tdir/$tfile) == "mdt" ] ||
4822 error "Fail to replay DoM file creation"
4824 run_test 130b "DoM file create (inherited) replay"
# Body of test 131a, registered below as "DoM file write lock replay".
# The function header and listing lines 4834-4835 (between the write and
# the verification) are not shown.
#
# Visible flow: skip when the MDS version is below 2.10.90; create
# $DIR/$tfile with a "-L mdt" first component; call replay_barrier on
# $SINGLEMDS; write the 8 characters "dom_data" with dd (bs=8 count=1);
# afterwards require that the file content reads back as "dom_data".
4827 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.90) ] &&
4828 skip "Do not support Data-on-MDT before 2.11"
4830 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
4831 replay_barrier $SINGLEMDS
# echo emits 9 bytes ("dom_data" plus newline); bs=8 count=1 asks dd for a
# single block of at most 8 bytes.
4832 echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1
4833 # lock is not canceled and will be replayed
# NOTE(review): $(cat ...) is unquoted inside [ ]; empty or multi-word
# content makes the test itself fail, which still reaches error.
4836 [ $(cat $DIR/$tfile) == "dom_data" ] ||
4837 error "Wrong file content after failover"
4839 run_test 131a "DoM file write lock replay"
# Body of test 131b, registered below as "DoM file write replay". The
# function header and listing lines 4849-4851 (between the lock cancel and
# the verification) are not shown.
#
# Visible flow: same setup and write as the preceding "DoM file write lock
# replay" case, but the client's mdc LRU locks are cancelled after the
# write; afterwards the file content must read back as "dom_data".
4842 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.10.90) ] &&
4843 skip "Do not support Data-on-MDT before 2.11"
4845 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
4846 replay_barrier $SINGLEMDS
4847 echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1
# This cancel is the visible difference from the lock-replay variant.
4848 cancel_lru_locks mdc
4852 [ $(cat $DIR/$tfile) == "dom_data" ] ||
4853 error "Wrong file content after failover"
4855 run_test 131b "DoM file write replay"
# Body of test 132a, registered below as "PFL new component instantiate
# replay". The function header, the closing fi and listing lines 4871-4873
# (between the first and second verification) are not shown.
#
# Visible flow:
#   - skip when the MDS version is below 2.9.90;
#   - create $DIR/$tfile with two components (-E 1M -c 1, -E EOF -c 2);
#   - call replay_barrier on $SINGLEMDS, then write 1 MiB of random data at
#     a 1 MiB offset (bs=1M seek=1);
#   - print the layout, record the file's md5sum and require that
#     "getstripe -I2" output contains lmm_objects;
#   - afterwards print the layout again, repeat the lmm_objects check and
#     compare a fresh md5sum with the recorded one; a mismatch is reported
#     through error_noexit.
4858 [ $(lustre_version_code $SINGLEMDS) -lt $(version_code 2.9.90) ] &&
4859 skip "Do not support PFL files before 2.10"
4861 $LFS setstripe -E 1M -c 1 -E EOF -c 2 $DIR/$tfile
4862 replay_barrier $SINGLEMDS
4863 # write over the first component size cause next component instantiation
4864 dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 seek=1 ||
4865 error "dd to $DIR/$tfile failed"
# NOTE(review): this and line 4874 call "lfs" directly while the rest of the
# test uses $LFS.
4866 lfs getstripe $DIR/$tfile
# NOTE(review): no "local" declaration for cksum or cksum2 appears in the
# visible lines.
4868 cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
# NOTE(review): the command selects component ID 2 (-I2) while the error
# text says "Component #1".
4869 $LFS getstripe -I2 $DIR/$tfile | grep -q lmm_objects ||
4870 error "Component #1 was not instantiated"
4874 lfs getstripe $DIR/$tfile
4875 $LFS getstripe -I2 $DIR/$tfile | grep -q lmm_objects ||
4876 error "Component #1 instantiation was not replayed"
4877 cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
# Both operands are unquoted; an empty checksum would break the test syntax.
4878 if [ $cksum != $cksum2 ] ; then
4879 error_noexit "New cksum $cksum2 does not match original $cksum"
4882 run_test 132a "PFL new component instantiate replay"
# Body of test 133, registered below as "check resend of ongoing requests
# for lwp during failover". The function header and many listing lines
# (e.g. 4889-4890, 4895-4896, 4898, 4906-4910) are not shown.
#
# Visible flow:
#   - skip with fewer than 2 MDTs, or in HARD failure mode when mds1 and
#     mds2 report the same host;
#   - create $DIR/$tdir and, inside it, remote_dir via "$LFS mkdir -i 1";
#   - on mds2 set seq.srv*MDT0001.space=clear;
#   - mount the client on this host and return 1 if client_up fails;
#   - on mds1 set fail_val=700 fail_loc=0x80000123, then start copying
#     /etc/hosts into remote_dir in the background;
#   - wait for the background copy and remove $DIR/$tdir.
4885 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4886 ([ $FAILURE_MODE == "HARD" ] &&
4887 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4888 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4891 local remote_dir=$DIR/$tdir/remote_dir
4893 mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
# Exit status of this mkdir is not checked.
4894 $LFS mkdir -i 1 $remote_dir
4897 do_facet mds2 $LCTL set_param seq.srv*MDT0001.space=clear
# NOTE(review): the mount result is not checked here; only client_up is.
4899 zconf_mount $(hostname) $MOUNT
4900 client_up || return 1
4902 #define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123
4904 do_facet mds1 $LCTL set_param fail_val=700 fail_loc=0x80000123
4905 cp /etc/hosts $remote_dir/file &
# NOTE(review): the assignment of $pid is not visible here (presumably $!
# of the cp above - confirm in the missing lines 4906-4910).
4911 wait $pid || error "cp failed"
4912 rm -rf $DIR/$tdir || error "rmdir failed"
4916 run_test 133 "check resend of ongoing requests for lwp during failover"
# Last top-level statement of this view, after all run_test registrations.
# NOTE(review): check_and_cleanup_lustre is not defined in the visible part
# of this file - presumably it comes from the sourced test-framework.sh and
# pairs with the check_and_setup_lustre call near the top; confirm there.
4919 check_and_cleanup_lustre