5 LUSTRE=${LUSTRE:-$(dirname $0)/..}
6 . $LUSTRE/tests/test-framework.sh
10 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT "
11 # bug number for skipped test: LU-13614
14 if [ "$mds1_FSTYPE" = zfs ]; then
19 # bug number for skipped tests: LU-9795
20 ALWAYS_EXCEPT="$ALWAYS_EXCEPT 121"
22 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
26 CHECK_GRANT=${CHECK_GRANT:-"yes"}
27 GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
29 require_dsh_mds || exit 0
30 check_and_setup_lustre
35 rm -rf $DIR/[df][0-9]* $DIR/f.$TESTSUITE.*
37 # LU-482 Avert LVM and VM inability to flush caches in pre-2.6.33 kernels
38 if [ $LINUX_VERSION_CODE -lt $(version_code 2.6.33) ]; then
40 do_facet $SINGLEMDS sync
45 test_0a() { # was test_0
46 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
47 replay_barrier $SINGLEMDS
51 run_test 0a "empty replay"
54 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
56 # this test attempts to trigger a race in the precreation code,
57 # and must run before any other objects are created on the filesystem
59 createmany -o $DIR/$tfile 20 || error "createmany -o $DIR/$tfile failed"
60 unlinkmany $DIR/$tfile 20 || error "unlinkmany $DIR/$tfile failed"
62 run_test 0b "ensure object created after recover exists. (3284)"
65 replay_barrier $SINGLEMDS
68 facet_failover $SINGLEMDS
69 zconf_mount $(hostname) $MOUNT || error "mount fails"
70 client_up || error "post-failover df failed"
71 # file shouldn't exist if replay-barrier works as expected
72 rm $DIR/$tfile && error "File exists and it shouldn't"
75 run_test 0c "check replay-barrier"
78 replay_barrier $SINGLEMDS
80 facet_failover $SINGLEMDS
81 zconf_mount $(hostname) $MOUNT || error "mount fails"
82 client_up || error "post-failover df failed"
84 run_test 0d "expired recovery with no clients"
87 replay_barrier $SINGLEMDS
90 $CHECKSTAT -t file $DIR/$tfile ||
91 error "$CHECKSTAT $DIR/$tfile attribute check failed"
94 run_test 1 "simple create"
97 replay_barrier $SINGLEMDS
100 $CHECKSTAT -t file $DIR/$tfile ||
101 error "$CHECKSTAT $DIR/$tfile attribute check failed"
107 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
108 replay_barrier $SINGLEMDS
111 $CHECKSTAT -t file $DIR/$tfile ||
112 error "$CHECKSTAT $DIR/$tfile attribute check failed"
118 replay_barrier $SINGLEMDS
119 $LFS setstripe -c $OSTCOUNT $DIR/$tfile
121 $CHECKSTAT -t file $DIR/$tfile ||
122 error "$CHECKSTAT $DIR/$tfile check failed"
124 run_test 2c "setstripe replay"
127 [[ "$mds1_FSTYPE" = zfs ]] &&
128 [[ "$MDS1_VERSION" -lt $(version_code 2.12.51) ]] &&
129 skip "requires LU-10143 fix on MDS"
130 replay_barrier $SINGLEMDS
131 $LFS setdirstripe -i 0 -c $MDSCOUNT $DIR/$tdir
133 $CHECKSTAT -t dir $DIR/$tdir ||
134 error "$CHECKSTAT $DIR/$tdir check failed"
136 run_test 2d "setdirstripe replay"
139 testid=$(echo $TESTNAME | tr '_' ' ')
140 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
141 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000013b"
142 openfile -f O_CREAT:O_EXCL $DIR/$tfile &
144 replay_barrier $SINGLEMDS
147 $CHECKSTAT -t file $DIR/$tfile ||
148 error "$CHECKSTAT $DIR/$tfile attribute check failed"
149 dmesg | tac | sed "/$testid/,$ d" | \
150 grep "Open request replay failed with -17" &&
151 error "open replay failed" || true
153 run_test 2e "O_CREAT|O_EXCL create replay"
156 local file=$DIR/$tfile
157 replay_barrier $SINGLEMDS
159 openfile -f O_DIRECTORY $file
161 $CHECKSTAT -t file $file ||
162 error "$CHECKSTAT $file attribute check failed"
165 run_test 3a "replay failed open(O_DIRECTORY)"
168 replay_barrier $SINGLEMDS
169 #define OBD_FAIL_MDS_OPEN_PACK | OBD_FAIL_ONCE
170 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000114"
172 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
174 $CHECKSTAT -t file $DIR/$tfile &&
175 error "$CHECKSTAT $DIR/$tfile attribute check should fail"
178 run_test 3b "replay failed open -ENOMEM"
181 replay_barrier $SINGLEMDS
182 #define OBD_FAIL_MDS_ALLOC_OBDO | OBD_FAIL_ONCE
183 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000128"
185 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
188 $CHECKSTAT -t file $DIR/$tfile &&
189 error "$CHECKSTAT $DIR/$tfile attribute check should fail"
192 run_test 3c "replay failed open -ENOMEM"
194 test_4a() { # was test_4
195 replay_barrier $SINGLEMDS
196 for i in $(seq 10); do
197 echo "tag-$i" > $DIR/$tfile-$i
200 for i in $(seq 10); do
201 grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
204 run_test 4a "|x| 10 open(O_CREAT)s"
207 for i in $(seq 10); do
208 echo "tag-$i" > $DIR/$tfile-$i
210 replay_barrier $SINGLEMDS
213 $CHECKSTAT -t file $DIR/$tfile-* &&
214 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
217 run_test 4b "|x| rm 10 files"
219 # The idea is to get past the first block of precreated files on both
220 # osts, and then replay.
222 replay_barrier $SINGLEMDS
223 for i in $(seq 220); do
224 echo "tag-$i" > $DIR/$tfile-$i
227 for i in $(seq 220); do
228 grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
232 # waiting for commitment of removal
234 run_test 5 "|x| 220 open(O_CREAT)"
236 test_6a() { # was test_6
237 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
238 replay_barrier $SINGLEMDS
239 mcreate $DIR/$tdir/$tfile
241 $CHECKSTAT -t dir $DIR/$tdir ||
242 error "$CHECKSTAT $DIR/$tdir attribute check failed"
243 $CHECKSTAT -t file $DIR/$tdir/$tfile ||
244 error "$CHECKSTAT $DIR/$tdir/$tfile attribute check failed"
246 # waiting for log process thread
248 run_test 6a "mkdir + contained create"
251 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
252 replay_barrier $SINGLEMDS
255 $CHECKSTAT -t dir $DIR/$tdir &&
256 error "$CHECKSTAT $DIR/$tdir attribute check should fail" ||
259 run_test 6b "|X| rmdir"
262 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
263 replay_barrier $SINGLEMDS
264 mcreate $DIR/$tdir/$tfile
266 $CHECKSTAT -t dir $DIR/$tdir ||
267 error "$CHECKSTAT $DIR/$tdir attribute check failed"
268 $CHECKSTAT -t file $DIR/$tdir/$tfile ||
269 error "$CHECKSTAT $DIR/$tdir/$tfile attribute check failed"
272 run_test 7 "mkdir |X| contained create"
275 replay_barrier $SINGLEMDS
276 multiop_bg_pause $DIR/$tfile mo_c ||
277 error "multiop mknod $DIR/$tfile failed"
281 $CHECKSTAT -t file $DIR/$tfile ||
282 error "$CHECKSTAT $DIR/$tfile attribute check failed"
283 kill -USR1 $MULTIPID || error "multiop mknod $MULTIPID not running"
284 wait $MULTIPID || error "multiop mknod $MULTIPID failed"
287 run_test 8 "creat open |X| close"
290 replay_barrier $SINGLEMDS
292 local old_inum=$(ls -i $DIR/$tfile | awk '{print $1}')
294 local new_inum=$(ls -i $DIR/$tfile | awk '{print $1}')
296 echo " old_inum == $old_inum, new_inum == $new_inum"
297 if [ $old_inum -eq $new_inum ] ;
299 echo "old_inum and new_inum match"
301 echo " old_inum and new_inum do not match"
302 error "old index($old_inum) does not match new index($new_inum)"
306 run_test 9 "|X| create (same inum/gen)"
309 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
310 replay_barrier $SINGLEMDS
311 mv $DIR/$tfile $DIR/$tfile-2
314 $CHECKSTAT $DIR/$tfile &&
315 error "$CHECKSTAT $DIR/$tfile attribute check should fail"
316 $CHECKSTAT $DIR/$tfile-2 ||
317 error "$CHECKSTAT $DIR/$tfile-2 attribute check failed"
321 run_test 10 "create |X| rename unlink"
324 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
325 echo "old" > $DIR/$tfile
326 mv $DIR/$tfile $DIR/$tfile-2
327 replay_barrier $SINGLEMDS
328 echo "new" > $DIR/$tfile
330 grep old $DIR/$tfile-2
332 grep new $DIR/$tfile || error "grep $DIR/$tfile failed"
333 grep old $DIR/$tfile-2 || error "grep $DIR/$tfile-2 failed"
335 run_test 11 "create open write rename |X| create-old-name read"
338 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
339 multiop_bg_pause $DIR/$tfile o_tSc ||
340 error "multiop_bg_pause $DIR/$tfile failed"
343 replay_barrier $SINGLEMDS
344 kill -USR1 $pid || error "multiop $pid not running"
345 wait $pid || error "multiop $pid failed"
348 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
351 run_test 12 "open, unlink |X| close"
353 # 1777 - replay open after committed chmod that would make
354 # a regular open a failure
356 mcreate $DIR/$tfile || error "mcreate $DIR/$tfile failed"
357 multiop_bg_pause $DIR/$tfile O_wc ||
358 error "multiop_bg_pause $DIR/$tfile failed"
361 $CHECKSTAT -p 0 $DIR/$tfile ||
362 error "$CHECKSTAT $DIR/$tfile attribute check failed"
363 replay_barrier $SINGLEMDS
365 kill -USR1 $pid || error "multiop $pid not running"
366 wait $pid || error "multiop $pid failed"
368 $CHECKSTAT -s 1 -p 0 $DIR/$tfile ||
369 error "second $CHECKSTAT $DIR/$tfile attribute check failed"
370 rm $DIR/$tfile || error "rm $DIR/$tfile failed"
373 run_test 13 "open chmod 0 |x| write close"
376 multiop_bg_pause $DIR/$tfile O_tSc ||
377 error "multiop_bg_pause $DIR/$tfile failed"
380 replay_barrier $SINGLEMDS
381 kill -USR1 $pid || error "multiop $pid not running"
382 wait $pid || error "multiop $pid failed"
385 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
388 run_test 14 "open(O_CREAT), unlink |X| close"
391 multiop_bg_pause $DIR/$tfile O_tSc ||
392 error "multiop_bg_pause $DIR/$tfile failed"
395 replay_barrier $SINGLEMDS
396 touch $DIR/$tfile-1 || error "touch $DIR/$tfile-1 failed"
397 kill -USR1 $pid || error "multiop $pid not running"
398 wait $pid || error "multiop $pid failed"
401 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
402 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
405 run_test 15 "open(O_CREAT), unlink |X| touch new, close"
408 replay_barrier $SINGLEMDS
411 mcreate $DIR/$tfile-2
413 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
414 [ -e $DIR/$tfile-2 ] || error "file $DIR/$tfile-2 does not exist"
415 munlink $DIR/$tfile-2 || error "munlink $DIR/$tfile-2 failed"
417 run_test 16 "|X| open(O_CREAT), unlink, touch new, unlink new"
420 replay_barrier $SINGLEMDS
421 multiop_bg_pause $DIR/$tfile O_c ||
422 error "multiop_bg_pause $DIR/$tfile failed"
425 kill -USR1 $pid || error "multiop $pid not running"
426 wait $pid || error "multiop $pid failed"
427 $CHECKSTAT -t file $DIR/$tfile ||
428 error "$CHECKSTAT $DIR/$tfile attribute check failed"
431 run_test 17 "|X| open(O_CREAT), |replay| close"
434 replay_barrier $SINGLEMDS
435 multiop_bg_pause $DIR/$tfile O_tSc ||
436 error "multiop_bg_pause $DIR/$tfile failed"
439 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
440 echo "pid: $pid will close"
441 kill -USR1 $pid || error "multiop $pid not running"
442 wait $pid || error "multiop $pid failed"
445 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
446 [ -e $DIR/$tfile-2 ] || error "file $DIR/$tfile-2 does not exist"
447 # this touch frequently fails
448 touch $DIR/$tfile-3 || error "touch $DIR/$tfile-3 failed"
449 munlink $DIR/$tfile-2 || error "munlink $DIR/$tfile-2 failed"
450 munlink $DIR/$tfile-3 || error "munlink $DIR/$tfile-3 failed"
453 run_test 18 "open(O_CREAT), unlink, touch new, close, touch, unlink"
455 # bug 1855 (a simpler form of test_11 above)
457 replay_barrier $SINGLEMDS
459 echo "old" > $DIR/$tfile
460 mv $DIR/$tfile $DIR/$tfile-2
461 grep old $DIR/$tfile-2
463 grep old $DIR/$tfile-2 || error "grep $DIR/$tfile-2 failed"
465 run_test 19 "mcreate, open, write, rename "
467 test_20a() { # was test_20
468 replay_barrier $SINGLEMDS
469 multiop_bg_pause $DIR/$tfile O_tSc ||
470 error "multiop_bg_pause $DIR/$tfile failed"
475 kill -USR1 $pid || error "multiop $pid not running"
476 wait $pid || error "multiop $pid failed"
477 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
480 run_test 20a "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
482 test_20b() { # bug 10480
483 local wait_timeout=$((TIMEOUT * 4))
484 local extra=$(fs_log_size)
488 save_layout_restore_at_exit $MOUNT
489 $LFS setstripe -i 0 -c 1 $DIR
491 local beforeused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
493 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 &
494 while [ ! -e $DIR/$tfile ] ; do
495 sleep 0.01 # give dd a chance to start
498 $LFS getstripe $DIR/$tfile || error "$LFS getstripe $DIR/$tfile failed"
500 rm -f $DIR/$tfile || error "rm -f $DIR/$tfile failed"
502 client_up || client_up || true # reconnect
504 do_facet $SINGLEMDS "lctl set_param -n osd*.*MDT*.force_sync=1"
506 fail $SINGLEMDS # start orphan recovery
507 wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
508 wait_delete_completed $wait_timeout || error "delete did not finish"
512 local afterused=$(df -P $DIR | tail -1 | awk '{ print $3 }')
513 log "before $beforeused, after $afterused"
515 (( $beforeused + $extra >= $afterused )) && break
516 n_attempts=$((n_attempts + 1))
517 [ $n_attempts -gt 3 ] &&
518 error "after $afterused > before $beforeused + $extra"
520 wait_zfs_commit $SINGLEMDS 5
525 run_test 20b "write, unlink, eviction, replay (test mds_cleanup_orphans)"
527 test_20c() { # bug 10480
528 multiop_bg_pause $DIR/$tfile Ow_c ||
529 error "multiop_bg_pause $DIR/$tfile failed"
535 client_up || client_up || true # reconnect
537 kill -USR1 $pid || error "multiop $pid not running"
538 wait $pid || error "multiop $pid failed"
539 [ -s $DIR/$tfile ] || error "File was truncated"
543 run_test 20c "check that client eviction does not affect file content"
546 replay_barrier $SINGLEMDS
547 multiop_bg_pause $DIR/$tfile O_tSc ||
548 error "multiop_bg_pause $DIR/$tfile failed"
551 touch $DIR/$tfile-1 || error "touch $DIR/$tfile-1 failed"
554 kill -USR1 $pid || error "multiop $pid not running"
555 wait $pid || error "multiop $pid failed"
556 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
557 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
560 run_test 21 "|X| open(O_CREAT), unlink touch new, replay, close (test mds_cleanup_orphans)"
563 multiop_bg_pause $DIR/$tfile O_tSc ||
564 error "multiop_bg_pause $DIR/$tfile failed"
567 replay_barrier $SINGLEMDS
571 kill -USR1 $pid || error "multiop $pid not running"
572 wait $pid || error "multiop $pid failed"
573 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
576 run_test 22 "open(O_CREAT), |X| unlink, replay, close (test mds_cleanup_orphans)"
579 multiop_bg_pause $DIR/$tfile O_tSc ||
580 error "multiop_bg_pause $DIR/$tfile failed"
583 replay_barrier $SINGLEMDS
585 touch $DIR/$tfile-1 || error "touch $DIR/$tfile-1 failed"
588 kill -USR1 $pid || error "multiop $pid not running"
589 wait $pid || error "multiop $pid failed"
590 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
591 touch $DIR/$tfile-2 || error "touch $DIR/$tfile-2 failed"
594 run_test 23 "open(O_CREAT), |X| unlink touch new, replay, close (test mds_cleanup_orphans)"
597 multiop_bg_pause $DIR/$tfile O_tSc ||
598 error "multiop_bg_pause $DIR/$tfile failed"
601 replay_barrier $SINGLEMDS
604 kill -USR1 $pid || error "multiop $pid not running"
605 wait $pid || error "multiop $pid failed"
606 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
609 run_test 24 "open(O_CREAT), replay, unlink, close (test mds_cleanup_orphans)"
612 multiop_bg_pause $DIR/$tfile O_tSc ||
613 error "multiop_bg_pause $DIR/$tfile failed"
617 replay_barrier $SINGLEMDS
619 kill -USR1 $pid || error "multiop $pid not running"
620 wait $pid || error "multiop $pid failed"
621 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
624 run_test 25 "open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
627 replay_barrier $SINGLEMDS
628 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
629 error "multiop_bg_pause $DIR/$tfile-1 failed"
631 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
632 error "multiop_bg_pause $DIR/$tfile-2 failed"
636 kill -USR1 $pid2 || error "second multiop $pid2 not running"
637 wait $pid2 || error "second multiop $pid2 failed"
640 kill -USR1 $pid1 || error "multiop $pid1 not running"
641 wait $pid1 || error "multiop $pid1 failed"
642 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
643 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
646 run_test 26 "|X| open(O_CREAT), unlink two, close one, replay, close one (test mds_cleanup_orphans)"
649 replay_barrier $SINGLEMDS
650 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
651 error "multiop_bg_pause $DIR/$tfile-1 failed"
653 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
654 error "multiop_bg_pause $DIR/$tfile-2 failed"
660 kill -USR1 $pid1 || error "multiop $pid1 not running"
661 wait $pid1 || error "multiop $pid1 failed"
662 kill -USR1 $pid2 || error "second multiop $pid2 not running"
663 wait $pid2 || error "second multiop $pid2 failed"
664 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
665 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
668 run_test 27 "|X| open(O_CREAT), unlink two, replay, close two (test mds_cleanup_orphans)"
671 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
672 error "multiop_bg_pause $DIR/$tfile-1 failed"
674 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
675 error "multiop_bg_pause $DIR/$tfile-2 failed"
677 replay_barrier $SINGLEMDS
680 kill -USR1 $pid2 || error "second multiop $pid2 not running"
681 wait $pid2 || error "second multiop $pid2 failed"
684 kill -USR1 $pid1 || error "multiop $pid1 not running"
685 wait $pid1 || error "multiop $pid1 failed"
686 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
687 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
690 run_test 28 "open(O_CREAT), |X| unlink two, close one, replay, close one (test mds_cleanup_orphans)"
693 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
694 error "multiop_bg_pause $DIR/$tfile-1 failed"
696 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
697 error "multiop_bg_pause $DIR/$tfile-2 failed"
699 replay_barrier $SINGLEMDS
704 kill -USR1 $pid1 || error "multiop $pid1 not running"
705 wait $pid1 || error "multiop $pid1 failed"
706 kill -USR1 $pid2 || error "second multiop $pid2 not running"
707 wait $pid2 || error "second multiop $pid2 failed"
708 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
709 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
712 run_test 29 "open(O_CREAT), |X| unlink two, replay, close two (test mds_cleanup_orphans)"
715 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
716 error "multiop_bg_pause $DIR/$tfile-1 failed"
718 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
719 error "multiop_bg_pause $DIR/$tfile-2 failed"
724 replay_barrier $SINGLEMDS
726 kill -USR1 $pid1 || error "multiop $pid1 not running"
727 wait $pid1 || error "multiop $pid1 failed"
728 kill -USR1 $pid2 || error "second multiop $pid2 not running"
729 wait $pid2 || error "second multiop $pid2 failed"
730 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
731 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
734 run_test 30 "open(O_CREAT) two, unlink two, replay, close two (test mds_cleanup_orphans)"
737 multiop_bg_pause $DIR/$tfile-1 O_tSc ||
738 error "multiop_bg_pause $DIR/$tfile-1 failed"
740 multiop_bg_pause $DIR/$tfile-2 O_tSc ||
741 error "multiop_bg_pause $DIR/$tfile-2 failed"
745 replay_barrier $SINGLEMDS
748 kill -USR1 $pid1 || error "multiop $pid1 not running"
749 wait $pid1 || error "multiop $pid1 failed"
750 kill -USR1 $pid2 || error "second multiop $pid2 not running"
751 wait $pid2 || error "second multiop $pid2 failed"
752 [ -e $DIR/$tfile-1 ] && error "file $DIR/$tfile-1 should not exist"
753 [ -e $DIR/$tfile-2 ] && error "file $DIR/$tfile-2 should not exist"
756 run_test 31 "open(O_CREAT) two, unlink one, |X| unlink one, close two (test mds_cleanup_orphans)"
758 # tests for bug 2104; completion without crashing is success. The close is
759 # stale, but we always return 0 for close, so the app never sees it.
761 multiop_bg_pause $DIR/$tfile O_c ||
762 error "multiop_bg_pause $DIR/$tfile failed"
764 multiop_bg_pause $DIR/$tfile O_c ||
765 error "second multiop_bg_pause $DIR/$tfile failed"
768 client_up || client_up || error "client_up failed"
769 kill -USR1 $pid1 || error "multiop $pid1 not running"
770 kill -USR1 $pid2 || error "second multiop $pid2 not running"
771 wait $pid1 || error "multiop $pid1 failed"
772 wait $pid2 || error "second multiop $pid2 failed"
775 run_test 32 "close() notices client eviction; close() after client eviction"
778 createmany -o $DIR/$tfile-%d 10 ||
779 error "createmany create $DIR/$tfile failed"
780 replay_barrier_nosync $SINGLEMDS
781 fail_abort $SINGLEMDS
782 # recreate shouldn't fail
783 createmany -o $DIR/$tfile--%d 10 ||
784 error "createmany recreate $DIR/$tfile failed"
788 run_test 33a "fid seq shouldn't be reused after abort recovery"
791 #define OBD_FAIL_SEQ_ALLOC 0x1311
792 do_facet $SINGLEMDS "lctl set_param fail_loc=0x1311"
794 createmany -o $DIR/$tfile-%d 10
795 replay_barrier_nosync $SINGLEMDS
796 fail_abort $SINGLEMDS
797 # recreate shouldn't fail
798 createmany -o $DIR/$tfile--%d 10 ||
799 error "createmany recreate $DIR/$tfile failed"
803 run_test 33b "test fid seq allocation"
806 multiop_bg_pause $DIR/$tfile O_c ||
807 error "multiop_bg_pause $DIR/$tfile failed"
811 replay_barrier $SINGLEMDS
812 fail_abort $SINGLEMDS
813 kill -USR1 $pid || error "multiop $pid not running"
814 wait $pid || error "multiop $pid failed"
815 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
819 run_test 34 "abort recovery before client does replay (test mds_cleanup_orphans)"
821 # bug 2278 - generate one orphan on OST, then destroy it during recovery from llog
823 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
825 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
826 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000119"
831 # give a chance to remove from MDS
832 fail_abort $SINGLEMDS
833 $CHECKSTAT -t file $DIR/$tfile &&
834 error "$CHECKSTAT $DIR/$tfile attribute check should fail" ||
837 run_test 35 "test recovery from llog for unlink op"
839 # b=2432 resent cancel after replay uses wrong cookie,
840 # so don't resend cancels
842 replay_barrier $SINGLEMDS
844 checkstat $DIR/$tfile
845 facet_failover $SINGLEMDS
847 if $LCTL dk | grep "stale lock .*cookie"; then
848 error "cancel after replay failed"
851 run_test 36 "don't resend cancel"
854 # directory orphans can't be unlinked from PENDING directory
856 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $tdir failed"
857 rmdir $DIR/$tdir/$tfile 2>/dev/null
858 multiop_bg_pause $DIR/$tdir/$tfile dD_c ||
859 error "multiop_bg_pause $tfile failed"
861 rmdir $DIR/$tdir/$tfile
863 replay_barrier $SINGLEMDS
864 # clear the dmesg buffer so we only see errors from this recovery
865 do_facet $SINGLEMDS dmesg -c >/dev/null
866 fail_abort $SINGLEMDS
867 kill -USR1 $pid || error "multiop $pid not running"
868 do_facet $SINGLEMDS dmesg | grep "error unlinking orphan" &&
869 error "error unlinking files"
870 wait $pid || error "multiop $pid failed"
874 run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
877 createmany -o $DIR/$tfile-%d 800 ||
878 error "createmany -o $DIR/$tfile failed"
879 unlinkmany $DIR/$tfile-%d 0 400 || error "unlinkmany $DIR/$tfile failed"
880 replay_barrier $SINGLEMDS
882 unlinkmany $DIR/$tfile-%d 400 400 ||
883 error "unlinkmany $DIR/$tfile 400 failed"
885 $CHECKSTAT -t file $DIR/$tfile-* &&
886 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
889 run_test 38 "test recovery from unlink llog (test llog_gen_rec) "
891 test_39() { # bug 4176
892 createmany -o $DIR/$tfile-%d 800 ||
893 error "createmany -o $DIR/$tfile failed"
894 replay_barrier $SINGLEMDS
895 unlinkmany $DIR/$tfile-%d 0 400
897 unlinkmany $DIR/$tfile-%d 400 400 ||
898 error "unlinkmany $DIR/$tfile 400 failed"
900 $CHECKSTAT -t file $DIR/$tfile-* &&
901 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
904 run_test 39 "test recovery from unlink llog (test llog_gen_rec) "
907 lctl get_param -n osc.*.stats | awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }'
911 # make sure that a read to one osc doesn't try to double-unlock its page just
912 # because another osc is invalid. trigger_group_io used to mistakenly return
913 # an error if any oscs were invalid even after having successfully put rpcs
914 # on valid oscs. This was fatal if the caller was ll_readpage who unlocked
915 # the page, guaranteeing that the unlock from the RPC completion would
916 # assert on trying to unlock the unlocked page.
918 [ $OSTCOUNT -lt 2 ] && skip_env "needs >= 2 OSTs" && return
920 local f=$MOUNT/$tfile
921 # make sure the start of the file is ost1
922 $LFS setstripe -S $((128 * 1024)) -i 0 $f
923 do_facet client dd if=/dev/zero of=$f bs=4k count=1 ||
924 error "dd on client failed"
926 # fail ost2 and read from ost1
927 local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $ost2_svc)
928 local osc2dev=$(do_facet $SINGLEMDS "lctl get_param -n devices" |
929 grep $mdtosc | awk '{print $1}')
930 [ -z "$osc2dev" ] && echo "OST: $ost2_svc" &&
931 lctl get_param -n devices &&
932 error "OST 2 $osc2dev does not exist"
933 do_facet $SINGLEMDS $LCTL --device $osc2dev deactivate ||
934 error "deactive device on $SINGLEMDS failed"
935 do_facet client dd if=$f of=/dev/null bs=4k count=1 ||
936 error "second dd on client failed"
937 do_facet $SINGLEMDS $LCTL --device $osc2dev activate ||
938 error "active device on $SINGLEMDS failed"
941 run_test 41 "read from a valid osc while other oscs are invalid"
943 # test MDS recovery after ost failure
945 blocks=$(df -P $MOUNT | tail -n 1 | awk '{ print $2 }')
946 createmany -o $DIR/$tfile-%d 800 ||
947 error "createmany -o $DIR/$tfile failed"
949 unlinkmany $DIR/$tfile-%d 0 400
951 lctl set_param debug=-1
954 # osc is evicted, fs is smaller (but only with failout OSTs, bug 7287)
955 #blocks_after=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
956 #[ $blocks_after -lt $blocks ] || return 1
957 echo "wait for MDS to timeout and recover"
958 sleep $((TIMEOUT * 2))
960 unlinkmany $DIR/$tfile-%d 400 400 ||
961 error "unlinkmany $DIR/$tfile 400 failed"
962 $CHECKSTAT -t file $DIR/$tfile-* &&
963 error "$CHECKSTAT $DIR/$tfile-* attribute check should fail" ||
966 run_test 42 "recovery after ost failure"
968 # timeout in MDS/OST recovery RPC will LBUG MDS
969 test_43() { # bug 2530
970 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
972 replay_barrier $SINGLEMDS
974 # OBD_FAIL_OST_CREATE_NET 0x204
975 do_facet ost1 "lctl set_param fail_loc=0x80000204"
981 run_test 43 "mds osc import failure during recovery; don't LBUG"
983 test_44a() { # was test_44
986 local mdcdev=$($LCTL dl |
987 awk "/${FSNAME}-MDT0000-mdc-/ {if (\$2 == \"UP\") {print \$1}}")
988 [ "$mdcdev" ] || error "${FSNAME}-MDT0000-mdc- not UP"
989 [ $(echo $mdcdev | wc -w) -eq 1 ] ||
990 { $LCTL dl; error "looking for mdcdev=$mdcdev"; }
992 # adaptive timeouts slow this way down
993 if at_is_enabled; then
994 at_max_saved=$(at_max_get mds)
998 for i in $(seq 1 10); do
999 echo "$i of 10 ($(date +%s))"
1000 do_facet $SINGLEMDS \
1001 "lctl get_param -n md[ts].*.mdt.timeouts | grep service"
1002 #define OBD_FAIL_TGT_CONN_RACE 0x701
1003 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701"
1004 # lctl below may fail; that is a valid case
1005 $LCTL --device $mdcdev recover
1008 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1009 [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds
1012 run_test 44a "race in target handle connect"
1015 local mdcdev=$($LCTL dl |
1016 awk "/${FSNAME}-MDT0000-mdc-/ {if (\$2 == \"UP\") {print \$1}}")
1017 [ "$mdcdev" ] || error "${FSNAME}-MDT0000-mdc not up"
1018 [ $(echo $mdcdev | wc -w) -eq 1 ] ||
1019 { echo mdcdev=$mdcdev; $LCTL dl;
1020 error "more than one ${FSNAME}-MDT0000-mdc"; }
1022 for i in $(seq 1 10); do
1023 echo "$i of 10 ($(date +%s))"
1024 do_facet $SINGLEMDS \
1025 "lctl get_param -n md[ts].*.mdt.timeouts | grep service"
1026 #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
1027 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704"
1028 # lctl below may fail; that is a valid case
1029 $LCTL --device $mdcdev recover
1034 run_test 44b "race in target handle connect"
1037 replay_barrier $SINGLEMDS
1038 createmany -m $DIR/$tfile-%d 100 || error "failed to create directories"
1039 #define OBD_FAIL_TGT_RCVG_FLAG 0x712
1040 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000712"
1041 fail_abort $SINGLEMDS
1042 unlinkmany $DIR/$tfile-%d 100 && error "unliked after fail abort"
1044 unlinkmany $DIR/$tfile-%d 100 && error "unliked after fail"
1047 run_test 44c "race in target handle connect"
1049 # Handle failed close
1051 local mdcdev=$($LCTL get_param -n devices |
1052 awk "/ ${FSNAME}-MDT0000-mdc-/ {print \$1}")
1053 [ "$mdcdev" ] || error "${FSNAME}-MDT0000-mdc not up"
1054 [ $(echo $mdcdev | wc -w) -eq 1 ] ||
1055 { echo mdcdev=$mdcdev; $LCTL dl;
1056 error "more than one ${FSNAME}-MDT0000-mdc"; }
1058 $LCTL --device $mdcdev recover ||
1059 error "$LCTL --device $mdcdev recover failed"
1061 multiop_bg_pause $DIR/$tfile O_c ||
1062 error "multiop_bg_pause $DIR/$tfile failed"
1065 # This will cause the CLOSE to fail before even
1066 # allocating a reply buffer
1067 $LCTL --device $mdcdev deactivate ||
1068 error "$LCTL --device $mdcdev deactivate failed"
1071 kill -USR1 $pid || error "multiop $pid not running"
1072 wait $pid || error "multiop $pid failed"
1074 $LCTL --device $mdcdev activate ||
1075 error "$LCTL --device $mdcdev activate failed"
1078 $CHECKSTAT -t file $DIR/$tfile ||
1079 error "$CHECKSTAT $DIR/$tfile attribute check failed"
1082 run_test 45 "Handle failed close"
1085 drop_reply "touch $DIR/$tfile"
1087 # ironically, the previous test, 45, will cause a real forced close,
1088 # so just look for one for this test
1089 local FID=$($LFS path2fid $tfile)
1090 $LCTL dk | grep -i "force closing file handle $FID" &&
1091 error "found force closing in dmesg"
1094 run_test 46 "Don't leak file handle after open resend (3325)"
1096 test_47() { # bug 2824
1097 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1099 # create some files to make sure precreate has been done on all
1100 # OSTs. (just in case this test is run independently)
1101 createmany -o $DIR/$tfile 20 ||
1102 error "createmany create $DIR/$tfile failed"
1104 # OBD_FAIL_OST_CREATE_NET 0x204
1106 do_facet ost1 "lctl set_param fail_loc=0x80000204"
1107 client_up || error "client_up failed"
1109 # let the MDS discover the OST failure, attempt to recover, fail
1110 # and recover again.
1111 sleep $((3 * TIMEOUT))
1113 # Without 2824, this createmany would hang
1114 createmany -o $DIR/$tfile 20 ||
1115 error "createmany recraete $DIR/$tfile failed"
1116 unlinkmany $DIR/$tfile 20 || error "unlinkmany $DIR/$tfile failed"
1120 run_test 47 "MDS->OSC failure during precreate cleanup (2824)"
1123 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1124 [ "$OSTCOUNT" -lt "2" ] && skip_env "needs >= 2 OSTs" && return
1126 replay_barrier $SINGLEMDS
1127 createmany -o $DIR/$tfile 20 ||
1128 error "createmany -o $DIR/$tfile failed"
1129 # OBD_FAIL_OST_EROFS 0x216
1130 facet_failover $SINGLEMDS
1131 do_facet ost1 "lctl set_param fail_loc=0x80000216"
1132 client_up || error "client_up failed"
1134 # let the MDS discover the OST failure, attempt to recover, fail
1135 # and recover again.
1136 sleep $((3 * TIMEOUT))
1138 createmany -o $DIR/$tfile 20 20 ||
1139 error "createmany recraete $DIR/$tfile failed"
1140 unlinkmany $DIR/$tfile 40 || error "unlinkmany $DIR/$tfile failed"
1143 run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
1146 local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $ost1_svc)
1147 local oscdev=$(do_facet $SINGLEMDS "lctl get_param -n devices" |
1148 grep $mdtosc | awk '{print $1}')
1149 [ "$oscdev" ] || error "could not find OSC device on MDS"
1150 do_facet $SINGLEMDS $LCTL --device $oscdev recover ||
1151 error "OSC device $oscdev recovery failed"
1152 do_facet $SINGLEMDS $LCTL --device $oscdev recover ||
1153 error "second OSC device $oscdev recovery failed"
1154 # give the mds_lov_sync threads a chance to run
1157 run_test 50 "Double OSC recovery, don't LASSERT (3812)"
1159 # b3764 timed out lock replay
1161 [ "$MDS1_VERSION" -lt $(version_code 2.6.90) ] &&
1162 skip "MDS prior to 2.6.90 handle LDLM_REPLY_NET incorrectly"
1164 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
1165 cancel_lru_locks mdc
1167 multiop_bg_pause $DIR/$tfile s_s || error "multiop $DIR/$tfile failed"
1170 #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
1171 lctl set_param -n ldlm.cancel_unused_locks_before_replay "0"
1172 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000157"
1174 fail $SINGLEMDS || error "fail $SINGLEMDS failed"
1176 wait $mpid || error "multiop_bg_pause pid failed"
1178 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1179 lctl set_param fail_loc=0x0
1180 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
1183 run_test 52 "time out lock replay (3764)"
1185 # bug 3462 - simultaneous MDC requests
1187 [[ $(lctl get_param mdc.*.import |
1188 grep "connect_flags:.*multi_mod_rpc") ]] ||
1189 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
1191 cancel_lru_locks mdc # cleanup locks from former test cases
1192 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1193 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1194 multiop $DIR/${tdir}-1/f O_c &
1196 # give multiop a chance to open
1199 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1200 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
1201 kill -USR1 $close_pid
1202 cancel_lru_locks mdc # force the close
1203 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1205 mcreate $DIR/${tdir}-2/f || error "mcreate $DIR/${tdir}-2/f failed"
1207 # close should still be here
1208 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1210 replay_barrier_nodf $SINGLEMDS
1212 wait $close_pid || error "close_pid $close_pid failed"
1214 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1215 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1216 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1217 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1218 rm -rf $DIR/${tdir}-*
1220 run_test 53a "|X| close request while two MDC requests in flight"
1223 cancel_lru_locks mdc # cleanup locks from former test cases
1225 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1226 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1227 multiop_bg_pause $DIR/${tdir}-1/f O_c ||
1228 error "multiop_bg_pause $DIR/${tdir}-1/f failed"
1231 #define OBD_FAIL_MDS_REINT_NET 0x107
1232 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000107"
1233 mcreate $DIR/${tdir}-2/f &
1237 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1238 kill -USR1 $close_pid
1239 cancel_lru_locks mdc # force the close
1240 wait $close_pid || error "close_pid $close_pid failed"
1241 # open should still be here
1242 [ -d /proc/$open_pid ] || error "open_pid doesn't exist"
1244 replay_barrier_nodf $SINGLEMDS
1246 wait $open_pid || error "open_pid failed"
1248 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1249 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1250 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1251 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1252 rm -rf $DIR/${tdir}-*
1254 run_test 53b "|X| open request while two MDC requests in flight"
1257 cancel_lru_locks mdc # cleanup locks from former test cases
1259 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1260 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1261 multiop $DIR/${tdir}-1/f O_c &
1264 #define OBD_FAIL_MDS_REINT_NET 0x107
1265 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000107"
1266 mcreate $DIR/${tdir}-2/f &
1270 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1271 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
1272 kill -USR1 $close_pid
1273 cancel_lru_locks mdc # force the close
1275 #bz20647: make sure all pids exist before failover
1276 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1277 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1278 replay_barrier_nodf $SINGLEMDS
1279 fail_nodf $SINGLEMDS
1280 wait $open_pid || error "open_pid failed"
1282 # close should be gone
1283 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1284 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1286 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1287 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1288 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1289 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1290 rm -rf $DIR/${tdir}-*
1292 run_test 53c "|X| open request and close request while two MDC requests in flight"
1295 [[ $(lctl get_param mdc.*.import |
1296 grep "connect_flags:.*multi_mod_rpc") ]] ||
1297 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
1299 cancel_lru_locks mdc # cleanup locks from former test cases
1301 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1302 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1303 multiop $DIR/${tdir}-1/f O_c &
1305 # give multiop a chance to open
1308 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1309 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013b"
1310 kill -USR1 $close_pid
1311 cancel_lru_locks mdc # force the close
1312 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1313 mcreate $DIR/${tdir}-2/f || error "mcreate $DIR/${tdir}-2/f failed"
1315 # close should still be here
1316 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1318 wait $close_pid || error "close_pid failed"
1320 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1321 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1322 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1323 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1324 rm -rf $DIR/${tdir}-*
1326 run_test 53d "close reply while two MDC requests in flight"
1329 cancel_lru_locks mdc # cleanup locks from former test cases
1331 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1332 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1333 multiop $DIR/${tdir}-1/f O_c &
1336 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1337 do_facet $SINGLEMDS "lctl set_param fail_loc=0x119"
1338 mcreate $DIR/${tdir}-2/f &
1342 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1343 kill -USR1 $close_pid
1344 cancel_lru_locks mdc # force the close
1345 wait $close_pid || error "close_pid failed"
1346 # open should still be here
1347 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1349 replay_barrier_nodf $SINGLEMDS
1351 wait $open_pid || error "open_pid failed"
1353 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1354 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1355 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1356 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1357 rm -rf $DIR/${tdir}-*
1359 run_test 53e "|X| open reply while two MDC requests in flight"
1362 cancel_lru_locks mdc # cleanup locks from former test cases
1364 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1365 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1366 multiop $DIR/${tdir}-1/f O_c &
1369 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1370 do_facet $SINGLEMDS "lctl set_param fail_loc=0x119"
1371 mcreate $DIR/${tdir}-2/f &
1375 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1376 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013b"
1377 kill -USR1 $close_pid
1378 cancel_lru_locks mdc # force the close
1380 #bz20647: make sure all pids exist before failover
1381 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1382 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1383 replay_barrier_nodf $SINGLEMDS
1384 fail_nodf $SINGLEMDS
1385 wait $open_pid || error "open_pid failed"
1387 # close should be gone
1388 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1389 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1391 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1392 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1393 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1394 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1395 rm -rf $DIR/${tdir}-*
1397 run_test 53f "|X| open reply and close reply while two MDC requests in flight"
1400 cancel_lru_locks mdc # cleanup locks from former test cases
1402 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1403 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1404 multiop $DIR/${tdir}-1/f O_c &
1407 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1408 do_facet $SINGLEMDS "lctl set_param fail_loc=0x119"
1409 mcreate $DIR/${tdir}-2/f &
1413 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1414 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000115"
1415 kill -USR1 $close_pid
1416 cancel_lru_locks mdc # force the close
1417 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1419 #bz20647: make sure all pids exist before failover
1420 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1421 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1422 replay_barrier_nodf $SINGLEMDS
1423 fail_nodf $SINGLEMDS
1424 wait $open_pid || error "open_pid failed"
1426 # close should be gone
1427 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1429 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1430 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1431 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1432 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1433 rm -rf $DIR/${tdir}-*
1435 run_test 53g "|X| drop open reply and close request while close and open are both in flight"
1438 cancel_lru_locks mdc # cleanup locks from former test cases
1440 mkdir_on_mdt0 $DIR/${tdir}-1 || error "mkdir $DIR/${tdir}-1 failed"
1441 mkdir_on_mdt0 $DIR/${tdir}-2 || error "mkdir $DIR/${tdir}-2 failed"
1442 multiop $DIR/${tdir}-1/f O_c &
1445 #define OBD_FAIL_MDS_REINT_NET 0x107
1446 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000107"
1447 mcreate $DIR/${tdir}-2/f &
1451 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1452 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013b"
1453 kill -USR1 $close_pid
1454 cancel_lru_locks mdc # force the close
1457 #bz20647: make sure all pids exist before failover
1458 [ -d /proc/$close_pid ] || error "close_pid doesn't exist"
1459 [ -d /proc/$open_pid ] || error "open_pid doesn't exists"
1460 replay_barrier_nodf $SINGLEMDS
1461 fail_nodf $SINGLEMDS
1462 wait $open_pid || error "open_pid failed"
1464 # close should be gone
1465 [ -d /proc/$close_pid ] && error "close_pid should not exist"
1466 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1468 $CHECKSTAT -t file $DIR/${tdir}-1/f ||
1469 error "$CHECKSTAT $DIR/${tdir}-1/f attribute check failed"
1470 $CHECKSTAT -t file $DIR/${tdir}-2/f ||
1471 error "$CHECKSTAT $DIR/${tdir}-2/f attribute check failed"
1472 rm -rf $DIR/${tdir}-*
1474 run_test 53h "open request and close reply while two MDC requests in flight"
1476 #b3761 ASSERTION(hash != 0) failed
1478 # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
1479 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012b"
1481 # give touch a chance to run
1483 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1487 run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
1489 #b3440 ASSERTION(rec->ur_fid2->id) failed
1491 ln -s foo $DIR/$tfile
1492 replay_barrier $SINGLEMDS
1493 #drop_reply "cat $DIR/$tfile"
1497 run_test 56 "don't replay a symlink open request (3440)"
1499 #recover one mds-ost setattr from llog
1501 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1502 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012c"
1503 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
1504 replay_barrier $SINGLEMDS
1506 wait_recovery_complete $SINGLEMDS || error "MDS recovery is not done"
1507 wait_mds_ost_sync || error "wait_mds_ost_sync failed"
1508 $CHECKSTAT -t file $DIR/$tfile ||
1509 error "$CHECKSTAT $DIR/$tfile attribute check failed"
1510 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1513 run_test 57 "test recovery from llog for setattr op"
1516 zconf_umount $(hostname) $MOUNT2
1520 #recover many mds-ost setattrs from llog
1522 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1523 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1524 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000012c"
1525 createmany -o $DIR/$tdir/$tfile-%d 2500
1526 replay_barrier $SINGLEMDS
1529 $CHECKSTAT -t file $DIR/$tdir/$tfile-* >/dev/null ||
1530 error "$CHECKSTAT $DIR/$tfile-* attribute check failed"
1531 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
1532 unlinkmany $DIR/$tdir/$tfile-%d 2500 ||
1533 error "unlinkmany $DIR/$tfile failed"
1536 run_test 58a "test recovery from llog for setattr op (test llog_gen_rec)"
1542 trap cleanup_58 EXIT
1544 large_xattr_enabled &&
1545 orig="$(generate_string $(max_xattr_size))" || orig="bar"
1546 # The original extended attribute can be long. Print a truncated version
1547 # of the attribute if an error occurs.
1548 local sm_msg=$(printf "%.9s" $orig)
1550 mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed"
1551 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1552 touch $DIR/$tdir/$tfile || error "touch $DIR/$tdir/$tfile failed"
1553 replay_barrier $SINGLEMDS
1554 setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile
1556 new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile)
1557 [[ "$new" = "$orig" ]] ||
1558 error "xattr set ($sm_msg...) differs from xattr get ($new)"
1559 rm -f $DIR/$tdir/$tfile
1562 wait_clients_import_state ${CLIENTS:-$HOSTNAME} "mgs" FULL
1564 run_test 58b "test replay of setxattr op"
1566 test_58c() { # bug 16570
1571 trap cleanup_58 EXIT
1573 if large_xattr_enabled; then
1574 local xattr_size=$(max_xattr_size)
1575 orig="$(generate_string $((xattr_size / 2)))"
1576 orig1="$(generate_string $xattr_size)"
1582 # PING_INTERVAL max(obd_timeout / 4, 1U)
1583 sleep $((TIMEOUT / 4))
1585 # The original extended attributes can be long. Print a truncated version
1586 # of each attribute if an error occurs.
1587 local sm_msg=$(printf "%.9s" $orig)
1588 local sm_msg1=$(printf "%.9s" $orig1)
1590 mount_client $MOUNT2 || error "mount_client on $MOUNT2 failed"
1591 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1592 touch $DIR/$tdir/$tfile || error "touch $DIR/$tdir/$tfile failed"
1593 drop_request "setfattr -n trusted.foo -v $orig $DIR/$tdir/$tfile" ||
1594 error "drop_request for setfattr failed"
1595 new=$(get_xattr_value trusted.foo $MOUNT2/$tdir/$tfile)
1596 [[ "$new" = "$orig" ]] ||
1597 error "xattr set ($sm_msg...) differs from xattr get ($new)"
1598 drop_reint_reply "setfattr -n trusted.foo1 \
1599 -v $orig1 $DIR/$tdir/$tfile" ||
1600 error "drop_reint_reply for setfattr failed"
1601 new=$(get_xattr_value trusted.foo1 $MOUNT2/$tdir/$tfile)
1602 [[ "$new" = "$orig1" ]] ||
1603 error "second xattr set ($sm_msg1...) differs xattr get ($new)"
1604 rm -f $DIR/$tdir/$tfile
1608 run_test 58c "resend/reconstruct setxattr op"
1610 # log_commit_thread vs filter_destroy race used to lead to import use after free
1613 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1615 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1616 createmany -o $DIR/$tdir/$tfile-%d 200 ||
1617 error "createmany create files failed"
1619 unlinkmany $DIR/$tdir/$tfile-%d 200 ||
1620 error "unlinkmany $DIR/$tdir/$tfile failed"
1621 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
1622 do_facet ost1 "lctl set_param fail_loc=0x507"
1625 do_facet ost1 "lctl set_param fail_loc=0x0"
1629 run_test 59 "test log_commit_thread vs filter_destroy race"
1631 # race between add unlink llog vs cat log init in post_recovery (only for b1_6)
1632 # bug 12086: there should be no oops and no "No ctxt" error in this test
1634 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1635 createmany -o $DIR/$tdir/$tfile-%d 200 ||
1636 error "createmany create files failed"
1637 replay_barrier $SINGLEMDS
1638 unlinkmany $DIR/$tdir/$tfile-%d 0 100
1640 unlinkmany $DIR/$tdir/$tfile-%d 100 100
1641 local no_ctxt=$(dmesg | grep "No ctxt")
1642 [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
1644 run_test 60 "test llog post recovery init vs llog unlink"
1646 #test race llog recovery thread vs llog cleanup
1647 test_61a() { # was test_61
1648 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1650 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1651 createmany -o $DIR/$tdir/$tfile-%d 800 ||
1652 error "createmany create files failed"
1654 unlinkmany $DIR/$tdir/$tfile-%d 800
1655 # OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
1656 set_nodes_failloc "$(osts_nodes)" 0x80000221
1661 set_nodes_failloc "$(osts_nodes)" 0x0
1663 $CHECKSTAT -t file $DIR/$tdir/$tfile-* &&
1664 error "$CHECKSTAT $DIR/$tdir/$tfile attribute check should fail"
1667 run_test 61a "test race llog recovery vs llog cleanup"
1669 #test race mds llog sync vs llog cleanup
1671 # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
1672 do_facet $SINGLEMDS "lctl set_param fail_loc=0x8000013a"
1673 facet_failover $SINGLEMDS
1676 do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 ||
1679 run_test 61b "test race mds llog sync vs llog cleanup"
1681 #test race cancel cookie cb vs llog cleanup
1683 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1685 # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
1686 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
1687 set_nodes_failloc "$(osts_nodes)" 0x80000222
1691 set_nodes_failloc "$(osts_nodes)" 0x0
# The description names the race this test injects
# (OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT: cancel cookie callback vs llog cleanup)
# instead of repeating the description text used for test 61b.
1693 run_test 61c "test race cancel cookie cb vs llog cleanup"
1695 test_61d() { # bug 16002 # bug 17466 # bug 22137
1696 # OBD_FAIL_OBD_LLOG_SETUP 0x605
1698 do_facet mgs "lctl set_param fail_loc=0x80000605"
1699 start mgs $(mgsdevname) $MGS_MOUNT_OPTS &&
1700 error "mgs start should have failed"
1701 do_facet mgs "lctl set_param fail_loc=0"
1702 start mgs $(mgsdevname) $MGS_MOUNT_OPTS || error "cannot restart mgs"
1704 run_test 61d "error in llog_setup should cleanup the llog context correctly"
1706 test_62() { # Bug 15756 - don't mis-drop resent replay
1707 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1708 replay_barrier $SINGLEMDS
1709 createmany -o $DIR/$tdir/$tfile- 25 ||
1710 error "createmany create files failed"
1711 #define OBD_FAIL_TGT_REPLAY_DROP 0x707
1712 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707"
1714 do_facet $SINGLEMDS "lctl set_param fail_loc=0"
1715 unlinkmany $DIR/$tdir/$tfile- 25 ||
1716 error "unlinkmany $DIR/$tdir/$tfile failed"
1719 run_test 62 "don't mis-drop resent replay"
1721 #Adaptive Timeouts (bug 3055)
1729 echo "Cleaning up AT ..."
1730 if [ -n "$ATOLDBASE" ]; then
1731 local at_history=$($LCTL get_param -n at_history)
1732 do_facet $SINGLEMDS "lctl set_param at_history=$at_history" || true
1733 do_facet ost1 "lctl set_param at_history=$at_history" || true
1736 if [ $AT_MAX_SET -ne 0 ]; then
1737 for facet in mds client ost; do
1738 var=AT_MAX_SAVE_${facet}
1739 echo restore AT on $facet to saved value ${!var}
1740 at_max_set ${!var} $facet
1741 at_new=$(at_max_get $facet)
1742 echo Restored AT value on $facet $at_new
1743 [ $at_new -eq ${!var} ] ||
1744 error "AT value not restored SAVED ${!var} NEW $at_new"
1751 local at_max_new=600
1753 # Save at_max original values
1755 if [ $AT_MAX_SET -eq 0 ]; then
1756 # Assume that all OSTs have the same at_max
1757 for facet in mds client ost; do
1758 eval AT_MAX_SAVE_${facet}=$(at_max_get $facet)
1762 for facet in mds client ost; do
1763 at_max=$(at_max_get $facet)
1764 if [ $at_max -ne $at_max_new ]; then
1765 echo "AT value on $facet is $at_max, set it by force temporarily to $at_max_new"
1766 at_max_set $at_max_new $facet
1771 if [ -z "$ATOLDBASE" ]; then
1772 ATOLDBASE=$(do_facet $SINGLEMDS "lctl get_param -n at_history")
1773 # speed up the timebase so we can check decreasing AT
1774 do_facet $SINGLEMDS "lctl set_param at_history=8" || true
1775 do_facet ost1 "lctl set_param at_history=8" || true
1777 # sleep for a while to cool down, should be > 8s and also allow
1778 # at least one ping to be sent. simply use TIMEOUT to be safe.
1783 test_65a() #bug 3055
1785 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1787 at_start || return 0
1788 $LCTL dk > /dev/null
1790 $LCTL set_param debug="other"
1791 # Slow down a request to the current service time; this is critical
1792 # because previous tests may have caused this value to increase.
1793 REQ_DELAY=`lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts |
1794 awk '/portal 12/ {print $5}'`
1795 REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
1797 do_facet $SINGLEMDS lctl set_param fail_val=$((${REQ_DELAY} * 1000))
1798 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1799 do_facet $SINGLEMDS $LCTL set_param fail_loc=0x8000050a
1800 createmany -o $DIR/$tfile 10 > /dev/null
1801 unlinkmany $DIR/$tfile 10 > /dev/null
1802 # check for log message
1803 $LCTL dk | grep -i "Early reply #" || error "No early reply"
1805 # client should show REQ_DELAY estimates
1806 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal
1808 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal
1810 run_test 65a "AT: verify early replies"
1812 test_65b() #bug 3055
1814 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1816 at_start || return 0
1819 $LCTL set_param debug="other trace"
1820 $LCTL dk > /dev/null
1821 # Slow down a request to the current service time; this is critical
1822 # because previous tests may have caused this value to increase.
1823 $LFS setstripe --stripe-index=0 --stripe-count=1 $DIR/$tfile ||
1824 error "$LFS setstripe failed for $DIR/$tfile"
1826 multiop $DIR/$tfile Ow1yc
1827 REQ_DELAY=`lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts |
1828 awk '/portal 6/ {print $5}'`
1829 REQ_DELAY=$((${REQ_DELAY} + ${REQ_DELAY} / 4 + 5))
1831 do_facet ost1 lctl set_param fail_val=${REQ_DELAY}
1832 #define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
1833 do_facet ost1 $LCTL set_param fail_loc=0x224
1836 $LFS setstripe --stripe-index=0 --stripe-count=1 $DIR/$tfile ||
1837 error "$LFS setstripe failed"
1838 # force some real bulk transfer
1839 multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
1841 do_facet ost1 $LCTL set_param fail_loc=0
1842 # check for log message
1843 $LCTL dk | grep -i "Early reply #" || error "No early reply"
1845 # client should show REQ_DELAY estimates
1846 lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts | grep portal
1848 run_test 65b "AT: verify early replies on packed reply / bulk"
1850 test_66a() #bug 3055
1852 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1854 at_start || return 0
1855 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1856 # adjust 5s at a time so no early reply is sent (within deadline)
1857 do_facet $SINGLEMDS "$LCTL set_param fail_val=5000"
1858 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1859 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000050a"
1860 createmany -o $DIR/$tfile 20 > /dev/null
1861 unlinkmany $DIR/$tfile 20 > /dev/null
1862 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1863 do_facet $SINGLEMDS "$LCTL set_param fail_val=10000"
1864 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0x8000050a"
1865 createmany -o $DIR/$tfile 20 > /dev/null
1866 unlinkmany $DIR/$tfile 20 > /dev/null
1867 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1868 do_facet $SINGLEMDS "$LCTL set_param fail_loc=0"
1870 createmany -o $DIR/$tfile 20 > /dev/null
1871 unlinkmany $DIR/$tfile 20 > /dev/null
1872 lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12"
1873 CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $5}')
1874 WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $7}')
1875 echo "Current MDT timeout $CUR, worst $WORST"
1876 [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
1878 run_test 66a "AT: verify MDT service time adjusts with no early replies"
1880 test_66b() #bug 3055
1882 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1884 at_start || return 0
1885 ORIG=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
1886 awk '/network/ {print $4}')
1887 $LCTL set_param fail_val=$(($ORIG + 5))
1888 #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
1889 $LCTL set_param fail_loc=0x50c
1890 touch $DIR/$tfile > /dev/null 2>&1
1891 $LCTL set_param fail_loc=0
1892 CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
1893 awk '/network/ {print $4}')
1894 WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000*.timeouts |
1895 awk '/network/ {print $6}')
1896 echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
1897 [ $WORST -gt $ORIG ] ||
1898 error "Worst $WORST should be worse than orig $ORIG"
1900 run_test 66b "AT: verify net latency adjusts"
1902 test_67a() #bug 3055
1904 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1906 at_start || return 0
1907 CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1908 # sleeping threads may drive values above this
1909 do_facet ost1 "$LCTL set_param fail_val=400"
1910 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1911 do_facet ost1 "$LCTL set_param fail_loc=0x50a"
1912 createmany -o $DIR/$tfile 20 > /dev/null
1913 unlinkmany $DIR/$tfile 20 > /dev/null
1914 do_facet ost1 "$LCTL set_param fail_loc=0"
1915 CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1916 ATTEMPTS=$(($CONN2 - $CONN1))
1917 echo "$ATTEMPTS osc reconnect attempts on gradual slow"
1918 [ $ATTEMPTS -gt 0 ] &&
1919 error_ignore bz13721 "AT should have prevented reconnect"
1922 run_test 67a "AT: verify slow request processing doesn't induce reconnects"
1924 test_67b() #bug 3055
1926 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1928 at_start || return 0
1929 CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1931 # exhaust precreations on ost1
1932 local OST=$(ostname_from_index 0)
1933 local mdtosc=$(get_mdtosc_proc_path mds $OST)
1934 local last_id=$(do_facet $SINGLEMDS lctl get_param -n \
1935 osp.$mdtosc.prealloc_last_id)
1936 local next_id=$(do_facet $SINGLEMDS lctl get_param -n \
1937 osp.$mdtosc.prealloc_next_id)
1939 mkdir -p $DIR/$tdir/${OST} || error "mkdir $DIR/$tdir/${OST} failed"
1940 $LFS setstripe -i 0 -c 1 $DIR/$tdir/${OST} ||
1941 error "$LFS setstripe failed"
1942 echo "Creating to objid $last_id on ost $OST..."
1943 #define OBD_FAIL_OST_PAUSE_CREATE 0x223
1944 do_facet ost1 "$LCTL set_param fail_val=20000"
1945 do_facet ost1 "$LCTL set_param fail_loc=0x80000223"
1946 createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2))
1949 do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
1951 CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1952 ATTEMPTS=$(($CONN2 - $CONN1))
1953 echo "$ATTEMPTS osc reconnect attempts on instant slow"
1954 # do it again; should not time out
1955 do_facet ost1 "$LCTL set_param fail_loc=0x80000223"
1956 cp /etc/profile $DIR/$tfile || error "cp failed"
1957 do_facet ost1 "$LCTL set_param fail_loc=0"
1959 do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts"
1960 CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}')
1961 ATTEMPTS=$(($CONN3 - $CONN2))
1962 echo "$ATTEMPTS osc reconnect attempts on 2nd slow"
1963 [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
1966 run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
1968 test_68 () #bug 13813
1970 remote_ost_nodsh && skip "remote OST with nodsh" && return 0
1972 at_start || return 0
1973 local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
1974 [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
1975 local ldlm_enqueue_min_r=$(do_facet ost1 "find /sys -name ldlm_enqueue_min")
1976 [ -z "$ldlm_enqueue_min_r" ] && skip "missing /sys/.../ldlm_enqueue_min in the ost1" && return 0
1977 local ENQ_MIN=$(cat $ldlm_enqueue_min)
1978 local ENQ_MIN_R=$(do_facet ost1 "cat $ldlm_enqueue_min_r")
1979 echo $TIMEOUT >> $ldlm_enqueue_min
1980 do_facet ost1 "echo $TIMEOUT >> $ldlm_enqueue_min_r"
1982 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
1983 $LFS setstripe --stripe-index=0 -c 1 $DIR/$tdir ||
1984 error "$LFS setstripe failed for $DIR/$tdir"
1985 #define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
1986 $LCTL set_param fail_val=$(($TIMEOUT - 1))
1987 $LCTL set_param fail_loc=0x80000312
1988 cp /etc/profile $DIR/$tdir/${tfile}_1 || error "1st cp failed $?"
1989 $LCTL set_param fail_val=$((TIMEOUT * 5 / 4))
1990 $LCTL set_param fail_loc=0x80000312
1991 cp /etc/profile $DIR/$tdir/${tfile}_2 || error "2nd cp failed $?"
1992 $LCTL set_param fail_loc=0
1994 echo $ENQ_MIN >> $ldlm_enqueue_min
1995 do_facet ost1 "echo $ENQ_MIN_R >> $ldlm_enqueue_min_r"
1999 run_test 68 "AT: verify slowing locks"
2002 # end of AT tests; they include all of the lines above
2004 # start multi-client tests
2006 [ $CLIENTCOUNT -lt 2 ] &&
2007 { skip "Need two or more clients, have $CLIENTCOUNT" && return; }
2009 echo "mount clients $CLIENTS ..."
2010 zconf_mount_clients $CLIENTS $MOUNT
2012 local clients=${CLIENTS//,/ }
2013 echo "Write/read files on $DIR ; clients $CLIENTS ... "
2014 for CLIENT in $clients; do
2015 do_node $CLIENT dd bs=1M count=10 if=/dev/zero \
2016 of=$DIR/${tfile}_${CLIENT} 2>/dev/null ||
2017 error "dd failed on $CLIENT"
2020 local prev_client=$(echo $clients | sed 's/^.* \(.\+\)$/\1/')
2021 for C in ${CLIENTS//,/ }; do
2022 do_node $prev_client dd if=$DIR/${tfile}_${C} \
2023 of=/dev/null 2>/dev/null ||
2024 error "dd if=$DIR/${tfile}_${C} failed on $prev_client"
2030 run_test 70a "check multi client t-f"
2032 check_for_process () {
2037 killall_process $clients "$prog" -0
2041 local clients=${CLIENTS:-$HOSTNAME}
2043 zconf_mount_clients $clients $MOUNT
2046 [ "$SLOW" = "no" ] && duration=120
2047 # set duration to 900 because it takes some time to boot node
2048 [ "$FAILURE_MODE" = HARD ] && duration=900
2051 local start_ts=$(date +%s)
2052 local cmd="rundbench 1 -t $duration"
2054 if [ $MDSCOUNT -ge 2 ]; then
2055 test_mkdir -p -c$MDSCOUNT $DIR/$tdir
2056 $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir
2058 do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \
2059 PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \
2060 DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \
2061 MOUNT=$MOUNT DIR=$DIR/$tdir/\\\$(hostname) LCTL=$LCTL $cmd" &
2064 #LU-1897 wait for all dbench copies to start
2065 while ! check_for_process $clients dbench; do
2066 elapsed=$(($(date +%s) - start_ts))
2067 if [ $elapsed -gt $duration ]; then
2068 killall_process $clients dbench
2069 error "dbench failed to start on $clients!"
2074 log "Started rundbench load pid=$pid ..."
2076 elapsed=$(($(date +%s) - start_ts))
2077 local num_failovers=0
2079 while [ $elapsed -lt $duration ]; do
2080 if ! check_for_process $clients dbench; then
2081 error_noexit "dbench stopped on some of $clients!"
2082 killall_process $clients dbench
2086 replay_barrier mds$fail_index
2087 sleep 1 # give clients a time to do operations
2088 # Increment the number of failovers
2089 num_failovers=$((num_failovers+1))
2090 log "$TESTNAME fail mds$fail_index $num_failovers times"
2092 elapsed=$(($(date +%s) - start_ts))
2093 if [ $fail_index -ge $MDSCOUNT ]; then
2096 fail_index=$((fail_index+1))
2100 wait $pid || error "rundbench load on $clients failed!"
2102 run_test 70b "dbench ${MDSCOUNT}mdts recovery; $CLIENTCOUNT clients"
2103 # end multi-client tests
2108 local monitor_pid=$3
2110 local start_ts=$(date +%s)
2111 local num_failovers=0
2114 elapsed=$(($(date +%s) - start_ts))
2115 while [ $elapsed -lt $duration ]; do
2116 fail_index=$((RANDOM%max_index+1))
2117 kill -0 $monitor_pid ||
2118 error "$monitor_pid stopped"
2120 replay_barrier mds$fail_index
2122 # Increment the number of failovers
2123 num_failovers=$((num_failovers+1))
2124 log "$TESTNAME fail mds$fail_index $num_failovers times"
2126 elapsed=$(($(date +%s) - start_ts))
2132 rm -f $DIR/replay-single.70c.lck
2137 local clients=${CLIENTS:-$HOSTNAME}
2140 zconf_mount_clients $clients $MOUNT
2143 [ "$SLOW" = "no" ] && duration=180
2144 # set duration to 600 because it takes some time to boot node
2145 [ "$FAILURE_MODE" = HARD ] && duration=600
2148 local start_ts=$(date +%s)
2150 trap cleanup_70c EXIT
2152 while [ ! -e $DIR/replay-single.70c.lck ]; do
2153 test_mkdir -p -c$MDSCOUNT $DIR/$tdir || break
2154 if [ $MDSCOUNT -ge 2 ]; then
2155 $LFS setdirstripe -D -c$MDSCOUNT $DIR/$tdir ||
2156 error "set default dirstripe failed"
2158 cd $DIR/$tdir || break
2159 tar cf - /etc | tar xf - || error "tar failed in loop"
2163 echo "Started tar $tar_70c_pid"
2165 random_fail_mdt $MDSCOUNT $duration $tar_70c_pid
2166 kill -0 $tar_70c_pid || error "tar $tar_70c_pid stopped"
2168 touch $DIR/replay-single.70c.lck
2169 wait $tar_70c_pid || error "$?: tar failed"
2174 run_test 70c "tar ${MDSCOUNT}mdts recovery"
2178 kill -9 $mkdir_70d_pid
2182 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2183 local clients=${CLIENTS:-$HOSTNAME}
2186 zconf_mount_clients $clients $MOUNT
2189 [ "$SLOW" = "no" ] && duration=180
2190 # set duration to 900 because it takes some time to boot node
2191 [ "$FAILURE_MODE" = HARD ] && duration=900
2196 local start_ts=$(date +%s)
2198 trap cleanup_70d EXIT
2201 $LFS mkdir -i0 -c2 $DIR/$tdir/test || {
2205 $LFS mkdir -i1 -c2 $DIR/$tdir/test1 || {
2210 touch $DIR/$tdir/test/a || {
2214 mkdir $DIR/$tdir/test/b || {
2218 rm -rf $DIR/$tdir/test || {
2224 touch $DIR/$tdir/test1/a || {
2228 mkdir $DIR/$tdir/test1/b || {
2233 rm -rf $DIR/$tdir/test1 || {
2235 ls -lR $DIR/$tdir/test1
2241 echo "Started $mkdir_70d_pid"
2243 random_fail_mdt $MDSCOUNT $duration $mkdir_70d_pid
2244 kill -0 $mkdir_70d_pid || error "mkdir/rmdir $mkdir_70d_pid stopped"
2249 run_test 70d "mkdir/rmdir striped dir ${MDSCOUNT}mdts recovery"
2252 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2253 local clients=${CLIENTS:-$HOSTNAME}
2256 lctl set_param debug=+ha
2257 zconf_mount_clients $clients $MOUNT
2260 [ "$SLOW" = "no" ] && duration=180
2261 # set duration to 900 because it takes some time to boot node
2262 [ "$FAILURE_MODE" = HARD ] && duration=900
2265 $LFS mkdir -i0 $DIR/$tdir/test_0
2266 $LFS mkdir -i0 $DIR/$tdir/test_1
2267 touch $DIR/$tdir/test_0/a
2268 touch $DIR/$tdir/test_1/b
2271 mrename $DIR/$tdir/test_0/a $DIR/$tdir/test_1/b > /dev/null || {
2276 checkstat $DIR/$tdir/test_0/a && {
2277 echo "a still exists"
2281 checkstat $DIR/$tdir/test_1/b || {
2282 echo "b still exists"
2286 touch $DIR/$tdir/test_0/a || {
2287 echo "touch a fails"
2291 mrename $DIR/$tdir/test_1/b $DIR/$tdir/test_0/a > /dev/null || {
2298 stack_trap "kill -9 $rename_70e_pid" EXIT
2299 echo "Started PID=$rename_70e_pid"
2301 random_fail_mdt 2 $duration $rename_70e_pid
2302 kill -0 $rename_70e_pid || error "rename $rename_70e_pid stopped"
2304 run_test 70e "rename cross-MDT with random fails"
2306 test_70f_write_and_read(){
2311 echo "Write/read files in: '$DIR/$tdir', clients: '$CLIENTS' ..."
2312 for client in ${CLIENTS//,/ }; do
2313 [ -f $stopflag ] || return
2315 local tgtfile=$DIR/$tdir/$tfile.$client
2316 do_node $client dd $DD_OPTS bs=1M count=10 if=$srcfile \
2317 of=$tgtfile 2>/dev/null ||
2318 error "dd $DD_OPTS bs=1M count=10 if=$srcfile " \
2319 "of=$tgtfile failed on $client, rc=$?"
2322 local prev_client=$(echo ${CLIENTS//,/ } | awk '{ print $NF }')
2325 for client in ${CLIENTS//,/ }; do
2326 [ -f $stopflag ] || return
2328 # flush client cache in case test is running on only one client
2329 # do_node $client cancel_lru_locks osc
2330 do_node $client $LCTL set_param ldlm.namespaces.*.lru_size=clear
2332 tgtfile=$DIR/$tdir/$tfile.$client
2333 local md5=$(do_node $prev_client "md5sum $tgtfile")
2334 [ ${checksum[$index]// */} = ${md5// */} ] ||
2335 error "$tgtfile: checksum doesn't match on $prev_client"
2336 index=$((index + 1))
2346 mkdir -p $DIR/$tdir || error "cannot create $DIR/$tdir directory"
2347 $LFS setstripe -c -1 $DIR/$tdir ||
2348 error "cannot $LFS setstripe $DIR/$tdir"
2351 while [ -f $stopflag ]; do
2352 test_70f_write_and_read $srcfile $stopflag
2353 # alternate between direct IO and buffered IO on each loop iteration
2354 [ -n "$DD_OPTS" ] && DD_OPTS="" || DD_OPTS="oflag=direct"
2358 test_70f_cleanup() {
2360 rm -f $TMP/$tfile.stop
2361 do_nodes $CLIENTS rm -f $TMP/$tfile
2362 rm -f $DIR/$tdir/$tfile.*
2366 # [ x$ost1failover_HOST = x$ost_HOST ] &&
2367 # { skip "Failover host not defined" && return; }
2368 # [ $CLIENTCOUNT -lt 2 ] &&
2369 # { skip "Need 2 or more clients, have $CLIENTCOUNT" && return; }
2371 [[ "$OST1_VERSION" -lt $(version_code 2.9.53) ]] &&
2372 skip "Need server version at least 2.9.53"
2374 echo "mount clients $CLIENTS ..."
2375 zconf_mount_clients $CLIENTS $MOUNT
2377 local srcfile=$TMP/$tfile
2381 trap test_70f_cleanup EXIT
2382 # create a different source file local to each client node so we can
2383 # detect if the file wasn't written out properly after failover
2384 do_nodes $CLIENTS dd bs=1M count=10 if=/dev/urandom of=$srcfile \
2385 2>/dev/null || error "can't create $srcfile on $CLIENTS"
2386 for client in ${CLIENTS//,/ }; do
2387 checksum[$index]=$(do_node $client "md5sum $srcfile")
2388 index=$((index + 1))
2392 [ "$SLOW" = "no" ] && duration=60
2393 # set duration to 900 because it takes some time to boot node
2394 [ "$FAILURE_MODE" = HARD ] && duration=900
2396 local stopflag=$TMP/$tfile.stop
2397 test_70f_loop $srcfile $stopflag &
2401 local num_failovers=0
2402 local start_ts=$SECONDS
2403 while [ $elapsed -lt $duration ]; do
2407 num_failovers=$((num_failovers + 1))
2408 log "$TESTNAME failing OST $num_failovers times"
2411 elapsed=$((SECONDS - start_ts))
2418 run_test 70f "OSS O_DIRECT recovery with $CLIENTCOUNT clients"
2422 kill -9 $mkdir_71a_pid
2425 random_double_fail_mdt() {
2428 local monitor_pid=$3
2430 local start_ts=$(date +%s)
2431 local num_failovers=0
2435 elapsed=$(($(date +%s) - start_ts))
2436 while [ $elapsed -lt $duration ]; do
2437 fail_index=$((RANDOM%max_index + 1))
2438 if [ $fail_index -eq $max_index ]; then
2441 second_index=$((fail_index + 1))
2443 kill -0 $monitor_pid ||
2444 error "$monitor_pid stopped"
2446 replay_barrier mds$fail_index
2447 replay_barrier mds$second_index
2449 # Increment the number of failovers
2450 num_failovers=$((num_failovers+1))
2451 log "fail mds$fail_index mds$second_index $num_failovers times"
2452 fail mds${fail_index},mds${second_index}
2453 elapsed=$(($(date +%s) - start_ts))
2458 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2459 local clients=${CLIENTS:-$HOSTNAME}
2462 zconf_mount_clients $clients $MOUNT
2465 [ "$SLOW" = "no" ] && duration=180
2466 # set duration to 900 because it takes some time to boot node
2467 [ "$FAILURE_MODE" = HARD ] && duration=900
2469 mkdir_on_mdt0 $DIR/$tdir
2472 local start_ts=$(date +%s)
2474 trap cleanup_71a EXIT
2477 $LFS mkdir -i0 -c2 $DIR/$tdir/test
2478 rmdir $DIR/$tdir/test
2482 echo "Started $mkdir_71a_pid"
2484 random_double_fail_mdt 2 $duration $mkdir_71a_pid
2485 kill -0 $mkdir_71a_pid || error "mkdir/rmdir $mkdir_71a_pid stopped"
2490 run_test 71a "mkdir/rmdir striped dir with 2 mdts recovery"
2493 multiop_bg_pause $DIR/$tfile O_tSc ||
2494 error "multiop_bg_pause $DIR/$tfile failed"
2498 replay_barrier $SINGLEMDS
2499 #define OBD_FAIL_LDLM_ENQUEUE_NET 0x302
2500 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000302"
2503 wait $pid || error "multiop pid failed"
2504 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
2507 run_test 73a "open(O_CREAT), unlink, replay, reconnect before open replay, close"
2510 multiop_bg_pause $DIR/$tfile O_tSc ||
2511 error "multiop_bg_pause $DIR/$tfile failed"
2515 replay_barrier $SINGLEMDS
2516 #define OBD_FAIL_MDS_LDLM_REPLY_NET 0x157
2517 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000157"
2520 wait $pid || error "multiop pid failed"
2521 [ -e $DIR/$tfile ] && error "file $DIR/$tfile should not exist"
2524 run_test 73b "open(O_CREAT), unlink, replay, reconnect at open_replay reply, close"
2528 local clients=${CLIENTS:-$HOSTNAME}
2530 zconf_umount_clients $clients $MOUNT
2532 facet_failover $SINGLEMDS
2533 zconf_mount_clients $clients $MOUNT
2535 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
2536 rm $DIR/$tfile || error "rm $DIR/$tfile failed"
2537 clients_up || error "client evicted: $?"
2540 run_test 74 "Ensure applications don't fail waiting for OST recovery"
2542 remote_dir_check_80() {
2547 diridx=$($LFS getstripe -m $remote_dir) ||
2548 error "$LFS getstripe -m $remote_dir failed"
2549 [ $diridx -eq $mdtidx ] || error "$diridx != $mdtidx"
2551 createmany -o $remote_dir/f-%d 20 || error "creation failed"
2552 fileidx=$($LFS getstripe -m $remote_dir/f-1) ||
2553 error "$LFS getstripe -m $remote_dir/f-1 failed"
2554 [ $fileidx -eq $mdtidx ] || error "$fileidx != $mdtidx"
2560 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2561 ([ $FAILURE_MODE == "HARD" ] &&
2562 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2563 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2567 local remote_dir=$DIR/$tdir/remote_dir
2569 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2570 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2571 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2572 $LFS mkdir -i $MDTIDX $remote_dir &
2578 wait $CLIENT_PID || error "remote creation failed"
2580 remote_dir_check_80 || error "remote dir check failed"
2581 rm -rf $DIR/$tdir || error "rmdir failed"
2585 run_test 80a "DNE: create remote dir, drop update rep from MDT0, fail MDT0"
2588 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2589 ([ $FAILURE_MODE == "HARD" ] &&
2590 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2591 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2595 local remote_dir=$DIR/$tdir/remote_dir
2597 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2598 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2599 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2600 $LFS mkdir -i $MDTIDX $remote_dir &
2605 fail mds$((MDTIDX + 1))
2607 wait $CLIENT_PID || error "remote creation failed"
2609 remote_dir_check_80 || error "remote dir check failed"
2610 rm -rf $DIR/$tdir || error "rmdir failed"
2614 run_test 80b "DNE: create remote dir, drop update rep from MDT0, fail MDT1"
2617 [[ "$mds1_FSTYPE" = zfs ]] &&
2618 [[ $MDS1_VERSION -lt $(version_code 2.12.51) ]] &&
2619 skip "requires LU-10143 fix on MDS"
2620 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
2621 ([ $FAILURE_MODE == "HARD" ] &&
2622 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2623 skip "MDTs needs to be on diff hosts for HARD fail mode"
2626 local remote_dir=$DIR/$tdir/remote_dir
2628 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2629 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2630 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2631 $LFS mkdir -i $MDTIDX $remote_dir &
2637 fail mds$((MDTIDX + 1))
2639 wait $CLIENT_PID || error "remote creation failed"
2641 remote_dir_check_80 || error "remote dir check failed"
2642 rm -rf $DIR/$tdir || error "rmdir failed"
2646 run_test 80c "DNE: create remote dir, drop update rep from MDT1, fail MDT[0,1]"
2649 [[ "$mds1_FSTYPE" = zfs ]] &&
2650 [[ $MDS1_VERSION -lt $(version_code 2.12.51) ]] &&
2651 skip "requires LU-10143 fix on MDS"
2652 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs"
2654 local remote_dir=$DIR/$tdir/remote_dir
2656 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2657 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2658 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2659 $LFS mkdir -i $MDTIDX $remote_dir &
2662 # sleep 3 seconds to make sure MDTs are failed after
2663 # lfs mkdir -i has finished on all of MDTs.
2668 fail mds${MDTIDX},mds$((MDTIDX + 1))
2670 wait $CLIENT_PID || error "remote creation failed"
2672 remote_dir_check_80 || error "remote dir check failed"
2673 rm -rf $DIR/$tdir || error "rmdir failed"
2677 run_test 80d "DNE: create remote dir, drop update rep from MDT1, fail 2 MDTs"
2680 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2681 ([ $FAILURE_MODE == "HARD" ] &&
2682 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2683 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2687 local remote_dir=$DIR/$tdir/remote_dir
2689 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2690 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2691 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2692 $LFS mkdir -i $MDTIDX $remote_dir &
2695 # sleep 3 seconds to make sure MDTs are failed after
2696 # lfs mkdir -i has finished on all of MDTs.
2702 wait $CLIENT_PID || error "remote creation failed"
2704 remote_dir_check_80 || error "remote dir check failed"
2705 rm -rf $DIR/$tdir || error "rmdir failed"
2709 run_test 80e "DNE: create remote dir, drop MDT1 rep, fail MDT0"
2712 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2713 ([ $FAILURE_MODE == "HARD" ] &&
2714 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2715 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2718 local remote_dir=$DIR/$tdir/remote_dir
2720 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2721 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2722 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2723 $LFS mkdir -i $MDTIDX $remote_dir &
2727 fail mds$((MDTIDX + 1))
2729 wait $CLIENT_PID || error "remote creation failed"
2731 remote_dir_check_80 || error "remote dir check failed"
2732 rm -rf $DIR/$tdir || error "rmdir failed"
2736 run_test 80f "DNE: create remote dir, drop MDT1 rep, fail MDT1"
2739 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2740 ([ $FAILURE_MODE == "HARD" ] &&
2741 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2742 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2746 local remote_dir=$DIR/$tdir/remote_dir
2748 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2749 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2750 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2751 $LFS mkdir -i $MDTIDX $remote_dir &
2754 # sleep 3 seconds to make sure MDTs are failed after
2755 # lfs mkdir -i has finished on all of MDTs.
2761 fail mds$((MDTIDX + 1))
2763 wait $CLIENT_PID || error "remote creation failed"
2765 remote_dir_check_80 || error "remote dir check failed"
2766 rm -rf $DIR/$tdir || error "rmdir failed"
2770 run_test 80g "DNE: create remote dir, drop MDT1 rep, fail MDT0, then MDT1"
2773 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2775 local remote_dir=$DIR/$tdir/remote_dir
2777 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2778 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2779 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2780 $LFS mkdir -i $MDTIDX $remote_dir &
2783 # sleep 3 seconds to make sure MDTs are failed after
2784 # lfs mkdir -i has finished on all of MDTs.
2789 fail mds${MDTIDX},mds$((MDTIDX + 1))
2791 wait $CLIENT_PID || error "remote dir creation failed"
2793 remote_dir_check_80 || error "remote dir check failed"
2794 rm -rf $DIR/$tdir || error "rmdir failed"
2798 run_test 80h "DNE: create remote dir, drop MDT1 rep, fail 2 MDTs"
2801 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2802 ([ $FAILURE_MODE == "HARD" ] &&
2803 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2804 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2808 local remote_dir=$DIR/$tdir/remote_dir
2810 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2811 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2813 touch $remote_dir || error "touch $remote_dir failed"
2814 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2815 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2820 fail mds$((MDTIDX + 1))
2822 wait $CLIENT_PID || error "rm remote dir failed"
2824 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2826 rm -rf $DIR/$tdir || error "rmdir failed"
2830 run_test 81a "DNE: unlink remote dir, drop MDT0 update rep, fail MDT1"
2833 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2834 ([ $FAILURE_MODE == "HARD" ] &&
2835 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2836 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2839 local remote_dir=$DIR/$tdir/remote_dir
2841 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2842 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2844 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2845 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2852 wait $CLIENT_PID || error "rm remote dir failed"
2854 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2856 rm -rf $DIR/$tdir || error "rmdir failed"
2860 run_test 81b "DNE: unlink remote dir, drop MDT0 update reply, fail MDT0"
2863 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2864 ([ $FAILURE_MODE == "HARD" ] &&
2865 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2866 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2870 local remote_dir=$DIR/$tdir/remote_dir
2872 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2873 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2875 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2876 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2883 fail mds$((MDTIDX + 1))
2885 wait $CLIENT_PID || error "rm remote dir failed"
2887 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2889 rm -rf $DIR/$tdir || error "rmdir failed"
2893 run_test 81c "DNE: unlink remote dir, drop MDT0 update reply, fail MDT0,MDT1"
2896 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2898 local remote_dir=$DIR/$tdir/remote_dir
2900 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2901 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2903 # OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
2904 do_facet mds${MDTIDX} lctl set_param fail_loc=0x1701
2910 fail mds${MDTIDX},mds$((MDTIDX + 1))
2912 wait $CLIENT_PID || error "rm remote dir failed"
2914 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2916 rm -rf $DIR/$tdir || error "rmdir failed"
2920 run_test 81d "DNE: unlink remote dir, drop MDT0 update reply, fail 2 MDTs"
2923 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2924 ([ $FAILURE_MODE == "HARD" ] &&
2925 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2926 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2930 local remote_dir=$DIR/$tdir/remote_dir
2932 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2933 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2935 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2936 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2939 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0
2944 wait $CLIENT_PID || error "rm remote dir failed"
2946 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2948 rm -rf $DIR/$tdir || error "rmdir failed"
2952 run_test 81e "DNE: unlink remote dir, drop MDT1 req reply, fail MDT0"
2955 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2956 ([ $FAILURE_MODE == "HARD" ] &&
2957 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2958 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2962 local remote_dir=$DIR/$tdir/remote_dir
2964 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2965 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2967 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2968 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
2973 fail mds$((MDTIDX + 1))
2975 wait $CLIENT_PID || error "rm remote dir failed"
2977 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
2979 rm -rf $DIR/$tdir || error "rmdir failed"
2983 run_test 81f "DNE: unlink remote dir, drop MDT1 req reply, fail MDT1"
2986 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
2987 ([ $FAILURE_MODE == "HARD" ] &&
2988 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
2989 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
2993 local remote_dir=$DIR/$tdir/remote_dir
2995 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
2996 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
2998 # OBD_FAIL_MDS_REINT_NET_REP 0x119
2999 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
3006 fail mds$((MDTIDX + 1))
3008 wait $CLIENT_PID || error "rm remote dir failed"
3010 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
3012 rm -rf $DIR/$tdir || error "rmdir failed"
3016 run_test 81g "DNE: unlink remote dir, drop req reply, fail M0, then M1"
3019 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3021 local remote_dir=$DIR/$tdir/remote_dir
3023 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3024 $LFS mkdir -i $MDTIDX $remote_dir || error "lfs mkdir failed"
3026 # OBD_FAIL_MDS_REINT_NET_REP 0x119
3027 do_facet mds$((MDTIDX + 1)) lctl set_param fail_loc=0x119
3033 fail mds${MDTIDX},mds$((MDTIDX + 1))
3035 wait $CLIENT_PID || error "rm remote dir failed"
3037 stat $remote_dir &>/dev/null && error "$remote_dir still exist!"
3039 rm -rf $DIR/$tdir || error "rmdir failed"
3043 run_test 81h "DNE: unlink remote dir, drop request reply, fail 2 MDTs"
3046 #define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144
3047 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144"
3048 createmany -o $DIR/$tfile- 1 &
3052 client_up || client_up || true # reconnect
3054 run_test 84a "stale open during export disconnect"
3056 test_85a() { #bug 16774
3057 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
3059 for i in $(seq 100); do
3060 echo "tag-$i" > $DIR/$tfile-$i
3061 grep -q "tag-$i" $DIR/$tfile-$i || error "f2-$i"
3064 lov_id=$(lctl dl | grep "clilov")
3065 addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}')
3066 count=$(lctl get_param -n \
3067 ldlm.namespaces.*MDT0000*$addr.lock_unused_count)
3068 echo "before recovery: unused locks count = $count"
3072 count2=$(lctl get_param -n \
3073 ldlm.namespaces.*MDT0000*$addr.lock_unused_count)
3074 echo "after recovery: unused locks count = $count2"
3076 if [ $count2 -ge $count ]; then
3077 error "unused locks are not canceled"
3080 run_test 85a "check the cancellation of unused locks during recovery(IBITS)"
3082 test_85b() { #bug 16774
3086 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
3088 $LFS setstripe -c 1 -i 0 $DIR/$tdir
3090 for i in $(seq 100); do
3091 dd if=/dev/urandom of=$DIR/$tdir/$tfile-$i bs=4096 \
3092 count=32 >/dev/null 2>&1
3095 cancel_lru_locks osc
3097 for i in $(seq 100); do
3098 dd if=$DIR/$tdir/$tfile-$i of=/dev/null bs=4096 \
3099 count=32 >/dev/null 2>&1
3102 lov_id=$(lctl dl | grep "clilov")
3103 addr=$(echo $lov_id | awk '{print $4}' | awk -F '-' '{print $NF}')
3104 count=$(lctl get_param -n \
3105 ldlm.namespaces.*OST0000*$addr.lock_unused_count)
3106 echo "before recovery: unused locks count = $count"
3107 [ $count -ne 0 ] || error "unused locks ($count) should be zero"
3111 count2=$(lctl get_param \
3112 -n ldlm.namespaces.*OST0000*$addr.lock_unused_count)
3113 echo "after recovery: unused locks count = $count2"
3115 if [ $count2 -ge $count ]; then
3116 error "unused locks are not canceled"
3121 run_test 85b "check the cancellation of unused locks during recovery(EXTENT)"
3124 local clients=${CLIENTS:-$HOSTNAME}
3126 zconf_umount_clients $clients $MOUNT
3127 do_facet $SINGLEMDS lctl set_param mdt.${FSNAME}-MDT*.exports.clear=0
3128 remount_facet $SINGLEMDS
3129 zconf_mount_clients $clients $MOUNT
3131 run_test 86 "umount server after clear nid_stats should not hit LBUG"
3134 do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"
3137 $LFS setstripe -i 0 -c 1 $DIR/$tfile
3138 dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 ||
3139 error "dd to $DIR/$tfile failed"
3140 cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
3141 cancel_lru_locks osc
3143 dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
3144 cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
3145 if [ $cksum != $cksum2 ] ; then
3146 error "New checksum $cksum2 does not match original $cksum"
3149 run_test 87a "write replay"
3152 do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"
3155 $LFS setstripe -i 0 -c 1 $DIR/$tfile
3156 dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 ||
3157 error "dd to $DIR/$tfile failed"
3158 sleep 1 # Give it a chance to flush dirty data
3159 echo TESTTEST | dd of=$DIR/$tfile bs=1 count=8 seek=64
3160 cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
3161 cancel_lru_locks osc
3163 dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
3164 cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
3165 if [ $cksum != $cksum2 ] ; then
3166 error "New checksum $cksum2 does not match original $cksum"
3169 run_test 87b "write replay with changed data (checksum resend)"
3171 test_88() { #bug 17485
3172 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3173 mkdir -p $TMP/$tdir || error "mkdir $TMP/$tdir failed"
3175 $LFS setstripe -i 0 -c 1 $DIR/$tdir || error "$LFS setstripe failed"
3178 replay_barrier $SINGLEMDS
3180 # exhaust precreations on ost1
3181 local OST=$(ostname_from_index 0)
3182 local mdtosc=$(get_mdtosc_proc_path $SINGLEMDS $OST)
3183 local last_id=$(do_facet $SINGLEMDS lctl get_param -n osp.$mdtosc.prealloc_last_id)
3184 local next_id=$(do_facet $SINGLEMDS lctl get_param -n osp.$mdtosc.prealloc_next_id)
3185 echo "before test: last_id = $last_id, next_id = $next_id"
3187 echo "Creating to objid $last_id on ost $OST..."
3188 createmany -o $DIR/$tdir/f-%d $next_id $((last_id - next_id + 2)) ||
3189 error "createmany create files to last_id failed"
3191 #create some files to use some uncommitted objids
3192 last_id=$(($last_id + 1))
3193 createmany -o $DIR/$tdir/f-%d $last_id 8 ||
3194 error "createmany create files with uncommitted objids failed"
3196 last_id2=$(do_facet $SINGLEMDS lctl get_param -n osp.$mdtosc.prealloc_last_id)
3197 next_id2=$(do_facet $SINGLEMDS lctl get_param -n osp.$mdtosc.prealloc_next_id)
3198 echo "before recovery: last_id = $last_id2, next_id = $next_id2"
3200 # if a test uses shutdown_facet && reboot_facet instead of facet_failover()
3201 # it has to take care of the affected facets itself, bug 20407
3202 local affected_mds1=$(affected_facets mds1)
3203 local affected_ost1=$(affected_facets ost1)
3205 shutdown_facet $SINGLEMDS
3208 reboot_facet $SINGLEMDS
3209 change_active $affected_mds1
3210 wait_for_facet $affected_mds1
3211 mount_facets $affected_mds1 || error "Restart of mds failed"
3214 change_active $affected_ost1
3215 wait_for_facet $affected_ost1
3216 mount_facets $affected_ost1 || error "Restart of ost1 failed"
3220 last_id2=$(do_facet $SINGLEMDS lctl get_param -n osp.$mdtosc.prealloc_last_id)
3221 next_id2=$(do_facet $SINGLEMDS lctl get_param -n osp.$mdtosc.prealloc_next_id)
3222 echo "after recovery: last_id = $last_id2, next_id = $next_id2"
3224 # create new files, which should use new objids, and ensure the orphan
3225 # cleanup phase for ost1 is completed at the same time
3226 for i in $(seq 8); do
3227 file_id=$(($last_id + 10 + $i))
3228 dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128
3231 # if the objids were not recreated, then "ls" will fail with -ENOENT
3232 ls -l $DIR/$tdir/* || error "can't get the status of precreated files"
3235 # write into previously created files
3236 for i in $(seq 8); do
3237 file_id=$(($last_id + $i))
3238 dd if=/dev/urandom of=$DIR/$tdir/f-$file_id bs=4096 count=128
3239 cp -f $DIR/$tdir/f-$file_id $TMP/$tdir/
3242 # compare the content
3243 for i in $(seq 8); do
3244 file_id=$(($last_id + $i))
3245 cmp $TMP/$tdir/f-$file_id $DIR/$tdir/f-$file_id ||
3246 error "the content of file is modified!"
3251 run_test 88 "MDS should not assign same objid to different files "
3253 function calc_osc_kbytes_used() {
3254 local kbtotal=$(calc_osc_kbytes kbytestotal)
3255 local kbfree=$(calc_osc_kbytes kbytesfree)
3256 echo $((kbtotal-kbfree))
3260 cancel_lru_locks osc
3261 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3262 rm -f $DIR/$tdir/$tfile
3263 wait_mds_ost_sync || error "initial MDS-OST sync timed out"
3264 wait_delete_completed || error "initial wait delete timed out"
3265 local blocks1=$(calc_osc_kbytes_used)
3266 local write_size=$(fs_log_size)
3268 $LFS setstripe -i 0 -c 1 $DIR/$tdir/$tfile
3269 [ $write_size -lt 1024 ] && write_size=1024
3270 dd if=/dev/zero bs=${write_size}k count=10 of=$DIR/$tdir/$tfile
3273 facet_failover $SINGLEMDS
3274 rm $DIR/$tdir/$tfile
3277 zconf_mount $(hostname) $MOUNT || error "mount fails"
3278 client_up || error "client_up failed"
3280 # wait for the remounted client to connect to ost1
3281 local target=$(get_osc_import_name client ost1)
3282 wait_import_state "FULL" "osc.${target}.ost_server_uuid" \
3283 $(max_recovery_time)
3285 wait_mds_ost_sync || error "MDS-OST sync timed out"
3286 wait_delete_completed || error "wait delete timed out"
3287 local blocks2=$(calc_osc_kbytes_used)
3289 [ $((blocks2 - blocks1)) -le $(fs_log_size) ] ||
3290 error $((blocks2 - blocks1)) blocks leaked
3292 run_test 89 "no disk space leak on late ost connection"
3299 change_active $facet
3300 wait_for_facet $facet
3301 mount_facet $facet || error "Restart of $facet failed"
3305 test_90() { # bug 19494
3306 local dir=$DIR/$tdir
3307 local ostfail=$(get_random_entry $(get_facets OST))
3309 if [[ $FAILURE_MODE = HARD ]]; then
3310 local affected=$(affected_facets $ostfail);
3312 [[ "$affected" == $ostfail ]] ||
3313 skip "cannot use FAILURE_MODE=$FAILURE_MODE, affected: $affected"
3315 # ensure all OSTs are active to allow allocations
3318 mkdir $dir || error "mkdir $dir failed"
3320 echo "Create the files"
3322 # file "f${index}" striped over 1 OST
3323 # file "all" striped over all OSTs
3325 $LFS setstripe -c $OSTCOUNT $dir/all ||
3326 error "setstripe failed to create $dir/all"
3328 for ((i = 0; i < $OSTCOUNT; i++)); do
3331 $LFS setstripe -i $i -c 1 $f ||
3332 error "$LFS setstripe failed to create $f"
3334 # confirm setstripe actually created stripe on requested OST
3335 local uuid=$(ostuuid_from_index $i)
3337 for file in f$i all; do
3338 local found=$($LFS find --obd $uuid --name $file $dir)
3340 if [[ $dir/$file != $found ]]; then
3341 $LFS getstripe $dir/$file
3342 error "wrong stripe: $file, uuid: $uuid"
3347 # Before failing an OST, get its obd name and index
3348 local varsvc=${ostfail}_svc
3349 local obd=$(do_facet $ostfail lctl get_param \
3350 -n obdfilter.${!varsvc}.uuid)
3351 local index=$(($(facet_number $ostfail) - 1))
3353 echo "Fail $ostfail $obd, display the list of affected files"
3354 shutdown_facet $ostfail || error "shutdown_facet $ostfail failed"
3356 trap "cleanup_90 $ostfail" EXIT INT
3357 echo "General Query: lfs find $dir"
3358 local list=$($LFS find $dir)
3360 for (( i=0; i<$OSTCOUNT; i++ )); do
3361 list_member "$list" $dir/f$i ||
3362 error_noexit "lfs find $dir: no file f$i"
3364 list_member "$list" $dir/all ||
3365 error_noexit "lfs find $dir: no file all"
3367 # focus on the missing OST,
3368 # we expect to see only two files affected: "f$index" and "all"
3370 echo "Querying files on shutdown $ostfail: lfs find --obd $obd"
3371 list=$($LFS find --obd $obd $dir)
3373 for file in all f$index; do
3374 list_member "$list" $dir/$file ||
3375 error_noexit "lfs find does not report the affected $obd for $file"
3378 [[ $(echo $list | wc -w) -eq 2 ]] ||
3379 error_noexit "lfs find reports the wrong list of affected files ${#list[@]}"
3381 echo "Check getstripe: $LFS getstripe -r --obd $obd"
3382 list=$($LFS getstripe -r --obd $obd $dir)
3384 for file in all f$index; do
3385 echo "$list" | grep $dir/$file ||
3386 error_noexit "lfs getsripe does not report the affected $obd for $file"
3391 run_test 90 "lfs find identifies the missing striped file segments"
3394 [[ "$MDS1_VERSION" -ge $(version_code 2.6.90) ]] ||
3395 [[ "$MDS1_VERSION" -ge $(version_code 2.5.4) &&
3396 "$MDS1_VERSION" -lt $(version_code 2.5.50) ]] ||
3397 skip "Need MDS version 2.5.4+ or 2.6.90+"
3399 cancel_lru_locks osc
3401 $LFS setstripe -i 0 -c 1 $DIR/$tfile ||
3402 error "$LFS setstripe $DIR/$tfile failed"
3403 dd if=/dev/zero of=$DIR/$tfile bs=1024 count=1 ||
3404 error "dd to $DIR/$tfile failed"
3405 #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715
3406 # We need to emulate the state where the OST is waiting for other clients
3407 # that have not completed recovery. The final ping is queued, but its
3408 # reply will only be sent on recovery completion. This is done by
3409 # sleeping before processing final pings
3410 do_facet ost1 "$LCTL set_param fail_val=40"
3411 do_facet ost1 "$LCTL set_param fail_loc=0x715"
3414 run_test 93a "replay + reconnect"
3417 [[ "$MDS1_VERSION" -ge $(version_code 2.7.90) ]] ||
3418 skip "Need MDS version 2.7.90+"
3420 cancel_lru_locks mdc
3422 createmany -o $DIR/$tfile 20 ||
3423 error "createmany -o $DIR/$tfile failed"
3425 #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x715
3426 # We need to emulate a state in which the MDT is waiting for other
3427 # clients that have not completed recovery. The final ping is queued,
3428 # but its reply is only sent on recovery completion. This is done by
3429 # sleeping before processing final pings.
3430 do_facet mds1 "$LCTL set_param fail_val=80"
3431 do_facet mds1 "$LCTL set_param fail_loc=0x715"
3434 run_test 93b "replay + reconnect on mds"
# Verify the striped directory used by the test_100* cases.
#
# Reads:   DIR, tdir, LFS
# Checks that $DIR/$tdir/striped_dir reports a stripe count of exactly 2
# (dumping its full layout to the log first), then creates 20 files
# under it. Calls error() on any failure.
striped_dir_check_100() {
	local striped_dir=$DIR/$tdir/striped_dir
	local stripe_count

	# Declare and assign separately so a failing getdirstripe is not
	# hidden behind the exit status of "local".
	stripe_count=$($LFS getdirstripe -c "$striped_dir") ||
		error "$LFS getdirstripe -c $striped_dir failed"

	$LFS getdirstripe "$striped_dir"

	# Guard against empty output: an unquoted empty operand would make
	# the test itself fail with a syntax error instead of a clear message.
	[[ -n "$stripe_count" && "$stripe_count" -eq 2 ]] ||
		error "$stripe_count != 2"

	createmany -o "$striped_dir/f-%d" 20 ||
		error "creation failed under striped dir"
}
3448 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3449 ([ $FAILURE_MODE == "HARD" ] &&
3450 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3451 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3454 local striped_dir=$DIR/$tdir/striped_dir
3457 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3459 # Make sure MDT1 and MDT0 are connected,
3460 # otherwise a single-stripe dir may be created here
3461 $LFS setdirstripe -i1 $DIR/$tdir/remote_dir
3463 #define OBD_FAIL_OUT_UPDATE_NET_REP 0x1701
3464 do_facet mds$((MDTIDX+1)) lctl set_param fail_loc=0x1701
3465 $LFS setdirstripe -i0 -c2 $striped_dir &
3468 fail mds$((MDTIDX + 1))
3470 wait $CLIENT_PID || error "striped dir creation failed"
3472 striped_dir_check_100 || error "striped dir check failed"
3473 rm -rf $DIR/$tdir || error "rmdir failed"
3475 run_test 100a "DNE: create striped dir, drop update rep from MDT1, fail MDT1"
3478 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3479 ([ $FAILURE_MODE == "HARD" ] &&
3480 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3481 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3484 local striped_dir=$DIR/$tdir/striped_dir
3487 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3489 # Make sure MDT1 and MDT0 are connected,
3490 # otherwise a single-stripe dir may be created here
3491 $LFS setdirstripe -i1 $DIR/$tdir/remote_dir
3493 # OBD_FAIL_MDS_REINT_NET_REP 0x119
3494 do_facet mds$MDTIDX lctl set_param fail_loc=0x119
3495 $LFS mkdir -i0 -c2 $striped_dir &
3500 wait $CLIENT_PID || error "striped dir creation failed"
3502 striped_dir_check_100 || error "striped dir check failed"
3503 rm -rf $DIR/$tdir || error "rmdir failed"
3505 run_test 100b "DNE: create striped dir, fail MDT0"
3508 (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
3509 [[ "$FAILURE_MODE" != "HARD" ||
3510 "$(facet_host mds1)" != "$(facet_host mds2)" ]] ||
3511 skip "MDTs needs to be on diff hosts for HARD fail mode"
3513 local striped_dir=$DIR/$tdir/striped_dir
3515 mkdir_on_mdt0 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3517 # Make sure MDT1 and MDT0 are connected,
3518 # otherwise a single-stripe dir may be created here
3519 $LFS setdirstripe -i1 $DIR/$tdir/remote_dir
3522 $LFS mkdir -i1 -c2 $striped_dir
3524 stack_trap fail_abort_cleanup RETURN
3525 fail_abort mds2 abort_recov_mdt
3527 createmany -o $striped_dir/f-%d 20 &&
3528 error "createmany -o $DIR/$tfile should fail"
3532 # LU-16159: abort_recovery cancels update logs, so the second recovery
3533 # won't replay $striped_dir creation
3534 (( $MDS1_VERSION >= $(version_code 2.15.52) )) ||
3535 striped_dir_check_100 || error "striped dir check failed"
3537 run_test 100c "DNE: create striped dir, abort_recov_mdt mds2"
3540 (( $MDSCOUNT > 1 )) || skip "needs > 1 MDTs"
3541 (( $MDS1_VERSION >= $(version_code 2.15.52.144) )) ||
3542 skip "Need MDS version 2.15.52.144+"
3544 test_mkdir -c $MDSCOUNT $DIR/$tdir || error "mkdir $tdir failed"
3545 $LFS setdirstripe -D -i -1 -c $MDSCOUNT $DIR/$tdir ||
3546 error "set $tdir default LMV failed"
3547 createmany -d $DIR/$tdir/s 100 || error "create subdir failed"
3549 local index=$((RANDOM % MDSCOUNT))
3550 local devname=$(mdtname_from_index $index)
3551 local mdt=mds$((index + 1))
3556 # cancel update llog upon recovery abort
3557 do_facet $mdt $LCTL --device $devname llog_print update_log
3558 log=$(do_facet $mdt "$LCTL --device $devname llog_print update_log |
3559 awk '/index/ { print \\\$4; exit }'")
3561 count=$(do_facet $mdt "$LCTL --device $devname llog_print update_log |
3563 (( count > 0 )) || error "no update logs found"
3564 stack_trap fail_abort_cleanup RETURN
3565 fail_abort $mdt || error "fail_abort $mdt failed"
3566 wait_update_facet $mdt "$LCTL --device $devname llog_print update_log |
3567 grep -c index" 0 60 || error "update logs not canceled"
3569 run_test 100d "DNE: cancel update logs upon recovery abort"
3571 test_101() { #LU-5648
3572 mkdir -p $DIR/$tdir/d1
3573 mkdir -p $DIR/$tdir/d2
3574 touch $DIR/$tdir/file0
3577 replay_barrier $SINGLEMDS
3578 for i in $(seq $num) ; do
3579 echo test$i > $DIR/$tdir/d1/file$i
3582 fail_abort $SINGLEMDS
3583 for i in $(seq $num) ; do
3584 touch $DIR/$tdir/d2/file$i
3585 test -s $DIR/$tdir/d2/file$i &&
3586 ls -al $DIR/$tdir/d2/file$i && error "file$i's size > 0"
3591 run_test 101 "Shouldn't reassign precreated objs to other files after recovery"
3600 [[ $(lctl get_param mdc.*.import |
3601 grep "connect_flags:.*multi_mod_rpc") ]] ||
3602 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3604 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3605 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3606 facet="mds$((0x$idx + 1))"
3608 # get current value of max_mod_rpcs_in_flight
3609 num=$($LCTL get_param -n \
3610 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3611 # set default value if client does not support multi mod RPCs
3612 [ -z "$num" ] && num=1
3614 echo "creating $num files ..."
3616 for i in $(seq $num); do
3617 touch $DIR/$tdir/file-$i
3620 # drop request on MDT to force resend
3621 #define OBD_FAIL_MDS_REINT_MULTI_NET 0x159
3622 do_facet $facet "$LCTL set_param fail_loc=0x159"
3623 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3624 for i in $(seq $num); do
3625 chmod 0600 $DIR/$tdir/file-$i &
3629 do_facet $facet "$LCTL set_param fail_loc=0"
3630 for pid in $pids; do
3631 wait $pid || error "chmod failed"
3633 echo "done ($(date +%H:%M:%S))"
3635 # check chmod succeed
3636 for i in $(seq $num); do
3637 checkstat -vp 0600 $DIR/$tdir/file-$i
3642 run_test 102a "check resend (request lost) with multiple modify RPCs in flight"
3651 [[ $(lctl get_param mdc.*.import |
3652 grep "connect_flags:.*multi_mod_rpc") ]] ||
3653 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3655 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3656 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3657 facet="mds$((0x$idx + 1))"
3659 # get current value of max_mod_rpcs_in_flight
3660 num=$($LCTL get_param -n \
3661 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3662 # set default value if client does not support multi mod RPCs
3663 [ -z "$num" ] && num=1
3665 echo "creating $num files ..."
3667 for i in $(seq $num); do
3668 touch $DIR/$tdir/file-$i
3671 # drop reply on MDT to force reconstruction
3672 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
3673 do_facet $facet "$LCTL set_param fail_loc=0x15a"
3674 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3675 for i in $(seq $num); do
3676 chmod 0600 $DIR/$tdir/file-$i &
3680 do_facet $facet "$LCTL set_param fail_loc=0"
3681 for pid in $pids; do
3682 wait $pid || error "chmod failed"
3684 echo "done ($(date +%H:%M:%S))"
3686 # check chmod succeed
3687 for i in $(seq $num); do
3688 checkstat -vp 0600 $DIR/$tdir/file-$i
3693 run_test 102b "check resend (reply lost) with multiple modify RPCs in flight"
3702 [[ $(lctl get_param mdc.*.import |
3703 grep "connect_flags:.*multi_mod_rpc") ]] ||
3704 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3706 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3707 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3708 facet="mds$((0x$idx + 1))"
3710 # get current value of max_mod_rpcs_in_flight
3711 num=$($LCTL get_param -n \
3712 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3713 # set default value if client does not support multi mod RPCs
3714 [ -z "$num" ] && num=1
3716 echo "creating $num files ..."
3718 for i in $(seq $num); do
3719 touch $DIR/$tdir/file-$i
3722 replay_barrier $facet
3725 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
3726 do_facet $facet "$LCTL set_param fail_loc=0x15a"
3727 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3728 for i in $(seq $num); do
3729 chmod 0600 $DIR/$tdir/file-$i &
3733 do_facet $facet "$LCTL set_param fail_loc=0"
3738 for pid in $pids; do
3739 wait $pid || error "chmod failed"
3741 echo "done ($(date +%H:%M:%S))"
3743 # check chmod succeed
3744 for i in $(seq $num); do
3745 checkstat -vp 0600 $DIR/$tdir/file-$i
3750 run_test 102c "check replay w/o reconstruction with multiple mod RPCs in flight"
3759 [[ $(lctl get_param mdc.*.import |
3760 grep "connect_flags:.*multi_mod_rpc") ]] ||
3761 { skip "Need MDC with 'multi_mod_rpcs' feature"; return 0; }
3763 $LFS mkdir -c1 $DIR/$tdir || error "mkdir $DIR/$tdir failed"
3764 idx=$(printf "%04x" $($LFS getdirstripe -i $DIR/$tdir))
3765 facet="mds$((0x$idx + 1))"
3767 # get current value of max_mod_rpcs_in_flight
3768 num=$($LCTL get_param -n \
3769 mdc.$FSNAME-MDT$idx-mdc-*.max_mod_rpcs_in_flight)
3770 # set default value if client does not support multi mod RPCs
3771 [ -z "$num" ] && num=1
3773 echo "creating $num files ..."
3775 for i in $(seq $num); do
3776 touch $DIR/$tdir/file-$i
3780 #define OBD_FAIL_MDS_REINT_MULTI_NET_REP 0x15a
3781 do_facet $facet "$LCTL set_param fail_loc=0x15a"
3782 echo "launch $num chmod in parallel ($(date +%H:%M:%S)) ..."
3783 for i in $(seq $num); do
3784 chmod 0600 $DIR/$tdir/file-$i &
3789 # write MDT transactions to disk
3790 do_facet $facet "sync; sync; sync"
3792 do_facet $facet "$LCTL set_param fail_loc=0"
3797 for pid in $pids; do
3798 wait $pid || error "chmod failed"
3800 echo "done ($(date +%H:%M:%S))"
3802 # check chmod succeed
3803 for i in $(seq $num); do
3804 checkstat -vp 0600 $DIR/$tdir/file-$i
3809 run_test 102d "check replay & reconstruction with multiple mod RPCs in flight"
3812 remote_mds_nodsh && skip "remote MDS with nodsh"
3813 [[ "$MDS1_VERSION" -gt $(version_code 2.8.54) ]] ||
3814 skip "Need MDS version 2.8.54+"
3816 #define OBD_FAIL_MDS_TRACK_OVERFLOW 0x162
3817 do_facet mds1 $LCTL set_param fail_loc=0x80000162
3820 createmany -o $DIR/$tdir/t- 30 ||
3821 error "create files on remote directory failed"
3823 rm -rf $DIR/$tdir/t-*
3825 #MDS should crash with tr->otr_next_id overflow
3828 run_test 103 "Check otr_next_id overflow"
# Verify the striped directory used by the test_110* cases.
#
# Reads:   DIR, tdir, CHECKSTAT, LFS, MDSCOUNT
# Checks that $DIR/$tdir/striped_dir exists as a directory and that its
# stripe count equals $MDSCOUNT. Calls error() on any failure.
check_striped_dir_110()
{
	local striped_dir=$DIR/$tdir/striped_dir
	local stripe_count

	$CHECKSTAT -t dir "$striped_dir" ||
		error "create striped dir failed"

	stripe_count=$($LFS getdirstripe -c "$striped_dir")

	# Report the value actually compared against ($MDSCOUNT), not a
	# hard-coded 2; empty output from getdirstripe is also a failure.
	[[ -n "$stripe_count" && "$stripe_count" -eq "$MDSCOUNT" ]] ||
		error "$stripe_count != $MDSCOUNT after recovery"
}
3841 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3842 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3843 skip "Need MDS version at least 2.7.56"
3845 ([ $FAILURE_MODE == "HARD" ] &&
3846 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3847 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3852 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3855 check_striped_dir_110 || error "check striped_dir failed"
3856 rm -rf $DIR/$tdir || error "rmdir failed"
3860 run_test 110a "DNE: create striped dir, fail MDT1"
3863 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3864 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3865 skip "Need MDS version at least 2.7.56"
3867 ([ $FAILURE_MODE == "HARD" ] &&
3868 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3869 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3874 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3877 zconf_mount $(hostname) $MOUNT
3878 client_up || return 1
3880 check_striped_dir_110 || error "check striped_dir failed"
3882 rm -rf $DIR/$tdir || error "rmdir failed"
3886 run_test 110b "DNE: create striped dir, fail MDT1 and client"
3889 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3890 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3891 skip "Need MDS version at least 2.7.56"
3893 ([ $FAILURE_MODE == "HARD" ] &&
3894 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3895 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3900 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3903 check_striped_dir_110 || error "check striped_dir failed"
3905 rm -rf $DIR/$tdir || error "rmdir failed"
3909 run_test 110c "DNE: create striped dir, fail MDT2"
3912 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3913 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3914 skip "Need MDS version at least 2.7.56"
3916 ([ $FAILURE_MODE == "HARD" ] &&
3917 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3918 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3923 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3926 zconf_mount $(hostname) $MOUNT
3927 client_up || return 1
3929 check_striped_dir_110 || error "check striped_dir failed"
3931 rm -rf $DIR/$tdir || error "rmdir failed"
3935 run_test 110d "DNE: create striped dir, fail MDT2 and client"
3938 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3939 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3940 skip "Need MDS version at least 2.7.56"
3942 ([ $FAILURE_MODE == "HARD" ] &&
3943 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3944 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3949 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3953 zconf_mount $(hostname) $MOUNT
3954 client_up || return 1
3956 check_striped_dir_110 || error "check striped_dir failed"
3958 rm -rf $DIR/$tdir || error "rmdir failed"
3962 run_test 110e "DNE: create striped dir, uncommit on MDT2, fail client/MDT1/MDT2"
3965 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3966 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3967 skip "Need MDS version at least 2.7.56"
3969 ([ $FAILURE_MODE == "HARD" ] &&
3970 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3971 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
3977 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
3980 check_striped_dir_110 || error "check striped_dir failed"
3982 rm -rf $DIR/$tdir || error "rmdir failed"
3986 run_test 110f "DNE: create striped dir, fail MDT1/MDT2"
3989 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
3990 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
3991 skip "Need MDS version at least 2.7.56"
3993 ([ $FAILURE_MODE == "HARD" ] &&
3994 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
3995 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4000 $LFS mkdir -i1 -c$MDSCOUNT $DIR/$tdir/striped_dir
4004 zconf_mount $(hostname) $MOUNT
4005 client_up || return 1
4007 check_striped_dir_110 || error "check striped_dir failed"
4009 rm -rf $DIR/$tdir || error "rmdir failed"
4013 run_test 110g "DNE: create striped dir, uncommit on MDT1, fail client/MDT1/MDT2"
4016 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4017 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4018 skip "Need MDS version at least 2.7.56"
4020 ([ $FAILURE_MODE == "HARD" ] &&
4021 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4022 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4026 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4028 rm -rf $DIR/$tdir/striped_dir
4031 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4032 error "striped dir still exists"
4035 run_test 111a "DNE: unlink striped dir, fail MDT1"
4038 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4039 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4040 skip "Need MDS version at least 2.7.56"
4042 ([ $FAILURE_MODE == "HARD" ] &&
4043 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4044 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4048 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4050 rm -rf $DIR/$tdir/striped_dir
4053 zconf_mount $(hostname) $MOUNT
4054 client_up || return 1
4056 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4057 error "striped dir still exists"
4060 run_test 111b "DNE: unlink striped dir, fail MDT2"
4063 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4064 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4065 skip "Need MDS version at least 2.7.56"
4067 ([ $FAILURE_MODE == "HARD" ] &&
4068 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4069 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4073 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4075 rm -rf $DIR/$tdir/striped_dir
4079 zconf_mount $(hostname) $MOUNT
4080 client_up || return 1
4081 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4082 error "striped dir still exists"
4085 run_test 111c "DNE: unlink striped dir, uncommit on MDT1, fail client/MDT1/MDT2"
4088 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4089 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4090 skip "Need MDS version at least 2.7.56"
4092 ([ $FAILURE_MODE == "HARD" ] &&
4093 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4094 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4098 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4100 rm -rf $DIR/$tdir/striped_dir
4104 zconf_mount $(hostname) $MOUNT
4105 client_up || return 1
4106 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4107 error "striped dir still exists"
4111 run_test 111d "DNE: unlink striped dir, uncommit on MDT2, fail client/MDT1/MDT2"
4114 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4115 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4116 skip "Need MDS version at least 2.7.56"
4118 ([ $FAILURE_MODE == "HARD" ] &&
4119 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4120 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4124 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4126 rm -rf $DIR/$tdir/striped_dir
4129 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4130 error "striped dir still exists"
4133 run_test 111e "DNE: unlink striped dir, uncommit on MDT2, fail MDT1/MDT2"
4136 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4137 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4138 skip "Need MDS version at least 2.7.56"
4140 ([ $FAILURE_MODE == "HARD" ] &&
4141 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4142 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4146 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4148 rm -rf $DIR/$tdir/striped_dir
4151 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4152 error "striped dir still exists"
4155 run_test 111f "DNE: unlink striped dir, uncommit on MDT1, fail MDT1/MDT2"
4158 (( $MDSCOUNT >= 2 )) || skip "needs >= 2 MDTs"
4159 (( $MDS1_VERSION >= $(version_code 2.7.56) )) ||
4160 skip "Need MDS version at least 2.7.56"
4162 ([ $FAILURE_MODE != "HARD" ] ||
4163 [ "$(facet_host mds1)" != "$(facet_host mds2)" ]) ||
4164 skip "MDTs needs to be on diff hosts for HARD fail mode"
4167 $LFS mkdir -i1 -c2 $DIR/$tdir/striped_dir
4171 rm -rf $DIR/$tdir/striped_dir
4173 $CHECKSTAT -t dir $DIR/$tdir/striped_dir &&
4174 error "striped dir still exists"
4177 run_test 111g "DNE: unlink striped dir, fail MDT1/MDT2"
# Build the directory tree used by the test_112* cross-MDT rename cases.
#
# Reads:   DIR, tdir, LFS
# Creates under $DIR/$tdir:
#   src_dir/src_child    (lfs mkdir -i 1), containing regular file "a"
#   tgt_dir              (lfs mkdir -i 2)
#   tgt_dir/tgt_child    (lfs mkdir -i 3)
# Calls error() as soon as any step fails, so a broken setup is not
# misreported later as a rename/recovery failure.
test_112_rename_prepare() {
	local src_dir=$DIR/$tdir/src_dir
	local tgt_dir=$DIR/$tdir/tgt_dir

	mkdir_on_mdt0 "$DIR/$tdir" || error "mkdir $DIR/$tdir failed"
	mkdir -p "$src_dir" || error "mkdir $src_dir failed"
	$LFS mkdir -i 1 "$src_dir/src_child" ||
		error "create remote source failed"

	touch "$src_dir/src_child/a" ||
		error "touch $src_dir/src_child/a failed"

	$LFS mkdir -i 2 "$tgt_dir" ||
		error "create remote target dir failed"

	$LFS mkdir -i 3 "$tgt_dir/tgt_child" ||
		error "create remote target child failed"
}
4196 $CHECKSTAT -t dir $DIR/$tdir/src_dir/src_child &&
4197 error "src_child still exists after rename"
4199 $CHECKSTAT -t file $DIR/$tdir/tgt_dir/tgt_child/a ||
4200 error "missing file(a) after rename"
4204 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4205 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4206 skip "Need MDS version at least 2.7.56"
4208 ([ $FAILURE_MODE == "HARD" ] &&
4209 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4210 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4213 test_112_rename_prepare
4216 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4217 error "rename dir cross MDT failed!"
4221 rm -rf $DIR/$tdir || error "rmdir failed"
4223 run_test 112a "DNE: cross MDT rename, fail MDT1"
4226 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4227 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4228 skip "Need MDS version at least 2.7.56"
4230 ([ $FAILURE_MODE == "HARD" ] &&
4231 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4232 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4235 test_112_rename_prepare
4238 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4239 error "rename dir cross MDT failed!"
4244 rm -rf $DIR/$tdir || error "rmdir failed"
4246 run_test 112b "DNE: cross MDT rename, fail MDT2"
4249 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4250 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4251 skip "Need MDS version at least 2.7.56"
4253 ([ $FAILURE_MODE == "HARD" ] &&
4254 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4255 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4258 test_112_rename_prepare
4261 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4262 error "rename dir cross MDT failed!"
4267 rm -rf $DIR/$tdir || error "rmdir failed"
4269 run_test 112c "DNE: cross MDT rename, fail MDT3"
4272 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4273 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4274 skip "Need MDS version at least 2.7.56"
4276 ([ $FAILURE_MODE == "HARD" ] &&
4277 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4278 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4281 test_112_rename_prepare
4284 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4285 error "rename dir cross MDT failed!"
4290 rm -rf $DIR/$tdir || error "rmdir failed"
4292 run_test 112d "DNE: cross MDT rename, fail MDT4"
4295 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4296 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4297 skip "Need MDS version at least 2.7.56"
4299 ([ $FAILURE_MODE == "HARD" ] &&
4300 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4301 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4304 test_112_rename_prepare
4308 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4309 error "rename dir cross MDT failed!"
4314 rm -rf $DIR/$tdir || error "rmdir failed"
4316 run_test 112e "DNE: cross MDT rename, fail MDT1 and MDT2"
4319 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4320 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4321 skip "Need MDS version at least 2.7.56"
4323 ([ $FAILURE_MODE == "HARD" ] &&
4324 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4325 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4328 test_112_rename_prepare
4332 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4333 error "rename dir cross MDT failed!"
4338 rm -rf $DIR/$tdir || error "rmdir failed"
4340 run_test 112f "DNE: cross MDT rename, fail MDT1 and MDT3"
4343 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4344 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4345 skip "Need MDS version at least 2.7.56"
4347 ([ $FAILURE_MODE == "HARD" ] &&
4348 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4349 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4352 test_112_rename_prepare
4356 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4357 error "rename dir cross MDT failed!"
4362 rm -rf $DIR/$tdir || error "rmdir failed"
4364 run_test 112g "DNE: cross MDT rename, fail MDT1 and MDT4"
4367 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4368 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4369 skip "Need MDS version at least 2.7.56"
4371 ([ $FAILURE_MODE == "HARD" ] &&
4372 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4373 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4376 test_112_rename_prepare
4380 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4381 error "rename dir cross MDT failed!"
4386 rm -rf $DIR/$tdir || error "rmdir failed"
4388 run_test 112h "DNE: cross MDT rename, fail MDT2 and MDT3"
4391 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4392 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4393 skip "Need MDS version at least 2.7.56"
4395 ([ $FAILURE_MODE == "HARD" ] &&
4396 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4397 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4400 test_112_rename_prepare
4404 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4405 error "rename dir cross MDT failed!"
4410 rm -rf $DIR/$tdir || error "rmdir failed"
4412 run_test 112i "DNE: cross MDT rename, fail MDT2 and MDT4"
4415 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4416 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4417 skip "Need MDS version at least 2.7.56"
4419 ([ $FAILURE_MODE == "HARD" ] &&
4420 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4421 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4424 test_112_rename_prepare
4428 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4429 error "rename dir cross MDT failed!"
4434 rm -rf $DIR/$tdir || error "rmdir failed"
4436 run_test 112j "DNE: cross MDT rename, fail MDT3 and MDT4"
4439 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4440 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4441 skip "Need MDS version at least 2.7.56"
4443 ([ $FAILURE_MODE == "HARD" ] &&
4444 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4445 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4448 test_112_rename_prepare
4453 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4454 error "rename dir cross MDT failed!"
4459 rm -rf $DIR/$tdir || error "rmdir failed"
4461 run_test 112k "DNE: cross MDT rename, fail MDT1,MDT2,MDT3"
4464 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4465 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4466 skip "Need MDS version at least 2.7.56"
4468 ([ $FAILURE_MODE == "HARD" ] &&
4469 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4470 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4473 test_112_rename_prepare
4478 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4479 error "rename dir cross MDT failed!"
4484 rm -rf $DIR/$tdir || error "rmdir failed"
4486 run_test 112l "DNE: cross MDT rename, fail MDT1,MDT2,MDT4"
4489 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4490 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4491 skip "Need MDS version at least 2.7.56"
4493 ([ $FAILURE_MODE == "HARD" ] &&
4494 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4495 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4498 test_112_rename_prepare
4503 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4504 error "rename dir cross MDT failed!"
4509 rm -rf $DIR/$tdir || error "rmdir failed"
4511 run_test 112m "DNE: cross MDT rename, fail MDT1,MDT3,MDT4"
4514 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4515 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4516 skip "Need MDS version at least 2.7.56"
4518 ([ $FAILURE_MODE == "HARD" ] &&
4519 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4520 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4523 test_112_rename_prepare
4528 mrename $DIR/$tdir/src_dir/src_child $DIR/$tdir/tgt_dir/tgt_child ||
4529 error "rename dir cross MDT failed!"
4534 rm -rf $DIR/$tdir || error "rmdir failed"
4536 run_test 112n "DNE: cross MDT rename, fail MDT2,MDT3,MDT4"
4539 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4540 [[ "$MDS1_VERSION" -ge $(version_code 2.7.56) ]] ||
4541 skip "Need MDS version at least 2.7.56"
4543 ([ $FAILURE_MODE == "HARD" ] &&
4544 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4545 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4553 for ((j=0;j<$((MDSCOUNT));j++)); do
4554 fail_index=$((fail_index+1))
4555 index=$((fail_index % MDSCOUNT))
4556 replay_barrier mds$((index + 1))
4557 for ((i=0;i<5;i++)); do
4558 test_mkdir -i$index -c$MDSCOUNT $DIR/$tdir/test_$i ||
4559 error "create striped dir $DIR/$tdir/test_$i"
4562 fail mds$((index + 1))
4563 for ((i=0;i<5;i++)); do
4564 checkstat -t dir $DIR/$tdir/test_$i ||
4565 error "$DIR/$tdir/test_$i does not exist!"
4567 rm -rf $DIR/$tdir/test_* ||
4571 run_test 115 "failover for create/unlink striped directory"
4574 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4575 [ "$MDS1_VERSION" -lt $(version_code 2.7.55) ] &&
4576 skip "Do not support large update log before 2.7.55" &&
4578 ([ $FAILURE_MODE == "HARD" ] &&
4579 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4580 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4584 mkdir_on_mdt0 $DIR/$tdir
4587 # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
4588 do_facet mds1 "lctl set_param fail_loc=0x80001702"
4589 $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/striped_dir
4592 $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
4593 error "stried_dir does not exists"
4595 run_test 116a "large update log master MDT recovery"
4598 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4599 [ "$MDS1_VERSION" -lt $(version_code 2.7.55) ] &&
4600 skip "Do not support large update log before 2.7.55" &&
4603 ([ $FAILURE_MODE == "HARD" ] &&
4604 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4605 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4609 mkdir_on_mdt0 $DIR/$tdir
4612 # OBD_FAIL_SPLIT_UPDATE_REC 0x1702
4613 do_facet mds2 "lctl set_param fail_loc=0x80001702"
4614 $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/striped_dir
4617 $CHECKSTAT -t dir $DIR/$tdir/striped_dir ||
4618 error "stried_dir does not exists"
4620 run_test 116b "large update log slave MDT recovery"
4623 [ $MDSCOUNT -lt 4 ] && skip "needs >= 4 MDTs" && return 0
4624 ([ $FAILURE_MODE == "HARD" ] &&
4625 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4626 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4632 $LFS setdirstripe -i0 -c$MDSCOUNT $DIR/$tdir/remote_dir
4633 $LFS setdirstripe -i1 -c$MDSCOUNT $DIR/$tdir/remote_dir_1
4636 # Set rdonly on all MDTs, so the client will send replay
4637 # requests to all MDTs and replay them at the same time.
4638 # This test verifies that recovery does not deadlock in
4639 # this case, LU-7531.
4640 for ((index = 0; index < $((MDSCOUNT)); index++)); do
4641 replay_barrier mds$((index + 1))
4642 if [ -z $mds_indexs ]; then
4643 mds_indexs="${mds_indexs}mds$((index+1))"
4645 mds_indexs="${mds_indexs},mds$((index+1))"
4649 rm -rf $DIR/$tdir/remote_dir
4650 rm -rf $DIR/$tdir/remote_dir_1
4654 rm -rf $DIR/$tdir || error "rmdir failed"
4656 run_test 117 "DNE: cross MDT unlink, fail MDT1 and MDT2"
# test_118 body (fragment): "invalidate osp update will not cause update log
# corruption".
# NOTE(review): embedded line numbers skip values (e.g. 4662-4665, 4677-4678,
# 4681-4684); the function header, the end of the version skip chain and any
# trailing steps are not visible in this listing.
# Guards: at least 2 MDTs and MDS1 version >= 2.7.64.
4659 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4660 [ "$MDS1_VERSION" -lt $(version_code 2.7.64) ] &&
4661 skip "Do not support large update log before 2.7.64" &&
4666 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir ||
4667 error "setdirstripe fails"
4668 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1 ||
4669 error "setdirstripe fails 1"
4670 rm -rf $DIR/$tdir/striped_dir* || error "rmdir fails"
# With the fail_loc set on mds1 the same two 2-stripe directories are created
# again; the exit status of these setdirstripe calls is not checked.
4672 # OBD_FAIL_INVALIDATE_UPDATE 0x1705
4673 do_facet mds1 "lctl set_param fail_loc=0x1705"
4674 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
4675 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
4676 do_facet mds1 "lctl set_param fail_loc=0x0"
# The same creations are issued once more with fail_loc cleared, again without
# checking their exit status.
4679 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir
4680 $LFS setdirstripe -c2 $DIR/$tdir/striped_dir1
4685 run_test 118 "invalidate osp update will not cause update log corruption"
# test_119 body (fragment): "timeout of normal replay does not cause DNE
# replay fails".
# NOTE(review): embedded line numbers skip values (e.g. 4691-4692, 4707-4712,
# 4733-4736); the function header, loop/brace terminators and the step that
# stops mds1 before mount_facet are not visible in this listing.
# Guards: at least 2 MDTs and MDS1 version >= 2.7.64.
4688 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4689 [ "$MDS1_VERSION" -lt $(version_code 2.7.64) ] &&
4690 skip "Do not support large update log before 2.7.64" &&
# Save the current recovery_time_hard of MDT0000 so it can be restored below.
4693 local hard_timeout=$(do_facet mds1 \
4694 "lctl get_param -n mdt.$FSNAME-MDT0000.recovery_time_hard")
# Client list defaults to this host when CLIENTS is unset or empty.
4696 local clients=${CLIENTS:-$HOSTNAME}
4697 local time_min=$(recovery_time_min)
4699 mkdir_on_mdt0 $DIR/$tdir
4700 mkdir $DIR/$tdir/tmp
4701 rmdir $DIR/$tdir/tmp
4704 mkdir $DIR/$tdir/dir_1
# Create 20 two-stripe directories starting on MDT index 0; their stripe
# count is verified at the end of the test.
4705 for ((i = 0; i < 20; i++)); do
4706 $LFS setdirstripe -i0 -c2 $DIR/$tdir/stripe_dir-$i
4713 #define OBD_FAIL_TGT_REPLAY_DELAY 0x714
4714 do_facet mds1 $LCTL set_param fail_loc=0x80000714
4715 #sleep (timeout + 5), so mds will evict the client exports,
4716 #but DNE update recovery will keep going.
4717 do_facet mds1 $LCTL set_param fail_val=$((time_min + 5))
# mds1 is mounted with the hard recovery timeout lowered to time_min, i.e.
# 5 seconds less than the fail_val set above.
4719 mount_facet mds1 "-o recovery_time_hard=$time_min"
4721 wait_clients_import_state "$clients" mds1 FULL
# clients_up is attempted twice before the test is failed.
4723 clients_up || clients_up || error "failover df: $?"
4725 #revert back the hard timeout
4726 do_facet mds1 $LCTL set_param \
4727 mdt.$FSNAME-MDT0000.recovery_time_hard=$hard_timeout
# Every directory created before the restart must still report 2 stripes.
4729 for ((i = 0; i < 20; i++)); do
4730 stripe_count=$($LFS getdirstripe -c $DIR/$tdir/stripe_dir-$i)
4731 [ $stripe_count == 2 ] || {
4732 error "stripe_dir-$i creation replay fails"
4737 run_test 119 "timeout of normal replay does not cause DNE replay fails "
# test_120 body (fragment): "DNE fail abort should stop both normal and DNE
# replay".
# NOTE(review): embedded line numbers skip values (e.g. 4743-4744, 4750-4751,
# 4754-4757, 4759-4760); the function header, brace/loop terminators and the
# step that fails mds1 are not visible in this listing.
# Guards: at least 2 MDTs and MDS1 version >= 2.7.64.
4740 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4741 [ "$MDS1_VERSION" -lt $(version_code 2.7.64) ] &&
4742 skip "Do not support large update log before 2.7.64" &&
4745 mkdir_on_mdt0 $DIR/$tdir
# Everything created after this barrier (20 plain and 20 two-stripe
# directories) is expected to be gone by the end of the test.
4746 replay_barrier_nosync mds1
4747 for ((i = 0; i < 20; i++)); do
4748 mkdir $DIR/$tdir/dir-$i || {
4749 error "create dir-$i fails"
4752 $LFS setdirstripe -i0 -c2 $DIR/$tdir/stripe_dir-$i || {
4753 error "create stripe_dir-$i fails"
# Register fail_abort_cleanup with stack_trap for the RETURN event.
4758 stack_trap fail_abort_cleanup RETURN
# Neither kind of directory may exist any more.
4761 for ((i = 0; i < 20; i++)); do
4762 [ ! -e "$DIR/$tdir/dir-$i" ] || {
4763 error "dir-$i still exists"
4766 [ ! -e "$DIR/$tdir/stripe_dir-$i" ] || {
4767 error "stripe_dir-$i still exists"
4772 run_test 120 "DNE fail abort should stop both normal and DNE replay"
# test_121 body (fragment): "lock replay timed out and race".
# NOTE(review): embedded line numbers skip values (e.g. 4777-4778, 4785-4786,
# 4788-4792, 4795-4797); the function header, the assignments of $mpid and
# $clients, and the MDS restart steps are not visible in this listing.
# Guard: MDS1 version >= 2.10.90.
4775 [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
4776 skip "Don't support it before 2.11" &&
# Remember the current at_max for the mds facet; it is restored at the end.
4779 local at_max_saved=$(at_max_get mds)
4781 touch $DIR/$tfile || error "touch $DIR/$tfile failed"
4782 cancel_lru_locks mdc
4784 multiop_bg_pause $DIR/$tfile s_s || error "multiop $DIR/$tfile failed"
# Turn off cancel_unused_locks_before_replay on this client for the duration
# of the test (set back to "1" below).
4787 lctl set_param -n ldlm.cancel_unused_locks_before_replay "0"
4793 #define OBD_FAIL_TGT_RECOVERY_REQ_RACE 0x721
4794 do_facet $SINGLEMDS "lctl set_param fail_loc=0x721 fail_val=0"
4798 wait_clients_import_state "$clients" mds1 FULL
# clients_up is attempted twice before the test is failed.
4799 clients_up || clients_up || error "failover df: $?"
# Reap the background process identified by $mpid; a non-zero exit fails
# the test.
4802 wait $mpid || error "multiop_bg_pause pid failed"
# Restore fail_loc, the lock-cancel tunable and at_max.
4804 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0"
4805 lctl set_param -n ldlm.cancel_unused_locks_before_replay "1"
4806 at_max_set $at_max_saved mds
4809 run_test 121 "lock replay timed out and race"
# test_130a body (fragment): "DoM file create (setstripe) replay".
# NOTE(review): embedded line numbers skip values (e.g. 4814, 4817-4818); the
# function header and the failover step are not visible in this listing.
# Guard: MDS1 version >= 2.10.90.
4812 [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
4813 skip "Do not support Data-on-MDT before 2.11"
# The file is created after the barrier with a composite layout whose first
# component (up to 1M) uses the "mdt" pattern (-L mdt).
4815 replay_barrier $SINGLEMDS
4816 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
# The layout pattern reported for the file must be "mdt".
4819 [ $($LFS getstripe -L $DIR/$tfile) == "mdt" ] ||
4820 error "Fail to replay DoM file creation"
4822 run_test 130a "DoM file create (setstripe) replay"
# test_130b body (fragment): "DoM file create (inherited) replay".
# NOTE(review): embedded line numbers skip values (e.g. 4827, 4832-4833); the
# function header and the failover step are not visible in this listing.
# Guard: MDS1 version >= 2.10.90.
4825 [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
4826 skip "Do not support Data-on-MDT before 2.11"
# The DoM layout is applied to the directory before the barrier; the file is
# created with plain touch after it, so any "mdt" pattern on the file comes
# from the directory default rather than from an explicit setstripe.
4828 mkdir_on_mdt0 $DIR/$tdir
4829 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tdir
4830 replay_barrier $SINGLEMDS
4831 touch $DIR/$tdir/$tfile
# The layout pattern reported for the new file must be "mdt".
4834 [ $($LFS getstripe -L $DIR/$tdir/$tfile) == "mdt" ] ||
4835 error "Fail to replay DoM file creation"
4837 run_test 130b "DoM file create (inherited) replay"
# test_131a body (fragment): "DoM file write lock replay".
# NOTE(review): embedded line numbers skip values (e.g. 4842, 4847-4848); the
# function header and the failover step are not visible in this listing.
# Guard: MDS1 version >= 2.10.90.
4840 [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
4841 skip "Do not support Data-on-MDT before 2.11"
4843 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
4844 replay_barrier $SINGLEMDS
# bs=8 count=1 limits the copy to at most 8 bytes, the length of "dom_data"
# without the newline echo appends.
4845 echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1
4846 # lock is not canceled and will be replayed
# The content read back must equal the string written after the barrier.
4849 [ $(cat $DIR/$tfile) == "dom_data" ] ||
4850 error "Wrong file content after failover"
4852 run_test 131a "DoM file write lock replay"
# test_131b body (fragment): "DoM file write replay".
# NOTE(review): embedded line numbers skip values (e.g. 4857, 4862-4864); the
# function header and the failover step are not visible in this listing.
# Guard: MDS1 version >= 2.10.90.
4855 [ "$MDS1_VERSION" -lt $(version_code 2.10.90) ] &&
4856 skip "Do not support Data-on-MDT before 2.11"
4858 $LFS setstripe -E 1M -L mdt -E EOF -c 2 $DIR/$tfile
4859 replay_barrier $SINGLEMDS
# bs=8 count=1 limits the copy to at most 8 bytes, the length of "dom_data"
# without the newline echo appends.
4860 echo "dom_data" | dd of=$DIR/$tfile bs=8 count=1
# Unlike test 131a, the client's mdc locks are cancelled after the write.
4861 cancel_lru_locks mdc
# The content read back must equal the string written after the barrier.
4865 [ $(cat $DIR/$tfile) == "dom_data" ] ||
4866 error "Wrong file content after failover"
4868 run_test 131b "DoM file write replay"
# test_132a body (fragment): "PFL new component instantiate replay".
# NOTE(review): embedded line numbers skip values (e.g. 4873, 4880, 4884-4886,
# 4893-4894); the function header, the failover step and the closing fi are
# not visible in this listing.
# Guard: MDS1 version >= 2.12.0.
4871 [ "$MDS1_VERSION" -lt $(version_code 2.12.0) ] &&
4872 skip "Need MDS version 2.12.0 or later"
# Two-component layout: [0, 1M) with 1 stripe, [1M, EOF) with 2 stripes.
4874 $LFS setstripe -E 1M -c 1 -E EOF -c 2 $DIR/$tfile
4875 replay_barrier $SINGLEMDS
4876 # write over the first component size cause next component instantiation
4877 dd if=/dev/urandom of=$DIR/$tfile bs=1M count=1 seek=1 ||
4878 error "dd to $DIR/$tfile failed"
4879 lfs getstripe $DIR/$tfile
# Record the checksum and confirm that the component with ID 2 has objects
# (lmm_objects is present in its getstripe output).
4881 cksum=$(md5sum $DIR/$tfile | awk '{print $1}')
4882 $LFS getstripe -I2 $DIR/$tfile | grep -q lmm_objects ||
4883 error "Component #1 was not instantiated"
# Same layout check repeated, then the checksum is compared with the one
# recorded above; a mismatch is reported with error_noexit.
4887 lfs getstripe $DIR/$tfile
4888 $LFS getstripe -I2 $DIR/$tfile | grep -q lmm_objects ||
4889 error "Component #1 instantiation was not replayed"
4890 cksum2=$(md5sum $DIR/$tfile | awk '{print $1}')
4891 if [ $cksum != $cksum2 ] ; then
4892 error_noexit "New cksum $cksum2 does not match original $cksum"
4895 run_test 132a "PFL new component instantiate replay"
# test_133 body (fragment): "check resend of ongoing requests for lwp during
# failover".
# NOTE(review): embedded line numbers skip values (e.g. 4902-4903, 4908-4909,
# 4911, 4919-4923); the function header, the client unmount, the assignment
# of $pid and the failover step are not visible in this listing.
# Guards: at least 2 MDTs, and in HARD failure mode mds1 and mds2 must not
# report the same facet host.
4898 [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0
4899 ([ $FAILURE_MODE == "HARD" ] &&
4900 [ "$(facet_host mds1)" == "$(facet_host mds2)" ]) &&
4901 skip "MDTs needs to be on diff hosts for HARD fail mode" &&
4904 local remote_dir=$DIR/$tdir/remote_dir
4906 mkdir -p $DIR/$tdir || error "mkdir $DIR/$tdir failed"
# The test directory is created on MDT index 1.
4907 $LFS mkdir -i 1 $remote_dir
# Write "clear" to the seq server space parameter of MDT0001 on mds2.
4910 do_facet mds2 $LCTL set_param seq.srv*MDT0001.space=clear
4912 zconf_mount $(hostname) $MOUNT
4913 client_up || return 1
4915 #define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123
4917 do_facet mds1 $LCTL set_param fail_val=700 fail_loc=0x80000123
# The copy into the MDT1 directory runs in the background while the fail_loc
# is set on mds1; its exit status is collected by the wait below.
4918 cp /etc/hosts $remote_dir/file &
4924 wait $pid || error "cp failed"
4925 rm -rf $DIR/$tdir || error "rmdir failed"
4929 run_test 133 "check resend of ongoing requests for lwp during failover"
# test_134 body (fragment): "replay creation of a file created in a pool".
# NOTE(review): embedded line numbers skip values (e.g. 4935-4936, 4938,
# 4942-4944, 4947-4949); the function header, pool creation, the replay
# barrier and the failover step are not visible in this listing.
# Guards: at least 2 OSTs and MDS1 version >= 2.13.56.
4932 [ $OSTCOUNT -lt 2 ] && skip "needs >= 2 OSTs" && return 0
4933 (( $MDS1_VERSION >= $(version_code 2.13.56) )) ||
4934 skip "need MDS version >= 2.13.56"
4937 pool_add_targets pool_134 1 1
# Directory A gets a plain layout in pool_134, directory B a composite
# (-E EOF) layout in the same pool.
4939 mkdir -p $DIR/$tdir/{A,B}
4940 $LFS setstripe -p pool_134 $DIR/$tdir/A
4941 $LFS setstripe -E EOF -p pool_134 $DIR/$tdir/B
4945 touch $DIR/$tdir/A/$tfile || error "touch non-pfl file failed"
4946 touch $DIR/$tdir/B/$tfile || error "touch pfl failed"
# Both files must exist as regular files at the end of the test.
4950 [ -f $DIR/$tdir/A/$tfile ] || error "non-pfl file does not exist"
4951 [ -f $DIR/$tdir/B/$tfile ] || error "pfl file does not exist"
4953 run_test 134 "replay creation of a file created in a pool"
# test_135 body (fragment): "Server failure in lock replay phase".
# NOTE(review): embedded line numbers skip values (e.g. 4958-4959, 4967-4973,
# 4978-4980, 4986-4991); the function header, the done keyword, the OST
# restart steps and the assignment of $PID are not visible in this listing.
4957 mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed"
# Default layout for the directory: 128 KiB stripe size, starting on OST
# index 0.
4960 $LFS setstripe -S $((128 * 1024)) -i 0 $DIR/$tdir
4964 # Create 20 files so we have 20 ost locks
4965 for i in $(seq 20) ; do
4966 echo blah > $DIR/$tdir/file.${i}
4974 #define OBD_FAIL_TGT_REPLAY_RECONNECT 0x32d
4975 # Make sure lock replay server side never completes and errors out.
4976 do_facet ost1 "$LCTL set_param fail_val=20"
4977 do_facet ost1 "$LCTL set_param fail_loc=0x32d"
4981 # Now make sure we notice
4984 sleep 20 # should we do something proactive to make reconnects go?
# kill -0 only probes whether process $PID still exists; the test fails if
# that process has already gone away.
4985 kill -0 $PID || error "Unexpected sync success"
# Clear the fail_loc on ost1, then issue one more write to the directory.
4992 do_facet ost1 "$LCTL set_param fail_loc=0"
4994 echo blah > $DIR/$tdir/file.test2
4998 run_test 135 "Server failure in lock replay phase"
# test_136 body (fragment): "MDS to disconnect all OSPs first, then cleanup
# ldlm".
# NOTE(review): embedded line numbers skip values (e.g. 5004, 5007-5008,
# 5012-5016); the function header and the steps that stop the MDTs before
# they are started again are not visible in this listing.
# Guards: at least 3 MDTs and MDS1 version >= 2.15.53 (LU-16536 fix).
5001 (( $MDSCOUNT >= 3 )) || skip "needs > 2 MDTs"
5002 (( MDS1_VERSION >= $(version_code 2.15.53) )) ||
5003 skip "need MDS version >= 2.15.53 for LU-16536 fix"
# Directory striped over 3 MDTs starting at MDT index 0; its layout is
# printed for the test log.
5005 $LFS mkdir -i0 -c3 $DIR/$tdir || error "can't mkdir"
5006 $LFS getdirstripe $DIR/$tdir
# The fail_loc is set on every MDS node returned by mdts_nodes.
5009 #define OBD_FAIL_OUT_DROP_DESTROY 0x170b
5010 local mdts=$(comma_list $(mdts_nodes))
5011 do_nodes $mdts $LCTL set_param fail_loc=0x170b
# Start the three MDTs again; each failure is reported with a message in the
# same "MDTn start failed" form (the mds3 message was truncated to
# "MDT3 star").
5017 start mds1 $(mdsdevname 1) $MDS_MOUNT_OPTS || error "MDT1 start failed"
5018 start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || error "MDT2 start failed"
5019 start mds3 $(mdsdevname 3) $MDS_MOUNT_OPTS || error "MDT3 start failed"
5021 run_test 136 "MDS to disconnect all OSPs first, then cleanup ldlm"
# Top-level call made after the last run_test in this chunk; it mirrors the
# check_and_setup_lustre call near the top of the script.
# NOTE(review): presumably performs suite teardown — confirm in the test
# framework.
5024 check_and_cleanup_lustre