7 # This test needs to be run on the client
10 LUSTRE=${LUSTRE:-`dirname $0`/..}
13 . $LUSTRE/tests/test-framework.sh
15 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
16 CHECK_GRANT=${CHECK_GRANT:-"yes"}
17 GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
21 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT"
23 # 63 min 7 min AT AT AT AT"
24 [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 6b 12 16 44 44b 65 66 67 68"
28 cleanup_and_setup_lustre
32 rm -rf $DIR/${TESTSUITE}/[df][0-9]* # bug 13798 new t-f tdir staff
33 rm -rf $DIR/[df][0-9]*
39 run_test 0 "empty replay"
42 # this test attempts to trigger a race in the precreation code,
43 # and must run before any other objects are created on the filesystem
45 createmany -o $DIR/$tfile 20 || return 1
46 unlinkmany $DIR/$tfile 20 || return 2
48 run_test 0b "ensure object created after recover exists. (3284)"
54 $CHECKSTAT -t file $DIR/$tfile || return 1
57 run_test 1 "simple create"
60 do_facet ost1 "sysctl -w lustre.fail_loc=0"
63 local old_last_id=`cat $LPROC/obdfilter/*/last_id`
64 touch -o $DIR/$tfile 1
66 local new_last_id=`cat $LPROC/obdfilter/*/last_id`
68 test "$old_last_id" = "$new_last_id" || {
69 echo "OST object create is caused by MDS"
73 old_last_id=`cat $LPROC/obdfilter/*/last_id`
74 echo "data" > $DIR/$tfile
76 new_last_id=`cat $LPROC/obdfilter/*/last_id`
# Fail if writing data did NOT advance last_id, i.e. no OST object was created
# on write. The right-hand operand previously carried a stray trailing space
# ("$new_last_id "), which made this equality always false.
77 test "$old_last_id" = "$new_last_id" && {
78 echo "CROW does not work on write"
84 #define OBD_FAIL_OST_CROW_EIO | OBD_FAIL_ONCE
85 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000801"
88 old_last_id=`cat $LPROC/obdfilter/*/last_id`
89 echo "data" > $DIR/1a1
91 new_last_id=`cat $LPROC/obdfilter/*/last_id`
92 test "$old_last_id" = "$new_last_id" || {
93 echo "CROW does work with fail_loc=0x80000801"
99 do_facet ost1 "sysctl -w lustre.fail_loc=0"
101 #CROW run_test 1a "CROW object create (check OST last_id)"
107 $CHECKSTAT -t file $DIR/$tfile || return 1
117 $CHECKSTAT -t file $DIR/$tfile || return 1
125 o_directory $DIR/$tfile
127 $CHECKSTAT -t file $DIR/$tfile || return 2
130 run_test 3a "replay failed open(O_DIRECTORY)"
134 #define OBD_FAIL_MDS_OPEN_PACK | OBD_FAIL_ONCE
135 do_facet mds "sysctl -w lustre.fail_loc=0x80000114"
137 do_facet mds "sysctl -w lustre.fail_loc=0"
139 $CHECKSTAT -t file $DIR/$tfile && return 2
142 run_test 3b "replay failed open -ENOMEM"
146 #define OBD_FAIL_MDS_ALLOC_OBDO | OBD_FAIL_ONCE
147 do_facet mds "sysctl -w lustre.fail_loc=0x80000128"
149 do_facet mds "sysctl -w lustre.fail_loc=0"
152 $CHECKSTAT -t file $DIR/$tfile && return 2
155 run_test 3c "replay failed open -ENOMEM"
159 for i in `seq 10`; do
160 echo "tag-$i" > $DIR/$tfile-$i
163 for i in `seq 10`; do
164 grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
167 run_test 4 "|x| 10 open(O_CREAT)s"
173 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true
175 run_test 4b "|x| rm 10 files"
177 # The idea is to get past the first block of precreated files on both
178 # osts, and then replay.
181 for i in `seq 220`; do
182 echo "tag-$i" > $DIR/$tfile-$i
185 for i in `seq 220`; do
186 grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
190 # waiting for commitment of removal
192 run_test 5 "|x| 220 open(O_CREAT)"
197 mcreate $DIR/$tdir/$tfile
199 $CHECKSTAT -t dir $DIR/$tdir || return 1
200 $CHECKSTAT -t file $DIR/$tdir/$tfile || return 2
202 # waiting for log process thread
204 run_test 6 "mkdir + contained create"
210 $CHECKSTAT -t dir $DIR/$tdir && return 1 || true
212 run_test 6b "|X| rmdir"
216 mcreate $DIR/$tdir/$tfile
218 $CHECKSTAT -t dir $DIR/$tdir || return 1
219 $CHECKSTAT -t file $DIR/$tdir/$tfile || return 2
222 run_test 7 "mkdir |X| contained create"
226 multiop $DIR/$tfile mo_c &
231 $CHECKSTAT -t file $DIR/$tfile || return 1
232 kill -USR1 $MULTIPID || return 2
233 wait $MULTIPID || return 3
236 run_test 8 "creat open |X| close"
241 local old_inum=`ls -i $DIR/$tfile | awk '{print $1}'`
243 local new_inum=`ls -i $DIR/$tfile | awk '{print $1}'`
245 echo " old_inum == $old_inum, new_inum == $new_inum"
246 if [ $old_inum -eq $new_inum ] ;
248 echo " old_inum and new_inum match"
250 echo "!!!! old_inum and new_inum NOT match"
255 run_test 9 "|X| create (same inum/gen)"
260 mv $DIR/$tfile $DIR/$tfile-2
263 $CHECKSTAT $DIR/$tfile && return 1
264 $CHECKSTAT $DIR/$tfile-2 ||return 2
268 run_test 10 "create |X| rename unlink"
272 echo "old" > $DIR/$tfile
273 mv $DIR/$tfile $DIR/$tfile-2
275 echo "new" > $DIR/$tfile
277 grep old $DIR/$tfile-2
279 grep new $DIR/$tfile || return 1
280 grep old $DIR/$tfile-2 || return 2
282 run_test 11 "create open write rename |X| create-old-name read"
286 multiop $DIR/$tfile o_tSc &
288 # give multiop a chance to open
293 wait $pid || return 1
296 [ -e $DIR/$tfile ] && return 2
299 run_test 12 "open, unlink |X| close"
302 # 1777 - replay open after committed chmod that would make
303 # a regular open a failure
306 multiop $DIR/$tfile O_wc &
308 # give multiop a chance to open
311 $CHECKSTAT -p 0 $DIR/$tfile
315 wait $pid || return 1
317 $CHECKSTAT -s 1 -p 0 $DIR/$tfile || return 2
320 run_test 13 "open chmod 0 |x| write close"
323 multiop $DIR/$tfile O_tSc &
325 # give multiop a chance to open
329 kill -USR1 $pid || return 1
330 wait $pid || return 2
333 [ -e $DIR/$tfile ] && return 3
336 run_test 14 "open(O_CREAT), unlink |X| close"
339 multiop $DIR/$tfile O_tSc &
341 # give multiop a chance to open
345 touch $DIR/g11 || return 1
347 wait $pid || return 2
350 [ -e $DIR/$tfile ] && return 3
351 touch $DIR/h11 || return 4
354 run_test 15 "open(O_CREAT), unlink |X| touch new, close"
361 mcreate $DIR/$tfile-2
363 [ -e $DIR/$tfile ] && return 1
364 [ -e $DIR/$tfile-2 ] || return 2
365 munlink $DIR/$tfile-2 || return 3
367 run_test 16 "|X| open(O_CREAT), unlink, touch new, unlink new"
371 multiop $DIR/$tfile O_c &
373 # give multiop a chance to open
376 kill -USR1 $pid || return 1
377 wait $pid || return 2
378 $CHECKSTAT -t file $DIR/$tfile || return 3
381 run_test 17 "|X| open(O_CREAT), |replay| close"
385 multiop $DIR/$tfile O_tSc &
387 # give multiop a chance to open
390 touch $DIR/$tfile-2 || return 1
391 echo "pid: $pid will close"
393 wait $pid || return 2
396 [ -e $DIR/$tfile ] && return 3
397 [ -e $DIR/$tfile-2 ] || return 4
398 # this touch frequently fails
399 touch $DIR/$tfile-3 || return 5
400 munlink $DIR/$tfile-2 || return 6
401 munlink $DIR/$tfile-3 || return 7
404 run_test 18 "|X| open(O_CREAT), unlink, touch new, close, touch, unlink"
406 # bug 1855 (a simpler form of test_11 above)
410 echo "old" > $DIR/$tfile
411 mv $DIR/$tfile $DIR/$tfile-2
412 grep old $DIR/$tfile-2
414 grep old $DIR/$tfile-2 || return 2
416 run_test 19 "|X| mcreate, open, write, rename "
420 multiop $DIR/$tfile O_tSc &
422 # give multiop a chance to open
428 wait $pid || return 1
429 [ -e $DIR/$tfile ] && return 2
432 run_test 20 "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
434 test_20b() { # bug 10480
435 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
437 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 &
439 while [ ! -e $DIR/$tfile ] ; do
440 sleep 0.060s # give dd a chance to start
443 lfs getstripe $DIR/$tfile || return 1
444 rm -f $DIR/$tfile || return 2 # make it an orphan
446 df -P $DIR || df -P $DIR || true # reconnect
448 fail mds # start orphan recovery
449 df -P $DIR || df -P $DIR || true # reconnect
450 wait_mds_recovery_done || error "MDS recovery not done"
452 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
453 log "before $BEFOREUSED, after $AFTERUSED"
454 [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
455 error "after $AFTERUSED > before $BEFOREUSED" && return 5
458 run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)"
460 test_20c() { # bug 10480
461 multiop $DIR/$tfile Ow_c &
463 # give multiop a chance to open
470 df -P $DIR || df -P $DIR || true # reconnect
473 test -s $DIR/$tfile || error "File was truncated"
477 run_test 20c "check that client eviction does not affect file content"
481 multiop $DIR/$tfile O_tSc &
483 # give multiop a chance to open
486 touch $DIR/g11 || return 1
490 wait $pid || return 2
491 [ -e $DIR/$tfile ] && return 3
492 touch $DIR/h11 || return 4
495 run_test 21 "|X| open(O_CREAT), unlink touch new, replay, close (test mds_cleanup_orphans)"
498 multiop $DIR/$tfile O_tSc &
500 # give multiop a chance to open
508 wait $pid || return 1
509 [ -e $DIR/$tfile ] && return 2
512 run_test 22 "open(O_CREAT), |X| unlink, replay, close (test mds_cleanup_orphans)"
515 multiop $DIR/$tfile O_tSc &
517 # give multiop a chance to open
522 touch $DIR/g11 || return 1
526 wait $pid || return 2
527 [ -e $DIR/$tfile ] && return 3
528 touch $DIR/h11 || return 4
531 run_test 23 "open(O_CREAT), |X| unlink touch new, replay, close (test mds_cleanup_orphans)"
534 multiop $DIR/$tfile O_tSc &
536 # give multiop a chance to open
543 wait $pid || return 1
544 [ -e $DIR/$tfile ] && return 2
547 run_test 24 "open(O_CREAT), replay, unlink, close (test mds_cleanup_orphans)"
550 multiop $DIR/$tfile O_tSc &
552 # give multiop a chance to open
559 wait $pid || return 1
560 [ -e $DIR/$tfile ] && return 2
563 run_test 25 "open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
567 multiop $DIR/$tfile-1 O_tSc &
569 multiop $DIR/$tfile-2 O_tSc &
571 # give multiop a chance to open
576 wait $pid2 || return 1
580 wait $pid1 || return 2
581 [ -e $DIR/$tfile-1 ] && return 3
582 [ -e $DIR/$tfile-2 ] && return 4
585 run_test 26 "|X| open(O_CREAT), unlink two, close one, replay, close one (test mds_cleanup_orphans)"
589 multiop $DIR/$tfile-1 O_tSc &
591 multiop $DIR/$tfile-2 O_tSc &
593 # give multiop a chance to open
600 wait $pid1 || return 1
602 wait $pid2 || return 2
603 [ -e $DIR/$tfile-1 ] && return 3
604 [ -e $DIR/$tfile-2 ] && return 4
607 run_test 27 "|X| open(O_CREAT), unlink two, replay, close two (test mds_cleanup_orphans)"
610 multiop $DIR/$tfile-1 O_tSc &
612 multiop $DIR/$tfile-2 O_tSc &
614 # give multiop a chance to open
620 wait $pid2 || return 1
624 wait $pid1 || return 2
625 [ -e $DIR/$tfile-1 ] && return 3
626 [ -e $DIR/$tfile-2 ] && return 4
629 run_test 28 "open(O_CREAT), |X| unlink two, close one, replay, close one (test mds_cleanup_orphans)"
632 multiop $DIR/$tfile-1 O_tSc &
634 multiop $DIR/$tfile-2 O_tSc &
636 # give multiop a chance to open
644 wait $pid1 || return 1
646 wait $pid2 || return 2
647 [ -e $DIR/$tfile-1 ] && return 3
648 [ -e $DIR/$tfile-2 ] && return 4
651 run_test 29 "open(O_CREAT), |X| unlink two, replay, close two (test mds_cleanup_orphans)"
654 multiop $DIR/$tfile-1 O_tSc &
656 multiop $DIR/$tfile-2 O_tSc &
658 # give multiop a chance to open
666 wait $pid1 || return 1
668 wait $pid2 || return 2
669 [ -e $DIR/$tfile-1 ] && return 3
670 [ -e $DIR/$tfile-2 ] && return 4
673 run_test 30 "open(O_CREAT) two, unlink two, replay, close two (test mds_cleanup_orphans)"
676 multiop $DIR/$tfile-1 O_tSc &
678 multiop $DIR/$tfile-2 O_tSc &
680 # give multiop a chance to open
688 wait $pid1 || return 1
690 wait $pid2 || return 2
691 [ -e $DIR/$tfile-1 ] && return 3
692 [ -e $DIR/$tfile-2 ] && return 4
695 run_test 31 "open(O_CREAT) two, unlink one, |X| unlink one, close two (test mds_cleanup_orphans)"
697 # tests for bug 2104; completion without crashing is success. The close is
698 # stale, but we always return 0 for close, so the app never sees it.
700 multiop $DIR/$tfile O_c &
702 multiop $DIR/$tfile O_c &
704 # give multiop a chance to open
# Try df once; only if it fails, wait a second and retry; give up with status 1
# if the retry fails too. The braces are required: without them the list
# parses as ((df || sleep 1) && df) || return 1, which runs df a second time
# even after the first attempt succeeded.
707 df $MOUNT || { sleep 1 && df $MOUNT; } || return 1
713 run_test 32 "close() notices client eviction; close() after client eviction"
715 # Abort recovery before client complete
718 createmany -o $DIR/$tfile-%d 100
720 # this file should be gone, because the replay was aborted
721 $CHECKSTAT -t file $DIR/$tfile-* && return 3
722 unlinkmany $DIR/$tfile-%d 0 100
725 run_test 33 "abort recovery before client does replay"
728 multiop $DIR/$tfile O_c &
730 # give multiop a chance to open
737 [ -e $DIR/$tfile ] && return 1
741 run_test 34 "abort recovery before client does replay (test mds_cleanup_orphans)"
743 # bug 2278 - generate one orphan on OST, then destroy it during recovery from llog
747 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
748 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
753 # give a chance to remove from MDS
755 $CHECKSTAT -t file $DIR/$tfile && return 1 || true
757 run_test 35 "test recovery from llog for unlink op"
759 # b=2432 resent cancel after replay uses wrong cookie,
760 # so don't resend cancels
764 checkstat $DIR/$tfile
767 if dmesg | grep "unknown lock cookie"; then
768 echo "cancel after replay failed"
772 run_test 36 "don't resend cancel"
775 # directory orphans can't be unlinked from PENDING directory
777 rmdir $DIR/$tfile 2>/dev/null
778 multiop $DIR/$tfile dD_c &
780 # give multiop a chance to open
785 # clear the dmesg buffer so we only see errors from this recovery
789 dmesg | grep "mds_unlink_orphan.*error .* unlinking orphan" && return 1
793 run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
796 createmany -o $DIR/$tfile-%d 800
797 unlinkmany $DIR/$tfile-%d 0 400
800 unlinkmany $DIR/$tfile-%d 400 400
802 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true
804 run_test 38 "test recovery from unlink llog (test llog_gen_rec) "
806 test_39() { # bug 4176
807 createmany -o $DIR/$tfile-%d 800
809 unlinkmany $DIR/$tfile-%d 0 400
811 unlinkmany $DIR/$tfile-%d 400 400
814 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true
816 run_test 39 "test recovery from unlink llog (test llog_gen_rec) "
819 awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }' $LPROC/osc/*/stats
824 $LCTL mark multiop $MOUNT/$tfile OS_c
825 multiop $MOUNT/$tfile OS_c &
827 writeme -s $MOUNT/${tfile}-2 &
831 #define OBD_FAIL_MDS_CONNECT_NET 0x117
832 do_facet mds "sysctl -w lustre.fail_loc=0x80000117"
834 stat1=`count_ost_writes`
836 stat2=`count_ost_writes`
837 echo "$stat1, $stat2"
838 if [ $stat1 -lt $stat2 ]; then
839 echo "writes continuing during recovery"
842 echo "writes not continuing during recovery, bug 2477"
845 echo "waiting for writeme $WRITE_PID"
849 echo "waiting for multiop $PID"
850 wait $PID || return 2
851 do_facet client munlink $MOUNT/$tfile || return 3
852 do_facet client munlink $MOUNT/${tfile}-2 || return 3
855 run_test 40 "cause recovery in ptlrpc, ensure IO continues"
859 # make sure that a read to one osc doesn't try to double-unlock its page just
860 # because another osc is invalid. trigger_group_io used to mistakenly return
861 # an error if any oscs were invalid even after having successfully put rpcs
862 # on valid oscs. This was fatal if the caller was ll_readpage who unlocked
863 # the page, guaranteeing that the unlock from the RPC completion would
864 # assert on trying to unlock the unlocked page.
866 [ $OSTCOUNT -lt 2 ] && \
867 skip "skipping test 41: we don't have a second OST to test with" && \
870 local f=$MOUNT/$tfile
871 # make sure the start of the file is ost1
872 lfs setstripe $f -s $((128 * 1024)) -i 0
873 do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3
875 # fail ost2 and read from ost1
876 local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'`
877 [ "$osc2dev" ] || return 4
878 $LCTL --device $osc2dev deactivate || return 1
879 do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 3
880 $LCTL --device $osc2dev activate || return 2
883 run_test 41 "read from a valid osc while other oscs are invalid"
885 # test MDS recovery after ost failure
887 blocks=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
888 createmany -o $DIR/$tfile-%d 800
890 unlinkmany $DIR/$tfile-%d 0 400
892 sysctl -w lnet.debug=-1
895 # osc is evicted, fs is smaller (but only with failout OSTs (bug 7287))
896 #blocks_after=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
897 #[ $blocks_after -lt $blocks ] || return 1
898 echo wait for MDS to timeout and recover
899 sleep $((TIMEOUT * 2))
901 unlinkmany $DIR/$tfile-%d 400 400
902 $CHECKSTAT -t file $DIR/$tfile-* && return 2 || true
904 run_test 42 "recovery after ost failure"
906 # timeout in MDS/OST recovery RPC will LBUG MDS
907 test_43() { # bug 2530
910 # OBD_FAIL_OST_CREATE_NET 0x204
911 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000204"
914 do_facet ost1 "sysctl -w lustre.fail_loc=0"
918 run_test 43 "mds osc import failure during recovery; don't LBUG"
# Look up the device number (first column) of the MDC entry in the device list.
921 mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
# No MDC device found: fail this test only. 'return' (rather than 'exit')
# lets the remaining tests and the final check_and_cleanup_lustre still run.
922 [ "$mdcdev" ] || return 2
923 # adaptive timeouts slow this way down
924 local at_max=$(do_facet mds "find /sys/ -name at_max")
925 [ -z "$at_max" ] && skip "missing /sys/.../at_max" && return 0
926 MDS_AT_MAX=$(do_facet mds "cat $at_max")
927 do_facet mds "echo 40 >> $at_max"
928 for i in `seq 1 10`; do
929 echo "$i of 10 ($(date +%s))"
930 do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
931 #define OBD_FAIL_TGT_CONN_RACE 0x701
932 do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
933 $LCTL --device $mdcdev recover
936 do_facet mds "sysctl -w lustre.fail_loc=0"
937 do_facet mds "echo $MDS_AT_MAX >> $at_max"
940 run_test 44 "race in target handle connect"
# Look up the device number (first column) of the MDC entry in the device list.
943 mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
# No MDC device found: fail this test only. 'return' (rather than 'exit')
# lets the remaining tests and the final check_and_cleanup_lustre still run.
944 [ "$mdcdev" ] || return 2
945 for i in `seq 1 10`; do
946 echo "$i of 10 ($(date +%s))"
947 do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
948 #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
949 do_facet mds "sysctl -w lustre.fail_loc=0x80000704"
950 $LCTL --device $mdcdev recover
953 do_facet mds "sysctl -w lustre.fail_loc=0"
956 run_test 44b "race in target handle connect"
958 # Handle failed close
# Look up the device number (first column) of the MDC entry in the device list.
960 mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
# No MDC device found: fail this test only. 'return' (rather than 'exit')
# lets the remaining tests and the final check_and_cleanup_lustre still run.
961 [ "$mdcdev" ] || return 2
# Ask the MDC import to recover before the test starts manipulating it.
962 $LCTL --device $mdcdev recover
964 multiop $DIR/$tfile O_c &
968 # This will cause the CLOSE to fail before even
969 # allocating a reply buffer
970 $LCTL --device $mdcdev deactivate || return 4
974 wait $pid || return 1
976 $LCTL --device $mdcdev activate || return 5
979 $CHECKSTAT -t file $DIR/$tfile || return 2
982 run_test 45 "Handle failed close"
986 drop_reply "touch $DIR/$tfile"
988 # ironically, the previous test, 45, will cause a real forced close,
989 # so just look for one for this test
990 dmesg | grep -i "force closing client file handle for $tfile" && return 1
993 run_test 46 "Don't leak file handle after open resend (3325)"
995 test_47() { # bug 2824
996 # create some files to make sure precreate has been done on all
997 # OSTs. (just in case this test is run independently)
998 createmany -o $DIR/$tfile 20 || return 1
1000 # OBD_FAIL_OST_CREATE_NET 0x204
1002 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000204"
1003 df $MOUNT || return 2
1005 # let the MDS discover the OST failure, attempt to recover, fail
1006 # and recover again.
1007 sleep $((3 * TIMEOUT))
1009 # Without 2824, this createmany would hang
1010 createmany -o $DIR/$tfile 20 || return 3
1011 unlinkmany $DIR/$tfile 20 || return 4
1013 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1016 run_test 47 "MDS->OSC failure during precreate cleanup (2824)"
1020 createmany -o $DIR/$tfile 20 || return 1
1021 # OBD_FAIL_OST_EROFS 0x216
1023 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000216"
1024 df $MOUNT || return 2
1026 createmany -o $DIR/$tfile 20 20 || return 2
1027 unlinkmany $DIR/$tfile 40 || return 3
1029 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1032 run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
1035 local oscdev=`do_facet mds grep \'${ost1_svc}-osc \' $LPROC/devices | awk '{print $1}' | head -1`
1036 [ "$oscdev" ] || return 1
1037 do_facet mds $LCTL --device $oscdev recover || return 2
1038 do_facet mds $LCTL --device $oscdev recover || return 3
1039 # give the mds_lov_sync threads a chance to run
1042 run_test 50 "Double OSC recovery, don't LASSERT (3812)"
1044 # b3764 timed out lock replay
1047 cancel_lru_locks mdc
1049 multiop $DIR/$tfile s || return 1
1051 #define OBD_FAIL_LDLM_REPLY 0x30c
1052 do_facet mds "sysctl -w lustre.fail_loc=0x8000030c"
1053 fail mds || return 2
1054 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1056 $CHECKSTAT -t file $DIR/$tfile-* && return 3 || true
1058 run_test 52 "time out lock replay (3764)"
1060 # bug 3462 - simultaneous MDC requests
1062 mkdir -p $DIR/${tdir}-1
1063 mkdir -p $DIR/${tdir}-2
1064 multiop $DIR/${tdir}-1/f O_c &
1066 # give multiop a chance to open
1069 #define OBD_FAIL_MDS_CLOSE_NET 0x115
1070 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
1071 kill -USR1 $close_pid
1072 cancel_lru_locks MDC # force the close
1073 do_facet mds "sysctl -w lustre.fail_loc=0"
1074 mcreate $DIR/${tdir}-2/f || return 1
1076 # close should still be here
1077 [ -d /proc/$close_pid ] || return 2
1078 replay_barrier_nodf mds
1080 wait $close_pid || return 3
1082 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
1083 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
1084 rm -rf $DIR/${tdir}-*
1086 run_test 53a "|X| close request while two MDC requests in flight"
1089 mkdir -p $DIR/$tdir-1
1090 mkdir -p $DIR/$tdir-2
1091 multiop $DIR/$tdir-1/f O_c &
1094 #define OBD_FAIL_MDS_REINT_NET 0x107
1095 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
1096 mcreate $DIR/${tdir}-2/f &
1100 do_facet mds "sysctl -w lustre.fail_loc=0"
1101 kill -USR1 $close_pid
1102 cancel_lru_locks MDC # force the close
1103 wait $close_pid || return 1
1104 # open should still be here
1105 [ -d /proc/$open_pid ] || return 2
1107 replay_barrier_nodf mds
1109 wait $open_pid || return 3
1111 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
1112 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
1113 rm -rf $DIR/${tdir}-*
1115 run_test 53b "|X| open request while two MDC requests in flight"
1118 mkdir -p $DIR/${tdir}-1
1119 mkdir -p $DIR/${tdir}-2
1120 multiop $DIR/${tdir}-1/f O_c &
1123 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
1124 mcreate $DIR/${tdir}-2/f &
1128 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
1129 kill -USR1 $close_pid
1130 cancel_lru_locks MDC # force the close
1132 replay_barrier_nodf mds
1134 wait $open_pid || return 1
1136 # close should be gone
1137 [ -d /proc/$close_pid ] && return 2
1138 do_facet mds "sysctl -w lustre.fail_loc=0"
1140 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
1141 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
1142 rm -rf $DIR/${tdir}-*
1144 run_test 53c "|X| open request and close request while two MDC requests in flight"
1147 mkdir -p $DIR/${tdir}-1
1148 mkdir -p $DIR/${tdir}-2
1149 multiop $DIR/${tdir}-1/f O_c &
1151 # give multiop a chance to open
1154 #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b
1155 do_facet mds "sysctl -w lustre.fail_loc=0x8000013b"
1156 kill -USR1 $close_pid
1157 cancel_lru_locks MDC # force the close
1158 do_facet mds "sysctl -w lustre.fail_loc=0"
1159 mcreate $DIR/${tdir}-2/f || return 1
1161 # close should still be here
1162 [ -d /proc/$close_pid ] || return 2
1163 replay_barrier_nodf mds
1165 wait $close_pid || return 3
1167 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
1168 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
1169 rm -rf $DIR/${tdir}-*
1171 run_test 53d "|X| close reply while two MDC requests in flight"
1174 mkdir -p $DIR/$tdir-1
1175 mkdir -p $DIR/$tdir-2
1176 multiop $DIR/$tdir-1/f O_c &
1179 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
1180 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
1181 mcreate $DIR/${tdir}-2/f &
1185 do_facet mds "sysctl -w lustre.fail_loc=0"
1186 kill -USR1 $close_pid
1187 cancel_lru_locks MDC # force the close
1188 wait $close_pid || return 1
1189 # open should still be here
1190 [ -d /proc/$open_pid ] || return 2
1192 replay_barrier_nodf mds
1194 wait $open_pid || return 3
1196 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 4
1197 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 5
1198 rm -rf $DIR/${tdir}-*
1200 run_test 53e "|X| open reply while two MDC requests in flight"
1203 mkdir -p $DIR/${tdir}-1
1204 mkdir -p $DIR/${tdir}-2
1205 multiop $DIR/${tdir}-1/f O_c &
1208 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
1209 mcreate $DIR/${tdir}-2/f &
1213 do_facet mds "sysctl -w lustre.fail_loc=0x8000013b"
1214 kill -USR1 $close_pid
1215 cancel_lru_locks MDC
1217 replay_barrier_nodf mds
1219 wait $open_pid || return 1
1221 #close should be gone
1222 [ -d /proc/$close_pid ] && return 2
1223 do_facet mds "sysctl -w lustre.fail_loc=0"
1225 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
1226 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
1227 rm -rf $DIR/${tdir}-*
1229 run_test 53f "|X| open reply and close reply while two MDC requests in flight"
1232 mkdir -p $DIR/${tdir}-1
1233 mkdir -p $DIR/${tdir}-2
1234 multiop $DIR/${tdir}-1/f O_c &
1237 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
1238 mcreate $DIR/${tdir}-2/f &
1242 do_facet mds "sysctl -w lustre.fail_loc=0x80000115"
1243 kill -USR1 $close_pid
1244 cancel_lru_locks MDC # force the close
1246 do_facet mds "sysctl -w lustre.fail_loc=0"
1247 replay_barrier_nodf mds
1249 wait $open_pid || return 1
1251 # close should be gone
1252 [ -d /proc/$close_pid ] && return 2
1254 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
1255 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
1256 rm -rf $DIR/${tdir}-*
1258 run_test 53g "|X| drop open reply and close request while close and open are both in flight"
1261 mkdir -p $DIR/${tdir}-1
1262 mkdir -p $DIR/${tdir}-2
1263 multiop $DIR/${tdir}-1/f O_c &
1266 do_facet mds "sysctl -w lustre.fail_loc=0x80000107"
1267 mcreate $DIR/${tdir}-2/f &
1271 do_facet mds "sysctl -w lustre.fail_loc=0x8000013b"
1272 kill -USR1 $close_pid
1273 cancel_lru_locks MDC # force the close
1276 replay_barrier_nodf mds
1278 wait $open_pid || return 1
1280 # close should be gone
1281 [ -d /proc/$close_pid ] && return 2
1282 do_facet mds "sysctl -w lustre.fail_loc=0"
1284 $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3
1285 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4
1286 rm -rf $DIR/${tdir}-*
1288 run_test 53h "|X| open request and close reply while two MDC requests in flight"
1290 #b3761 ASSERTION(hash != 0) failed
1292 # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
1293 do_facet mds "sysctl -w lustre.fail_loc=0x8000012b"
1295 # give touch a chance to run
1297 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1301 run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
1303 #b3440 ASSERTION(rec->ur_fid2->id) failed
1305 ln -s foo $DIR/$tfile
1307 #drop_reply "cat $DIR/$tfile"
1311 run_test 56 "don't replay a symlink open request (3440)"
1313 # recover one mds-ost setattr from llog
1315 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1316 do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
1321 $CHECKSTAT -t file $DIR/$tfile || return 1
1322 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1325 run_test 57 "test recovery from llog for setattr op"
1327 # recover many mds-ost setattrs from llog
1329 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1330 do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
1331 createmany -o $DIR/$tdir/$tfile-%d 2500
1335 $CHECKSTAT -t file $DIR/$tdir/$tfile-* >/dev/null || return 1
1336 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1337 unlinkmany $DIR/$tdir/$tfile-%d 2500
1340 run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"
1342 # log_commit_thread vs filter_destroy race used to lead to import use after free
1345 createmany -o $DIR/$tdir/$tfile-%d 200
1347 unlinkmany $DIR/$tdir/$tfile-%d 200
1348 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
1349 do_facet ost1 "sysctl -w lustre.fail_loc=0x507"
1352 do_facet ost1 "sysctl -w lustre.fail_loc=0x0"
1356 run_test 59 "test log_commit_thread vs filter_destroy race"
1358 # race between add unlink llog vs cat log init in post_recovery (only for b1_6)
1359 # bug 12086: there should be no oops and no "No ctxt" error for this test
1361 createmany -o $DIR/$tdir/$tfile-%d 200
1363 unlinkmany $DIR/$tdir/$tfile-%d 0 100
1365 unlinkmany $DIR/$tdir/$tfile-%d 100 100
1366 local no_ctxt=`dmesg | grep "No ctxt"`
1367 [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
1369 run_test 60 "test llog post recovery init vs llog unlink"
1371 #test race llog recovery thread vs llog cleanup
1373 createmany -o $DIR/$tdir/$tfile-%d 800
1375 # OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
1376 unlinkmany $DIR/$tdir/$tfile-%d 800
1377 do_facet ost "sysctl -w lustre.fail_loc=0x80000221"
1382 do_facet ost "sysctl -w lustre.fail_loc=0x0"
1383 $CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
1386 run_test 61a "test race llog recovery vs llog cleanup"
1388 #test race mds llog sync vs llog cleanup
1390 # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
1391 do_facet mds "sysctl -w lustre.fail_loc=0x8000013a"
1395 do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 || return 1
1397 run_test 61b "test race mds llog sync vs llog cleanup"
1399 #test race cancel cookie cb vs llog cleanup
1401 # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
1403 do_facet ost "sysctl -w lustre.fail_loc=0x80000222"
# Register test 61c. The description names the cancel-cookie callback race this
# test injects (fail_loc 0x222); it previously repeated 61b's description.
1408 run_test 61c "test race cancel cookie cb vs llog cleanup"
1411 at_start() #bug 3055
1413 if [ -z "$ATOLDBASE" ]; then
1414 local at_history=$(do_facet mds "find /sys/ -name at_history")
1415 [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1
1416 ATOLDBASE=$(do_facet mds "cat $at_history")
1417 # speed up the timebase so we can check decreasing AT
1418 do_facet mds "echo 8 >> $at_history"
1419 do_facet ost1 "echo 8 >> $at_history"
1423 test_65a() #bug 3055
1425 at_start || return 0
1426 $LCTL dk > /dev/null
1428 sysctl -w lnet.debug="+other"
1429 # slow down a request
1430 do_facet mds sysctl -w lustre.fail_val=30000
1431 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1432 do_facet mds sysctl -w lustre.fail_loc=0x8000050a
1433 createmany -o $DIR/$tfile 10 > /dev/null
1434 unlinkmany $DIR/$tfile 10 > /dev/null
1435 # check for log message
1436 $LCTL dk | grep "Early reply #" || error "No early reply"
1437 # client should show 30s estimates
1438 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1440 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1442 run_test 65a "AT: verify early replies"
1444 test_65b() #bug 3055
1446 at_start || return 0
1449 sysctl -w lnet.debug="+other"
1450 $LCTL dk > /dev/null
1451 # slow down bulk i/o
1452 do_facet ost1 sysctl -w lustre.fail_val=30
1453 #define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224
1454 do_facet ost1 sysctl -w lustre.fail_loc=0x224
1457 lfs setstripe $DIR/$tfile --index=0 --count=1
1458 # force some real bulk transfer
1459 multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c
1461 do_facet ost1 sysctl -w lustre.fail_loc=0
1462 # check for log message
1463 $LCTL dk | grep "Early reply #" || error "No early reply"
1465 # client should show 30s estimates
1466 grep portal $LPROC/osc/${FSNAME}-OST0000-osc-*/timeouts
1468 run_test 65b "AT: verify early replies on packed reply / bulk"
1470 test_66a() #bug 3055
1472 at_start || return 0
1473 grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1474 # adjust 5s at a time so no early reply is sent (within deadline)
1475 do_facet mds "sysctl -w lustre.fail_val=5000"
1476 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1477 do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
1478 createmany -o $DIR/$tfile 20 > /dev/null
1479 unlinkmany $DIR/$tfile 20 > /dev/null
1480 grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1481 do_facet mds "sysctl -w lustre.fail_val=10000"
1482 do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
1483 createmany -o $DIR/$tfile 20 > /dev/null
1484 unlinkmany $DIR/$tfile 20 > /dev/null
1485 grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1486 do_facet mds "sysctl -w lustre.fail_loc=0"
1488 createmany -o $DIR/$tfile 20 > /dev/null
1489 unlinkmany $DIR/$tfile 20 > /dev/null
1490 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12"
1491 CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
1492 WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
1493 echo "Current MDT timeout $CUR, worst $WORST"
1494 [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
1496 run_test 66a "AT: verify MDT service time adjusts with no early replies"
1498 test_66b() #bug 3055
# Body of test_66b: check that the client's "network" latency estimate grows
# after a reply is delayed.
# If at_start fails, leave the test with success.
1500 at_start || return 0
# Baseline: 4th field of the "network" line in the MDC timeouts file. Uses
# ${FSNAME} like the CUR/WORST reads below, so the test also works when the
# filesystem is not named "lustre".
1501 ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
# Arm the fail point with a value 5 above the baseline.
# NOTE(review): presumably fail_val is the reply delay in seconds - confirm.
1502 sysctl -w lustre.fail_val=$(($ORIG + 5))
1503 #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
1504 sysctl -w lustre.fail_loc=0x50c
# Issue one request against the filesystem; its output and status are ignored.
1505 ls $DIR/$tfile > /dev/null 2>&1
1506 sysctl -w lustre.fail_loc=0
# Re-read the current (field 4) and worst (field 6) network estimates.
1507 CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
1508 WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
1509 echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
# The worst-case estimate must now exceed the baseline.
1510 [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
1512 run_test 66b "AT: verify net latency adjusts"
1514 test_67a() #bug 3055
1516 at_start || return 0
1517 CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1518 # sleeping threads may drive values above this
1519 do_facet ost1 "sysctl -w lustre.fail_val=400"
1520 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1521 do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
1522 createmany -o $DIR/$tfile 20 > /dev/null
1523 unlinkmany $DIR/$tfile 20 > /dev/null
1524 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1525 CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1526 ATTEMPTS=$(($CONN2 - $CONN1))
1527 echo "$ATTEMPTS osc reconnect attemps on gradual slow"
1528 [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
1531 run_test 67a "AT: verify slow request processing doesn't induce reconnects"
1533 test_67b() #bug 3055
1535 at_start || return 0
1536 CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1537 #define OBD_FAIL_OST_PAUSE_CREATE 0x223
1538 do_facet ost1 "sysctl -w lustre.fail_val=20000"
1539 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
1540 cp /etc/profile $DIR/$tfile || error "cp failed"
1542 cat $LPROC/ost/OSS/ost_create/timeouts
1544 CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1545 ATTEMPTS=$(($CONN2 - $CONN1))
1546 echo "$ATTEMPTS osc reconnect attemps on instant slow"
1547 # do it again; should not timeout
1548 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
1549 cp /etc/profile $DIR/$tfile || error "cp failed"
1550 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1552 cat $LPROC/ost/OSS/ost_create/timeouts
1553 CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1554 ATTEMPTS=$(($CONN3 - $CONN2))
1555 echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
1556 [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect"
1559 run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
1561 test_68 () #bug 13813
1563 at_start || return 0
1564 local ldlm_enqueue_min=$(find /sys -name ldlm_enqueue_min)
1565 [ -z "$ldlm_enqueue_min" ] && skip "missing /sys/.../ldlm_enqueue_min" && return 0
1566 local ENQ_MIN=$(cat $ldlm_enqueue_min)
1567 echo $TIMEOUT >> $ldlm_enqueue_min
1568 rm -f $DIR/${tfile}_[1-2]
1569 lfs setstripe $DIR/$tfile --index=0 --count=1
1570 #define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312
1571 sysctl -w lustre.fail_val=$(($TIMEOUT - 1))
1572 sysctl -w lustre.fail_loc=0x80000312
1573 cp /etc/profile $DIR/${tfile}_1 || error "1st cp failed $?"
1574 sysctl -w lustre.fail_val=$((TIMEOUT * 3 / 2))
1575 sysctl -w lustre.fail_loc=0x80000312
1576 cp /etc/profile $DIR/${tfile}_2 || error "2nd cp failed $?"
1577 sysctl -w lustre.fail_loc=0
1578 echo $ENQ_MIN >> $ldlm_enqueue_min
1581 run_test 68 "AT: verify slowing locks"
1583 if [ -n "$ATOLDBASE" ]; then
1584 at_history=$(do_facet mds "find /sys/ -name at_history")
1585 do_facet mds "echo $ATOLDBASE >> $at_history" || true
1586 do_facet ost1 "echo $ATOLDBASE >> $at_history" || true
1588 # end of AT tests includes above lines
# Suite epilogue: announce completion (prefixed with this script's basename),
# run the framework's check_and_cleanup_lustre, then print the suite log.
1590 equals_msg `basename $0`: test complete, cleaning up
1591 check_and_cleanup_lustre
# Print the log only if $TESTSUITELOG is a regular file; the trailing
# '|| true' forces a zero status when the file is absent or cat fails.
1592 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true