7 # This test needs to be run on the client
10 LUSTRE=${LUSTRE:-`dirname $0`/..}
13 . $LUSTRE/tests/test-framework.sh
15 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
16 CHECK_GRANT=${CHECK_GRANT:-"yes"}
17 GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""}
21 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT"
25 cleanup_and_setup_lustre
33 run_test 0 "empty replay"
36 # this test attempts to trigger a race in the precreation code,
37 # and must run before any other objects are created on the filesystem
39 createmany -o $DIR/$tfile 20 || return 1
40 unlinkmany $DIR/$tfile 20 || return 2
42 run_test 0b "ensure object created after recover exists. (3284)"
48 $CHECKSTAT -t file $DIR/$tfile || return 1
51 run_test 1 "simple create"
54 do_facet ost1 "sysctl -w lustre.fail_loc=0"
57 local old_last_id=`cat $LPROC/obdfilter/*/last_id`
58 touch -o $DIR/$tfile 1
60 local new_last_id=`cat $LPROC/obdfilter/*/last_id`
62 test "$old_last_id" = "$new_last_id" || {
63 echo "OST object create is caused by MDS"
67 old_last_id=`cat $LPROC/obdfilter/*/last_id`
68 echo "data" > $DIR/$tfile
70 new_last_id=`cat $LPROC/obdfilter/*/last_id`
71 test "$old_last_id" = "$new_last_id "&& {
72 echo "CROW does not work on write"
78 #define OBD_FAIL_OST_CROW_EIO | OBD_FAIL_ONCE
79 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000801"
82 old_last_id=`cat $LPROC/obdfilter/*/last_id`
83 echo "data" > $DIR/1a1
85 new_last_id=`cat $LPROC/obdfilter/*/last_id`
86 test "$old_last_id" = "$new_last_id" || {
87 echo "CROW does work with fail_loc=0x80000801"
93 do_facet ost1 "sysctl -w lustre.fail_loc=0"
95 #CROW run_test 1a "CROW object create (check OST last_id)"
101 $CHECKSTAT -t file $DIR/$tfile || return 1
111 $CHECKSTAT -t file $DIR/$tfile || return 1
119 o_directory $DIR/$tfile
121 $CHECKSTAT -t file $DIR/$tfile || return 2
124 run_test 3a "replay failed open(O_DIRECTORY)"
128 #define OBD_FAIL_MDS_OPEN_PACK | OBD_FAIL_ONCE
129 do_facet mds "sysctl -w lustre.fail_loc=0x80000114"
131 do_facet mds "sysctl -w lustre.fail_loc=0"
133 $CHECKSTAT -t file $DIR/$tfile && return 2
136 run_test 3b "replay failed open -ENOMEM"
140 #define OBD_FAIL_MDS_ALLOC_OBDO | OBD_FAIL_ONCE
141 do_facet mds "sysctl -w lustre.fail_loc=0x80000128"
143 do_facet mds "sysctl -w lustre.fail_loc=0"
146 $CHECKSTAT -t file $DIR/$tfile && return 2
149 run_test 3c "replay failed open -ENOMEM"
153 for i in `seq 10`; do
154 echo "tag-$i" > $DIR/$tfile-$i
157 for i in `seq 10`; do
158 grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i"
161 run_test 4 "|x| 10 open(O_CREAT)s"
167 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true
169 run_test 4b "|x| rm 10 files"
171 # The idea is to get past the first block of precreated files on both
172 # osts, and then replay.
175 for i in `seq 220`; do
176 echo "tag-$i" > $DIR/$tfile-$i
179 for i in `seq 220`; do
180 grep -q "tag-$i" $DIR/$tfile-$i || error "f1c-$i"
184 # waiting for commitment of removal
186 run_test 5 "|x| 220 open(O_CREAT)"
192 mcreate $DIR/$tdir/$tfile
194 $CHECKSTAT -t dir $DIR/$tdir || return 1
195 $CHECKSTAT -t file $DIR/$tdir/$tfile || return 2
197 # waiting for log process thread
199 run_test 6 "mkdir + contained create"
205 $CHECKSTAT -t dir $DIR/$tdir && return 1 || true
207 run_test 6b "|X| rmdir"
212 mcreate $DIR/$tdir/$tfile
214 $CHECKSTAT -t dir $DIR/$tdir || return 1
215 $CHECKSTAT -t file $DIR/$tdir/$tfile || return 2
218 run_test 7 "mkdir |X| contained create"
222 multiop $DIR/$tfile mo_c &
227 $CHECKSTAT -t file $DIR/$tfile || return 1
228 kill -USR1 $MULTIPID || return 2
229 wait $MULTIPID || return 3
232 run_test 8 "creat open |X| close"
237 local old_inum=`ls -i $DIR/$tfile | awk '{print $1}'`
239 local new_inum=`ls -i $DIR/$tfile | awk '{print $1}'`
241 echo " old_inum == $old_inum, new_inum == $new_inum"
242 if [ $old_inum -eq $new_inum ] ;
244 echo " old_inum and new_inum match"
246 echo "!!!! old_inum and new_inum NOT match"
251 run_test 9 "|X| create (same inum/gen)"
256 mv $DIR/$tfile $DIR/$tfile-2
259 $CHECKSTAT $DIR/$tfile && return 1
260 $CHECKSTAT $DIR/$tfile-2 ||return 2
264 run_test 10 "create |X| rename unlink"
268 echo "old" > $DIR/$tfile
269 mv $DIR/$tfile $DIR/$tfile-2
271 echo "new" > $DIR/$tfile
273 grep old $DIR/$tfile-2
275 grep new $DIR/$tfile || return 1
276 grep old $DIR/$tfile-2 || return 2
278 run_test 11 "create open write rename |X| create-old-name read"
282 multiop $DIR/$tfile o_tSc &
284 # give multiop a chance to open
289 wait $pid || return 1
292 [ -e $DIR/$tfile ] && return 2
295 run_test 12 "open, unlink |X| close"
298 # 1777 - replay open after committed chmod that would make
299 # a regular open a failure
302 multiop $DIR/$tfile O_wc &
304 # give multiop a chance to open
307 $CHECKSTAT -p 0 $DIR/$tfile
311 wait $pid || return 1
313 $CHECKSTAT -s 1 -p 0 $DIR/$tfile || return 2
316 run_test 13 "open chmod 0 |x| write close"
319 multiop $DIR/$tfile O_tSc &
321 # give multiop a chance to open
325 kill -USR1 $pid || return 1
326 wait $pid || return 2
329 [ -e $DIR/$tfile ] && return 3
332 run_test 14 "open(O_CREAT), unlink |X| close"
335 multiop $DIR/$tfile O_tSc &
337 # give multiop a chance to open
341 touch $DIR/g11 || return 1
343 wait $pid || return 2
346 [ -e $DIR/$tfile ] && return 3
347 touch $DIR/h11 || return 4
350 run_test 15 "open(O_CREAT), unlink |X| touch new, close"
357 mcreate $DIR/$tfile-2
359 [ -e $DIR/$tfile ] && return 1
360 [ -e $DIR/$tfile-2 ] || return 2
361 munlink $DIR/$tfile-2 || return 3
363 run_test 16 "|X| open(O_CREAT), unlink, touch new, unlink new"
367 multiop $DIR/$tfile O_c &
369 # give multiop a chance to open
372 kill -USR1 $pid || return 1
373 wait $pid || return 2
374 $CHECKSTAT -t file $DIR/$tfile || return 3
377 run_test 17 "|X| open(O_CREAT), |replay| close"
381 multiop $DIR/$tfile O_tSc &
383 # give multiop a chance to open
386 touch $DIR/$tfile-2 || return 1
387 echo "pid: $pid will close"
389 wait $pid || return 2
392 [ -e $DIR/$tfile ] && return 3
393 [ -e $DIR/$tfile-2 ] || return 4
394 # this touch frequently fails
395 touch $DIR/$tfile-3 || return 5
396 munlink $DIR/$tfile-2 || return 6
397 munlink $DIR/$tfile-3 || return 7
400 run_test 18 "|X| open(O_CREAT), unlink, touch new, close, touch, unlink"
402 # bug 1855 (a simpler form of test_11 above)
406 echo "old" > $DIR/$tfile
407 mv $DIR/$tfile $DIR/$tfile-2
408 grep old $DIR/$tfile-2
410 grep old $DIR/$tfile-2 || return 2
412 run_test 19 "|X| mcreate, open, write, rename "
416 multiop $DIR/$tfile O_tSc &
418 # give multiop a chance to open
424 wait $pid || return 1
425 [ -e $DIR/$tfile ] && return 2
428 run_test 20 "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
430 test_20b() { # bug 10480
431 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
433 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 &
435 while [ ! -e $DIR/$tfile ] ; do
436 sleep 0.060s # give dd a chance to start
439 lfs getstripe $DIR/$tfile || return 1
440 rm -f $DIR/$tfile || return 2 # make it an orphan
442 df -P $DIR || df -P $DIR || true # reconnect
444 fail mds # start orphan recovery
445 df -P $DIR || df -P $DIR || true # reconnect
448 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'`
449 log "before $BEFOREUSED, after $AFTERUSED"
450 [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \
451 error "after $AFTERUSED > before $BEFOREUSED" && return 5
454 run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)"
456 test_20c() { # bug 10480
457 dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000
459 exec 100< $DIR/$tfile
465 df -P $DIR || df -P $DIR || true # reconnect
469 test -s $DIR/$tfile || error "File was truncated"
473 run_test 20c "check that client eviction does not affect file content"
477 multiop $DIR/$tfile O_tSc &
479 # give multiop a chance to open
482 touch $DIR/g11 || return 1
486 wait $pid || return 2
487 [ -e $DIR/$tfile ] && return 3
488 touch $DIR/h11 || return 4
491 run_test 21 "|X| open(O_CREAT), unlink touch new, replay, close (test mds_cleanup_orphans)"
494 multiop $DIR/$tfile O_tSc &
496 # give multiop a chance to open
504 wait $pid || return 1
505 [ -e $DIR/$tfile ] && return 2
508 run_test 22 "open(O_CREAT), |X| unlink, replay, close (test mds_cleanup_orphans)"
511 multiop $DIR/$tfile O_tSc &
513 # give multiop a chance to open
518 touch $DIR/g11 || return 1
522 wait $pid || return 2
523 [ -e $DIR/$tfile ] && return 3
524 touch $DIR/h11 || return 4
527 run_test 23 "open(O_CREAT), |X| unlink touch new, replay, close (test mds_cleanup_orphans)"
530 multiop $DIR/$tfile O_tSc &
532 # give multiop a chance to open
539 wait $pid || return 1
540 [ -e $DIR/$tfile ] && return 2
543 run_test 24 "open(O_CREAT), replay, unlink, close (test mds_cleanup_orphans)"
546 multiop $DIR/$tfile O_tSc &
548 # give multiop a chance to open
555 wait $pid || return 1
556 [ -e $DIR/$tfile ] && return 2
559 run_test 25 "open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)"
563 multiop $DIR/$tfile-1 O_tSc &
565 multiop $DIR/$tfile-2 O_tSc &
567 # give multiop a chance to open
572 wait $pid2 || return 1
576 wait $pid1 || return 2
577 [ -e $DIR/$tfile-1 ] && return 3
578 [ -e $DIR/$tfile-2 ] && return 4
581 run_test 26 "|X| open(O_CREAT), unlink two, close one, replay, close one (test mds_cleanup_orphans)"
585 multiop $DIR/$tfile-1 O_tSc &
587 multiop $DIR/$tfile-2 O_tSc &
589 # give multiop a chance to open
596 wait $pid1 || return 1
598 wait $pid2 || return 2
599 [ -e $DIR/$tfile-1 ] && return 3
600 [ -e $DIR/$tfile-2 ] && return 4
603 run_test 27 "|X| open(O_CREAT), unlink two, replay, close two (test mds_cleanup_orphans)"
606 multiop $DIR/$tfile-1 O_tSc &
608 multiop $DIR/$tfile-2 O_tSc &
610 # give multiop a chance to open
616 wait $pid2 || return 1
620 wait $pid1 || return 2
621 [ -e $DIR/$tfile-1 ] && return 3
622 [ -e $DIR/$tfile-2 ] && return 4
625 run_test 28 "open(O_CREAT), |X| unlink two, close one, replay, close one (test mds_cleanup_orphans)"
628 multiop $DIR/$tfile-1 O_tSc &
630 multiop $DIR/$tfile-2 O_tSc &
632 # give multiop a chance to open
640 wait $pid1 || return 1
642 wait $pid2 || return 2
643 [ -e $DIR/$tfile-1 ] && return 3
644 [ -e $DIR/$tfile-2 ] && return 4
647 run_test 29 "open(O_CREAT), |X| unlink two, replay, close two (test mds_cleanup_orphans)"
650 multiop $DIR/$tfile-1 O_tSc &
652 multiop $DIR/$tfile-2 O_tSc &
654 # give multiop a chance to open
662 wait $pid1 || return 1
664 wait $pid2 || return 2
665 [ -e $DIR/$tfile-1 ] && return 3
666 [ -e $DIR/$tfile-2 ] && return 4
669 run_test 30 "open(O_CREAT) two, unlink two, replay, close two (test mds_cleanup_orphans)"
672 multiop $DIR/$tfile-1 O_tSc &
674 multiop $DIR/$tfile-2 O_tSc &
676 # give multiop a chance to open
684 wait $pid1 || return 1
686 wait $pid2 || return 2
687 [ -e $DIR/$tfile-1 ] && return 3
688 [ -e $DIR/$tfile-2 ] && return 4
691 run_test 31 "open(O_CREAT) two, unlink one, |X| unlink one, close two (test mds_cleanup_orphans)"
693 # tests for bug 2104; completion without crashing is success. The close is
694 # stale, but we always return 0 for close, so the app never sees it.
696 multiop $DIR/$tfile O_c &
698 multiop $DIR/$tfile O_c &
700 # give multiop a chance to open
703 df $MOUNT || sleep 1 && df $MOUNT || return 1
709 run_test 32 "close() notices client eviction; close() after client eviction"
711 # Abort recovery before the client completes replay
714 createmany -o $DIR/$tfile-%d 100
716 # this file should be gone, because the replay was aborted
717 $CHECKSTAT -t file $DIR/$tfile-* && return 3
718 unlinkmany $DIR/$tfile-%d 0 100
721 run_test 33 "abort recovery before client does replay"
724 multiop $DIR/$tfile O_c &
726 # give multiop a chance to open
733 [ -e $DIR/$tfile ] && return 1
737 run_test 34 "abort recovery before client does replay (test mds_cleanup_orphans)"
739 # bug 2278 - generate one orphan on OST, then destroy it during recovery from llog
743 #define OBD_FAIL_MDS_REINT_NET_REP 0x119
744 do_facet mds "sysctl -w lustre.fail_loc=0x80000119"
749 # give a chance to remove from MDS
751 $CHECKSTAT -t file $DIR/$tfile && return 1 || true
753 run_test 35 "test recovery from llog for unlink op"
755 # b=2432 resent cancel after replay uses wrong cookie,
756 # so don't resend cancels
760 checkstat $DIR/$tfile
763 if dmesg | grep "unknown lock cookie"; then
764 echo "cancel after replay failed"
768 run_test 36 "don't resend cancel"
771 # directory orphans can't be unlinked from PENDING directory
773 rmdir $DIR/$tfile 2>/dev/null
774 multiop $DIR/$tfile dD_c &
776 # give multiop a chance to open
781 # clear the dmesg buffer so we only see errors from this recovery
785 dmesg | grep "mds_unlink_orphan.*error .* unlinking orphan" && return 1
789 run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)"
792 createmany -o $DIR/$tfile-%d 800
793 unlinkmany $DIR/$tfile-%d 0 400
796 unlinkmany $DIR/$tfile-%d 400 400
798 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true
800 run_test 38 "test recovery from unlink llog (test llog_gen_rec) "
802 test_39() { # bug 4176
803 createmany -o $DIR/$tfile-%d 800
805 unlinkmany $DIR/$tfile-%d 0 400
807 unlinkmany $DIR/$tfile-%d 400 400
810 $CHECKSTAT -t file $DIR/$tfile-* && return 1 || true
812 run_test 39 "test recovery from unlink llog (test llog_gen_rec) "
815 awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }' $LPROC/osc/*/stats
820 $LCTL mark multiop $MOUNT/$tfile OS_c
821 multiop $MOUNT/$tfile OS_c &
823 writeme -s $MOUNT/${tfile}-2 &
827 #define OBD_FAIL_MDS_CONNECT_NET 0x117
828 do_facet mds "sysctl -w lustre.fail_loc=0x80000117"
830 stat1=`count_ost_writes`
832 stat2=`count_ost_writes`
833 echo "$stat1, $stat2"
834 if [ $stat1 -lt $stat2 ]; then
835 echo "writes continuing during recovery"
838 echo "writes not continuing during recovery, bug 2477"
841 echo "waiting for writeme $WRITE_PID"
845 echo "waiting for multiop $PID"
846 wait $PID || return 2
847 do_facet client munlink $MOUNT/$tfile || return 3
848 do_facet client munlink $MOUNT/${tfile}-2 || return 3
851 run_test 40 "cause recovery in ptlrpc, ensure IO continues"
855 # make sure that a read to one osc doesn't try to double-unlock its page just
856 # because another osc is invalid. trigger_group_io used to mistakenly return
857 # an error if any oscs were invalid even after having successfully put rpcs
858 # on valid oscs. This was fatal if the caller was ll_readpage who unlocked
859 # the page, guaranteeing that the unlock from the RPC completion would
860 # assert on trying to unlock the unlocked page.
862 [ $OSTCOUNT -lt 2 ] && \
863 skip "skipping test 41: we don't have a second OST to test with" && \
866 local f=$MOUNT/$tfile
867 # make sure the start of the file is ost1
868 lfs setstripe $f $((128 * 1024)) 0 0
869 do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3
871 # fail ost2 and read from ost1
872 local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'`
873 [ "$osc2dev" ] || return 4
874 $LCTL --device $osc2dev deactivate || return 1
875 do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 3
876 $LCTL --device $osc2dev activate || return 2
879 run_test 41 "read from a valid osc while other oscs are invalid"
881 # test MDS recovery after ost failure
883 blocks=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
884 createmany -o $DIR/$tfile-%d 800
886 unlinkmany $DIR/$tfile-%d 0 400
888 sysctl -w lnet.debug=-1
891 # osc is evicted, fs is smaller (but only with failout OSTs, bug 7287)
892 #blocks_after=`df -P $MOUNT | tail -n 1 | awk '{ print $2 }'`
893 #[ $blocks_after -lt $blocks ] || return 1
894 echo wait for MDS to timeout and recover
895 sleep $((TIMEOUT * 2))
897 unlinkmany $DIR/$tfile-%d 400 400
898 $CHECKSTAT -t file $DIR/$tfile-* && return 2 || true
900 run_test 42 "recovery after ost failure"
902 # timeout in MDS/OST recovery RPC will LBUG MDS
903 test_43() { # bug 2530
906 # OBD_FAIL_OST_CREATE_NET 0x204
907 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000204"
910 do_facet ost1 "sysctl -w lustre.fail_loc=0"
914 run_test 43 "mds osc import failure during recovery; don't LBUG"
917 mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
918 [ "$mdcdev" ] || exit 2
919 # adaptive timeouts slow this way down
920 MDS_AT_MAX=$(do_facet mds "sysctl -n lustre.adaptive_max")
921 do_facet mds "sysctl -w lustre.adaptive_max=40"
922 for i in `seq 1 10`; do
923 echo "$i of 10 ($(date +%s))"
924 do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
925 #define OBD_FAIL_TGT_CONN_RACE 0x701
926 do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
927 $LCTL --device $mdcdev recover
930 do_facet mds "sysctl -w lustre.fail_loc=0"
931 do_facet mds "sysctl -w lustre.adaptive_max=$MDS_AT_MAX"
934 run_test 44 "race in target handle connect"
937 mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
938 [ "$mdcdev" ] || exit 2
939 for i in `seq 1 10`; do
940 echo "$i of 10 ($(date +%s))"
941 do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts"
942 #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704
943 do_facet mds "sysctl -w lustre.fail_loc=0x80000704"
944 $LCTL --device $mdcdev recover
947 do_facet mds "sysctl -w lustre.fail_loc=0"
950 run_test 44b "race in target handle connect"
952 # Handle failed close
954 mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices`
955 [ "$mdcdev" ] || exit 2
956 $LCTL --device $mdcdev recover
958 multiop $DIR/$tfile O_c &
962 # This will cause the CLOSE to fail before even
963 # allocating a reply buffer
964 $LCTL --device $mdcdev deactivate || return 4
968 wait $pid || return 1
970 $LCTL --device $mdcdev activate || return 5
973 $CHECKSTAT -t file $DIR/$tfile || return 2
976 run_test 45 "Handle failed close"
980 drop_reply "touch $DIR/$tfile"
982 # ironically, the previous test, 45, will cause a real forced close,
983 # so just look for one for this test
984 dmesg | grep -i "force closing client file handle for $tfile" && return 1
987 run_test 46 "Don't leak file handle after open resend (3325)"
989 test_47() { # bug 2824
990 # create some files to make sure precreate has been done on all
991 # OSTs. (just in case this test is run independently)
992 createmany -o $DIR/$tfile 20 || return 1
994 # OBD_FAIL_OST_CREATE_NET 0x204
996 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000204"
997 df $MOUNT || return 2
999 # let the MDS discover the OST failure, attempt to recover, fail
1000 # and recover again.
1001 sleep $((3 * TIMEOUT))
1003 # Without 2824, this createmany would hang
1004 createmany -o $DIR/$tfile 20 || return 3
1005 unlinkmany $DIR/$tfile 20 || return 4
1007 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1010 run_test 47 "MDS->OSC failure during precreate cleanup (2824)"
1014 createmany -o $DIR/$tfile 20 || return 1
1015 # OBD_FAIL_OST_EROFS 0x216
1017 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000216"
1018 df $MOUNT || return 2
1020 createmany -o $DIR/$tfile 20 20 || return 2
1021 unlinkmany $DIR/$tfile 40 || return 3
1023 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1026 run_test 48 "MDS->OSC failure during precreate cleanup (2824)"
1029 local oscdev=`grep ${ost1_svc}-osc- $LPROC/devices | awk '{print $1}'`
1030 [ "$oscdev" ] || return 1
1031 $LCTL --device $oscdev recover && $LCTL --device $oscdev recover
1032 # give the mds_lov_sync threads a chance to run
1035 run_test 50 "Double OSC recovery, don't LASSERT (3812)"
1037 # b3764 timed out lock replay
1040 cancel_lru_locks mdc
1042 multiop $DIR/$tfile s || return 1
1044 #define OBD_FAIL_LDLM_REPLY 0x30c
1045 do_facet mds "sysctl -w lustre.fail_loc=0x8000030c"
1046 fail mds || return 2
1047 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1049 $CHECKSTAT -t file $DIR/$tfile-* && return 3 || true
1051 run_test 52 "time out lock replay (3764)"
1053 #b_cray 53 "|X| open request and close reply while two MDC requests in flight"
1054 #b_cray 54 "|X| open request and close reply while two MDC requests in flight"
1056 #b3761 ASSERTION(hash != 0) failed
1058 # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE
1059 do_facet mds "sysctl -w lustre.fail_loc=0x8000012b"
1061 # give touch a chance to run
1063 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1067 run_test 55 "let MDS_CHECK_RESENT return the original return code instead of 0"
1069 #b3440 ASSERTION(rec->ur_fid2->id) failed
1071 ln -s foo $DIR/$tfile
1073 #drop_reply "cat $DIR/$tfile"
1077 run_test 56 "don't replay a symlink open request (3440)"
1079 # recover one mds-ost setattr from llog
1081 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1082 do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
1087 $CHECKSTAT -t file $DIR/$tfile || return 1
1088 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1091 run_test 57 "test recovery from llog for setattr op"
1093 # recover many mds-ost setattrs from llog
1095 #define OBD_FAIL_MDS_OST_SETATTR 0x12c
1096 do_facet mds "sysctl -w lustre.fail_loc=0x8000012c"
1098 createmany -o $DIR/$tdir/$tfile-%d 2500
1102 $CHECKSTAT -t file $DIR/$tdir/$tfile-* || return 1
1103 do_facet mds "sysctl -w lustre.fail_loc=0x0"
1104 unlinkmany $DIR/$tdir/$tfile-%d 2500
1107 run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)"
1109 # log_commit_thread vs filter_destroy race used to lead to import use after free
1113 createmany -o $DIR/$tdir/$tfile-%d 200
1115 unlinkmany $DIR/$tdir/$tfile-%d 200
1116 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507
1117 do_facet ost1 "sysctl -w lustre.fail_loc=0x507"
1120 do_facet ost1 "sysctl -w lustre.fail_loc=0x0"
1124 run_test 59 "test log_commit_thread vs filter_destroy race"
1126 # race between add unlink llog vs cat log init in post_recovery (only for b1_6)
1127 # bug 12086: there should be no oops and no "No ctxt" error for this test
1130 createmany -o $DIR/$tdir/$tfile-%d 200
1132 unlinkmany $DIR/$tdir/$tfile-%d 0 100
1134 unlinkmany $DIR/$tdir/$tfile-%d 100 100
1135 local no_ctxt=`dmesg | grep "No ctxt"`
1136 [ -z "$no_ctxt" ] || error "ctxt is not initialized in recovery"
1138 run_test 60 "test llog post recovery init vs llog unlink"
1140 #test race llog recovery thread vs llog cleanup
1143 createmany -o $DIR/$tdir/$tfile-%d 800
1145 # OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221
1146 unlinkmany $DIR/$tdir/$tfile-%d 800
1147 do_facet ost "sysctl -w lustre.fail_loc=0x80000221"
1152 do_facet ost "sysctl -w lustre.fail_loc=0x0"
1153 $CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1
1156 run_test 61a "test race llog recovery vs llog cleanup"
1158 #test race mds llog sync vs llog cleanup
1160 # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a
1161 do_facet mds "sysctl -w lustre.fail_loc=0x8000013a"
1165 do_facet client dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 || return 1
1167 run_test 61b "test race mds llog sync vs llog cleanup"
1169 #test race cancel cookie cb vs llog cleanup
1171 # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222
1173 do_facet ost "sysctl -w lustre.fail_loc=0x80000222"
1178 run_test 61c "test race mds llog sync vs llog cleanup"
1181 at_start() #bug 3055
1183 if [ -z "$ATOLDBASE" ]; then
1184 ATOLDBASE=$(do_facet mds "sysctl -n lustre.adaptive_history")
1185 # speed up the timebase so we can check decreasing AT
1186 do_facet mds "sysctl -w lustre.adaptive_history=8"
1187 do_facet ost1 "sysctl -w lustre.adaptive_history=8"
1194 $LCTL dk > /dev/null
1195 # slow down a request
1196 do_facet mds sysctl -w lustre.fail_val=30000
1197 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1198 do_facet mds sysctl -w lustre.fail_loc=0x8000050a
1199 createmany -o $DIR/$tfile 10 > /dev/null
1200 unlinkmany $DIR/$tfile 10 > /dev/null
1201 # check for log message
1202 $LCTL dk | grep "Early reply #" || error "No early reply"
1203 # client should show 30s timeouts
1204 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1206 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1208 run_test 65 "AT: verify early replies"
1210 test_66a() #bug 3055
1213 grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1214 # adjust 5s at a time so no early reply is sent (within deadline)
1215 do_facet mds "sysctl -w lustre.fail_val=5000"
1216 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1217 do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
1218 createmany -o $DIR/$tfile 20 > /dev/null
1219 unlinkmany $DIR/$tfile 20 > /dev/null
1220 grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1221 do_facet mds "sysctl -w lustre.fail_val=10000"
1222 do_facet mds "sysctl -w lustre.fail_loc=0x8000050a"
1223 createmany -o $DIR/$tfile 20 > /dev/null
1224 unlinkmany $DIR/$tfile 20 > /dev/null
1225 grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts
1226 do_facet mds "sysctl -w lustre.fail_loc=0"
1228 createmany -o $DIR/$tfile 20 > /dev/null
1229 unlinkmany $DIR/$tfile 20 > /dev/null
1230 grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12"
1231 CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
1232 WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts)
1233 echo "Current MDT timeout $CUR, worst $WORST"
1234 [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST"
1236 run_test 66a "AT: verify MDT service time adjusts with no early replies"
1238 test_66b() #bug 3055
1241 ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/lustre-*/timeouts)
1242 sysctl -w lustre.fail_val=$(($ORIG + 5))
1243 #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c
1244 sysctl -w lustre.fail_loc=0x50c
1245 ls $DIR/$tfile > /dev/null 2>&1
1246 sysctl -w lustre.fail_loc=0
1247 CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts)
1248 WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts)
1249 echo "network timeout orig $ORIG, cur $CUR, worst $WORST"
1250 [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG"
1252 run_test 66b "AT: verify net latency adjusts"
1254 test_67a() #bug 3055
1257 CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1258 # sleeping threads may drive values above this
1259 do_facet ost1 "sysctl -w lustre.fail_val=400"
1260 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a
1261 do_facet ost1 "sysctl -w lustre.fail_loc=0x50a"
1262 createmany -o $DIR/$tfile 20 > /dev/null
1263 unlinkmany $DIR/$tfile 20 > /dev/null
1264 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1265 CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1266 ATTEMPTS=$(($CONN2 - $CONN1))
1267 echo "$ATTEMPTS osc reconnect attemps on gradual slow"
1268 [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
1271 run_test 67a "AT: verify slow request processing doesn't induce reconnects"
1273 test_67b() #bug 3055
1276 CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1277 #define OBD_FAIL_OST_PAUSE_CREATE 0x223
1278 do_facet ost1 "sysctl -w lustre.fail_val=20000"
1279 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
1280 cp /etc/profile $DIR/$tfile || error "cp failed"
1282 cat $LPROC/ost/OSS/ost_create/timeouts
1284 CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1285 ATTEMPTS=$(($CONN2 - $CONN1))
1286 echo "$ATTEMPTS osc reconnect attemps on instant slow"
1287 # do it again; should not timeout
1288 do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223"
1289 cp /etc/profile $DIR/$tfile || error "cp failed"
1290 do_facet ost1 "sysctl -w lustre.fail_loc=0"
1292 cat $LPROC/ost/OSS/ost_create/timeouts
1293 CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats)
1294 ATTEMPTS=$(($CONN3 - $CONN2))
1295 echo "$ATTEMPTS osc reconnect attemps on 2nd slow"
1296 [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect"
1299 run_test 67b "AT: verify instant slowdown doesn't induce reconnects"
1301 if [ -n "$ATOLDBASE" ]; then
1302 do_facet mds "sysctl -w lustre.adaptive_history=$ATOLDBASE"
1303 do_facet ost1 "sysctl -w lustre.adaptive_history=$ATOLDBASE"
1305 # end of AT tests includes above lines
1308 equals_msg `basename $0`: test complete, cleaning up
1309 check_and_cleanup_lustre
1310 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true