X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Ftests%2Freplay-single.sh;h=4b0f5c0df349a608b8c46ad39e521fb14451af5d;hp=416bf94697f0b33899fc44a9bb3ae51344f5c55d;hb=e5849abd06bf5ec6a636b1608ec82901fd6447a9;hpb=829dd8b39a8dd0ec409ed24a3178afe1b14e516e diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 416bf94..4b0f5c0 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -13,14 +13,15 @@ CLEANUP=${CLEANUP:-} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +init_logging CHECK_GRANT=${CHECK_GRANT:-"yes"} GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} -remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0 +require_dsh_mds || exit 0 # Skip these tests -# bug number: 17466 15962 -ALWAYS_EXCEPT="61d 33b $REPLAY_SINGLE_EXCEPT" +# bug number: 17466 18857 +ALWAYS_EXCEPT="61d 33a 33b $REPLAY_SINGLE_EXCEPT" if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then CONFIG_EXCEPTIONS="0b 42 47 61a 61c" @@ -126,7 +127,9 @@ test_0c() { touch $DIR/$tfile || return 3 rm $DIR/$tfile || return 4 } +start_full_debug_logging run_test 0c "fld create" +stop_full_debug_logging test_1() { replay_barrier $SINGLEMDS @@ -353,6 +356,7 @@ test_13() { wait $pid || return 1 $CHECKSTAT -s 1 -p 0 $DIR/$tfile || return 2 + rm $DIR/$tfile || return 4 return 0 } run_test 13 "open chmod 0 |x| write close" @@ -460,8 +464,6 @@ test_20a() { # was test_20 run_test 20a "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" test_20b() { # bug 10480 - # XXX increase the debug level temporary - do_nodes $(comma_list $(nodes_list)) "$LCTL set_param debug=0x33f0406; $LCTL set_param debug_mb=150" BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 & @@ -473,21 +475,15 @@ test_20b() { # bug 10480 lfs getstripe $DIR/$tfile || return 1 rm -f $DIR/$tfile || return 2 # make it an orphan mds_evict_client - df -P $DIR || df -P $DIR || true # reconnect + client_up || client_up || true # reconnect fail $SINGLEMDS # start orphan recovery - df -P $DIR || df -P $DIR || true # reconnect wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" - - # FIXME just because recovery is done doesn't mean we've finished - # orphan cleanup. Fake it with a sleep for now... - sleep 10 + wait_mds_ost_sync || return 3 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \ error "after $AFTERUSED > before $BEFOREUSED" - # XXX decrease it back - do_nodes $(comma_list $(nodes_list)) "$LCTL set_param debug=$PTLDEBUG; $LCTL set_param debug_mb=$DEBUG_SIZE" return 0 } run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)" @@ -499,13 +495,12 @@ test_20c() { # bug 10480 ls -la $DIR/$tfile mds_evict_client - - df -P $DIR || df -P $DIR || true # reconnect + client_up || client_up || true # reconnect kill -USR1 $pid - test -s $DIR/$tfile || error "File was truncated" - wait $pid || return 1 + [ -s $DIR/$tfile ] || error "File was truncated" + return 0 } run_test 20c "check that client eviction does not affect file content" @@ -714,7 +709,7 @@ test_32() { multiop_bg_pause $DIR/$tfile O_c || return 3 pid2=$! mds_evict_client - df $MOUNT || sleep 1 && df $MOUNT || return 1 + client_up || client_up || return 1 kill -USR1 $pid1 kill -USR1 $pid2 wait $pid1 || return 4 @@ -812,7 +807,9 @@ test_37() { sync return 0 } +start_full_debug_logging run_test 37 "abort recovery before client does replay (test mds_cleanup_orphans for directories)" +stop_full_debug_logging test_38() { createmany -o $DIR/$tfile-%d 800 @@ -885,7 +882,7 @@ run_test 40 "cause recovery in ptlrpc, ensure IO continues" # assert on trying to unlock the unlocked page. test_41() { [ $OSTCOUNT -lt 2 ] && \ - skip "skipping test 41: we don't have a second OST to test with" && \ + skip_env "skipping test 41: we don't have a second OST to test with" && \ return local f=$MOUNT/$tfile @@ -940,11 +937,12 @@ test_43() { # bug 2530 } run_test 43 "mds osc import failure during recovery; don't LBUG" -test_44a() { # was test_44 +test_44a() { # was test_44 local at_max_saved=0 mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'` - [ "$mdcdev" ] || exit 2 + [ "$mdcdev" ] || return 2 + [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; } # adaptive timeouts slow this way down if at_is_enabled; then @@ -953,12 +951,13 @@ test_44a() { # was test_44 fi for i in `seq 1 10`; do - echo "$i of 10 ($(date +%s))" - do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service" - #define OBD_FAIL_TGT_CONN_RACE 0x701 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701" - $LCTL --device $mdcdev recover - df $MOUNT + echo "$i of 10 ($(date +%s))" + do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service" + #define OBD_FAIL_TGT_CONN_RACE 0x701 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000701" + # lctl below may fail, it is valid case + $LCTL --device $mdcdev recover + df $MOUNT done do_facet $SINGLEMDS "lctl set_param fail_loc=0" [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds @@ -967,15 +966,18 @@ test_44a() { # was test_44 run_test 44a "race in target handle connect" test_44b() { - mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'` - [ "$mdcdev" ] || exit 2 + local mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'` + [ "$mdcdev" ] || return 2 + [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; } + for i in `seq 1 10`; do echo "$i of 10 ($(date +%s))" - do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service" - #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704" - $LCTL --device $mdcdev recover - df $MOUNT + do_facet $SINGLEMDS "lctl get_param -n mdt.*.mdt.timeouts | grep service" + #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000704" + # lctl below may fail, it is valid case + $LCTL --device $mdcdev recover + df $MOUNT done do_facet $SINGLEMDS "lctl set_param fail_loc=0" return 0 @@ -985,8 +987,10 @@ run_test 44b "race in target handle connect" # Handle failed close test_45() { mdcdev=`lctl get_param -n devices | awk '/MDT0000-mdc-/ {print $1}'` - [ "$mdcdev" ] || exit 2 - $LCTL --device $mdcdev recover + [ "$mdcdev" ] || return 2 + [ $(echo $mdcdev | wc -w) -eq 1 ] || { echo $mdcdev=$mdcdev && return 3; } + + $LCTL --device $mdcdev recover || return 6 multiop_bg_pause $DIR/$tfile O_c || return 1 pid=$! @@ -1028,7 +1032,7 @@ test_47() { # bug 2824 # OBD_FAIL_OST_CREATE_NET 0x204 fail ost1 do_facet ost1 "lctl set_param fail_loc=0x80000204" - df $MOUNT || return 2 + client_up || return 2 # let the MDS discover the OST failure, attempt to recover, fail # and recover again. @@ -1045,14 +1049,14 @@ run_test 47 "MDS->OSC failure during precreate cleanup (2824)" test_48() { remote_ost_nodsh && skip "remote OST with nodsh" && return 0 - [ "$OSTCOUNT" -lt "2" ] && skip "$OSTCOUNT < 2 OSTs -- skipping" && return + [ "$OSTCOUNT" -lt "2" ] && skip_env "$OSTCOUNT < 2 OSTs -- skipping" && return replay_barrier $SINGLEMDS createmany -o $DIR/$tfile 20 || return 1 # OBD_FAIL_OST_EROFS 0x216 facet_failover $SINGLEMDS do_facet ost1 "lctl set_param fail_loc=0x80000216" - df $MOUNT || return 2 + client_up || return 2 createmany -o $DIR/$tfile 20 20 || return 2 unlinkmany $DIR/$tfile 40 || return 3 @@ -1121,7 +1125,7 @@ test_53b() { mkdir -p $DIR/${tdir}-1 mkdir -p $DIR/${tdir}-2 - multiop $DIR/${tdir}-1/f O_c & + multiop_bg_pause $DIR/${tdir}-1/f O_c || return 6 close_pid=$! #define OBD_FAIL_MDS_REINT_NET 0x107 @@ -1181,6 +1185,7 @@ test_53c() { run_test 53c "|X| open request and close request while two MDC requests in flight" test_53d() { + cancel_lru_locks mdc # cleanup locks from former test cases rm -rf $DIR/${tdir}-1 $DIR/${tdir}-2 mkdir -p $DIR/${tdir}-1 @@ -1522,8 +1527,7 @@ test_62() { # Bug 15756 - don't mis-drop resent replay createmany -o $DIR/$tdir/$tfile- 25 #define OBD_FAIL_TGT_REPLAY_DROP 0x707 do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000707" - facet_failover $SINGLEMDS - df $MOUNT || return 1 + fail $SINGLEMDS do_facet $SINGLEMDS "lctl set_param fail_loc=0" unlinkmany $DIR/$tdir/$tfile- 25 || return 2 return 0 @@ -1717,7 +1721,7 @@ test_67a() #bug 3055 do_facet ost1 "sysctl -w lustre.fail_loc=0" CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) - echo "$ATTEMPTS osc reconnect attemps on gradual slow" + echo "$ATTEMPTS osc reconnect attempts on gradual slow" [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect" return 0 } @@ -1729,16 +1733,27 @@ test_67b() #bug 3055 at_start || return 0 CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') + + # exhaust precreations on ost1 + local OST=$(lfs osts | grep 0": " | awk '{print $2}' | sed -e 's/_UUID$//') + local mdtosc=$(get_mdtosc_proc_path $OST) + local last_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_last_id) + local next_id=$(do_facet mds lctl get_param -n osc.$mdtosc.prealloc_next_id) + + mkdir -p $DIR/$tdir/${OST} + lfs setstripe $DIR/$tdir/${OST} -o 0 -c 1 || error "setstripe" + echo "Creating to objid $last_id on ost $OST..." #define OBD_FAIL_OST_PAUSE_CREATE 0x223 do_facet ost1 "sysctl -w lustre.fail_val=20000" do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" - cp /etc/profile $DIR/$tfile || error "cp failed" + createmany -o $DIR/$tdir/${OST}/f $next_id $((last_id - next_id + 2)) + client_reconnect do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts" log "phase 2" CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) - echo "$ATTEMPTS osc reconnect attemps on instant slow" + echo "$ATTEMPTS osc reconnect attempts on instant slow" # do it again; should not timeout do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" cp /etc/profile $DIR/$tfile || error "cp failed" @@ -1747,7 +1762,7 @@ test_67b() #bug 3055 do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts" CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN3 - $CONN2)) - echo "$ATTEMPTS osc reconnect attemps on 2nd slow" + echo "$ATTEMPTS osc reconnect attempts on 2nd slow" [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect" return 0 } @@ -1828,7 +1843,7 @@ test_70b () { [ "$SLOW" = "no" ] && duration=60 local cmd="rundbench 1 -t $duration" local PID="" - do_nodes $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \ + do_nodesv $clients "set -x; MISSING_DBENCH_OK=$MISSING_DBENCH_OK \ PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:$DBENCH_LIB \ DBENCH_LIB=$DBENCH_LIB TESTSUITE=$TESTSUITE TESTNAME=$TESTNAME \ LCTL=$LCTL $cmd" & @@ -1845,7 +1860,7 @@ test_70b () { # Increment the number of failovers NUM_FAILOVERS=$((NUM_FAILOVERS+1)) log "$TESTNAME fail mds1 $NUM_FAILOVERS times" - facet_failover $SINGLEMDS + fail $SINGLEMDS CURRENT_TS=$(date +%s) ELAPSED=$((CURRENT_TS - START_TS)) done @@ -1904,17 +1919,19 @@ run_test 73c "open(O_CREAT), unlink, replay, reconnect at last_replay, close" # bug 18554 test_74() { + local clients=${CLIENTS:-$HOSTNAME} + stop ost1 - zconf_umount $(hostname) $MOUNT - fail $SINGLEMDS - zconf_mount $(hostname) $MOUNT + zconf_umount_clients $clients $MOUNT + facet_failover $SINGLEMDS + zconf_mount_clients $clients $MOUNT mount_facet ost1 touch $DIR/$tfile || return 1 rm $DIR/$tfile || return 2 - df $MOUNT || error "df failed: $?" + clients_up || error "client evicted: $?" return 0 } -run_test 74 "Ensure applications don't fail waiting for OST reocvery" +run_test 74 "Ensure applications don't fail waiting for OST recovery" test_80a() { [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return 0 @@ -1983,6 +2000,37 @@ test_82b() { } run_test 82b "CMD: mkdir cross-node dir (fail mds with name)" +test_83a() { + mkdir -p $DIR/$tdir + createmany -o $DIR/$tdir/$tfile- 10 || return 1 +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000140" + unlinkmany $DIR/$tdir/$tfile- 10 || return 2 +} +run_test 83a "fail log_add during unlink recovery" + +test_83b() { + mkdir -p $DIR/$tdir + createmany -o $DIR/$tdir/$tfile- 10 || return 1 + replay_barrier $SINGLEMDS + unlinkmany $DIR/$tdir/$tfile- 10 || return 2 +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000140" + fail $SINGLEMDS +} +run_test 83b "fail log_add during unlink recovery" + +test_84a() { +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 + do_facet $SINGLEMDS "lctl set_param fail_loc=0x80000144" + createmany -o $DIR/$tfile- 1 & + PID=$! + mds_evict_client + wait $PID + client_up || client_up || true # reconnect +} +run_test 84a "stale open during export disconnect" + equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true