X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Freplay-single.sh;h=65691bb306ad3db282085e698f9d302921f12127;hb=fa8df3b66d6f4e13d0b6076bb087a52b20d4166c;hp=4ccfe84e8e860a76d7ee766b8b40b56f38aa3221;hpb=4ee47d0f475cf42f9796f13e78c8cfb1e3c94807;p=fs%2Flustre-release.git diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 4ccfe84..65691bb 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -460,8 +460,6 @@ test_20a() { # was test_20 run_test 20a "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" test_20b() { # bug 10480 - # XXX increase the debug level temporary - do_nodes $(comma_list $(nodes_list)) "$LCTL set_param debug=0x33f0406; $LCTL set_param debug_mb=150" BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` dd if=/dev/zero of=$DIR/$tfile bs=4k count=10000 & @@ -479,15 +477,29 @@ test_20b() { # bug 10480 df -P $DIR || df -P $DIR || true # reconnect wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" - # FIXME just because recovery is done doesn't mean we've finished - # orphan cleanup. Fake it with a sleep for now... - sleep 10 + # just because recovery is done doesn't mean we've finished + # orphan cleanup. Wait for llogs to get synchronized. + echo waiting for orphan cleanup... + while [ true ]; do + local -a sync=($(do_facet ost "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}')) + local con=1 + for ((i=0; i<${#sync[@]}; i++)); do + [ ${sync[$i]} -eq 0 ] && continue + # there is a not finished MDS-OST synchronization + con=0 + break; + done + [ ${con} -eq 1 ] && break + sleep 1 + done + + # let the statfs cache to get old enough. + sleep 1 + AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \ error "after $AFTERUSED > before $BEFOREUSED" - # XXX decrease it back - do_nodes $(comma_list $(nodes_list)) "$LCTL set_param debug=$PTLDEBUG; $LCTL set_param debug_mb=$DEBUG_SIZE" return 0 } run_test 20b "write, unlink, eviction, replay, (test mds_cleanup_orphans)" @@ -947,7 +959,7 @@ test_44a() { # was test_44 [ "$mdcdev" ] || exit 2 # adaptive timeouts slow this way down - if at_is_valid && at_is_enabled; then + if at_is_enabled; then at_max_saved=$(at_max_get mds) at_max_set 40 mds fi @@ -1540,9 +1552,9 @@ at_cleanup () { echo "Cleaning up AT ..." if [ -n "$ATOLDBASE" ]; then - local at_history=$(do_facet mds "find /sys/ -name at_history") - do_facet mds "echo $ATOLDBASE >> $at_history" || true - do_facet ost1 "echo $ATOLDBASE >> $at_history" || true + local at_history=$($LCTL get_param -n at_history) + do_facet mds "lctl set_param at_history=$at_history" || true + do_facet ost1 "lctl set_param at_history=$at_history" || true fi if [ $AT_MAX_SET -ne 0 ]; then @@ -1561,10 +1573,6 @@ at_cleanup () { at_start() { local at_max_new=600 - if ! at_is_valid; then - skip "AT env is invalid" - return 1 - fi # Save at_max original values local facet @@ -1585,12 +1593,10 @@ at_start() done if [ -z "$ATOLDBASE" ]; then - local at_history=$(do_facet mds "find /sys/ -name at_history") - [ -z "$at_history" ] && skip "missing /sys/.../at_history " && return 1 - ATOLDBASE=$(do_facet mds "cat $at_history") + ATOLDBASE=$(do_facet mds "lctl get_param -n at_history") # speed up the timebase so we can check decreasing AT - do_facet mds "echo 8 >> $at_history" - do_facet ost1 "echo 8 >> $at_history" + do_facet mds "lctl set_param at_history=8" || true + do_facet ost1 "lctl set_param at_history=8" || true # sleep for a while to cool down, should be > 8s and also allow # at least one ping to be sent. simply use TIMEOUT to be safe. @@ -1723,7 +1729,7 @@ test_67a() #bug 3055 do_facet ost1 "sysctl -w lustre.fail_loc=0" CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) - echo "$ATTEMPTS osc reconnect attemps on gradual slow" + echo "$ATTEMPTS osc reconnect attempts on gradual slow" [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented reconnect" return 0 } @@ -1744,7 +1750,7 @@ test_67b() #bug 3055 log "phase 2" CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) - echo "$ATTEMPTS osc reconnect attemps on instant slow" + echo "$ATTEMPTS osc reconnect attempts on instant slow" # do it again; should not timeout do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" cp /etc/profile $DIR/$tfile || error "cp failed" @@ -1753,7 +1759,7 @@ test_67b() #bug 3055 do_facet ost1 "lctl get_param -n ost.OSS.ost_create.timeouts" CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN3 - $CONN2)) - echo "$ATTEMPTS osc reconnect attemps on 2nd slow" + echo "$ATTEMPTS osc reconnect attempts on 2nd slow" [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect" return 0 } @@ -1851,7 +1857,7 @@ test_70b () { # Increment the number of failovers NUM_FAILOVERS=$((NUM_FAILOVERS+1)) log "$TESTNAME fail mds1 $NUM_FAILOVERS times" - facet_failover $SINGLEMDS + fail $SINGLEMDS CURRENT_TS=$(date +%s) ELAPSED=$((CURRENT_TS - START_TS)) done @@ -1911,13 +1917,13 @@ run_test 73c "open(O_CREAT), unlink, replay, reconnect at last_replay, close" # bug 18554 test_74() { stop ost1 - zconf_umount $(hostname) $MOUNT - fail $SINGLEMDS - zconf_mount $(hostname) $MOUNT + zconf_umount_clients $CLIENTS $MOUNT + facet_failover $SINGLEMDS + zconf_mount_clients $CLIENTS $MOUNT mount_facet ost1 touch $DIR/$tfile || return 1 rm $DIR/$tfile || return 2 - df $MOUNT || error "df failed: $?" + client_df || error "df failed: $?" return 0 } run_test 74 "Ensure applications don't fail waiting for OST reocvery" @@ -1989,6 +1995,17 @@ test_82b() { } run_test 82b "CMD: mkdir cross-node dir (fail mds with name)" +test_84() { +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x143 + do_facet mds "lctl set_param fail_loc=0x80000143" + createmany -o $DIR/$tfile- 1 & + PID=$! + mds_evict_client + wait $PID + df -P $DIR || df -P $DIR || true # reconnect +} +run_test 84 "stale open during export disconnect" + equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true