From af5f388c5a66e167cde01403d8af7eb18a82db1b Mon Sep 17 00:00:00 2001 From: Li Wei Date: Mon, 11 Jun 2012 11:39:10 +0800 Subject: [PATCH] LU-1303 tests: Support for MDS-initiated OST_DESTROYs This patch makes sure the tests work with MDSs that destroy OST objects asynchronously on behalf of clients. Change-Id: I3d365766f1af8a305cf723546e868ff3d6b2501d Signed-off-by: Li Wei Reviewed-on: http://review.whamcloud.com/2982 Tested-by: Hudson Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Alex Zhuravlev --- lustre/tests/oos.sh | 2 + lustre/tests/replay-dual.sh | 8 +-- lustre/tests/replay-ost-single.sh | 4 +- lustre/tests/replay-single.sh | 3 +- lustre/tests/sanity.sh | 8 ++- lustre/tests/sanityn.sh | 12 +++- lustre/tests/test-framework.sh | 112 +++++++++++++++++++++++++------------- 7 files changed, 99 insertions(+), 50 deletions(-) diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 5d6a161..98d2c18 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -90,6 +90,8 @@ rm -f $OOS sync; sleep 3; sync +wait_delete_completed 300 + if [ $SUCCESS -eq 1 ]; then echo "Success!" 
rm -f $LOG diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index c8abcfd..0468b78 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -310,8 +310,8 @@ run_test 13 "close resend timeout" # as test_15a test_14b() { - wait_mds_ost_sync - wait_destroy_complete + wait_mds_ost_sync + wait_delete_completed BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` mkdir -p $MOUNT1/$tdir $SETSTRIPE -i 0 $MOUNT1/$tdir @@ -332,8 +332,8 @@ test_14b() { zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" - wait_mds_ost_sync || return 4 - wait_destroy_complete || return 5 + wait_mds_ost_sync || return 4 + wait_delete_completed || return 5 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index a58c4b3..83f790f 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -207,7 +207,7 @@ test_6() { sync # let the delete happen wait_mds_ost_sync || return 4 - wait_destroy_complete || return 5 + wait_delete_completed || return 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account @@ -240,7 +240,7 @@ test_7() { sync # let the delete happen wait_mds_ost_sync || return 4 - wait_destroy_complete || return 5 + wait_delete_completed || return 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 7abd8d0..fb817fc 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -2264,7 +2264,7 @@ test_89() { mkdir -p $DIR/$tdir rm -f $DIR/$tdir/$tfile wait_mds_ost_sync - wait_destroy_complete + wait_delete_completed BLOCKS1=$(df -P $MOUNT | tail -n 1 | awk '{ print $3 }') $SETSTRIPE -i 0 -c 1 $DIR/$tdir/$tfile dd if=/dev/zero bs=1M count=10 
of=$DIR/$tdir/$tfile @@ -2277,6 +2277,7 @@ test_89() { zconf_mount $(hostname) $MOUNT client_up || return 1 wait_mds_ost_sync + wait_delete_completed BLOCKS2=$(df -P $MOUNT | tail -n 1 | awk '{ print $3 }') [ "$BLOCKS1" == "$BLOCKS2" ] || error $((BLOCKS2 - BLOCKS1)) blocks leaked } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 7dafbe7..b2c6cf8 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1085,6 +1085,7 @@ reset_enospc() { [ "$OSTIDX" ] && list=$(facet_host ost$((OSTIDX + 1))) do_nodes $list lctl set_param fail_loc=0 + sync # initiate all OST_DESTROYs from MDS to OST sleep_maxage } @@ -5262,8 +5263,9 @@ test_101d() { echo read-ahead disabled time read $time_ra_OFF echo read-ahead enabled time read $time_ra_ON - set_read_ahead $old_READAHEAD - rm -f $file + set_read_ahead $old_READAHEAD + rm -f $file + wait_delete_completed [ $time_ra_ON -lt $time_ra_OFF ] || error "read-ahead enabled time read (${time_ra_ON}s) is more than @@ -7550,6 +7552,7 @@ test_133c() { $SETSTRIPE -c 1 -i 0 ${testdir}/${tfile} sync cancel_lru_locks osc + wait_delete_completed # clear stats. 
do_facet $SINGLEMDS $LCTL set_param mdt.*.md_stats=clear @@ -7567,6 +7570,7 @@ test_133c() { check_stats ost "punch" 1 rm -f ${testdir}/${tfile} || error "file remove failed" + wait_delete_completed check_stats ost "destroy" 1 rm -rf $DIR/${tdir} diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 4729806..7f06570 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -322,6 +322,7 @@ run_test 14d "chmod of executing file is still possible ========" test_15() { # bug 974 - ENOSPC echo "PATH=$PATH" sh oos2.sh $MOUNT1 $MOUNT2 + wait_delete_completed grant_error=`dmesg | grep "> available"` [ -z "$grant_error" ] || error "$grant_error" } @@ -615,8 +616,13 @@ test_31a() { run_test 31a "voluntary cancel / blocking ast race==============" test_31b() { - remote_ost || { skip "local OST" && return 0; } - remote_ost_nodsh && skip "remote OST w/o dsh" && return 0 + remote_ost || { skip "local OST" && return 0; } + remote_ost_nodsh && skip "remote OST w/o dsh" && return 0 + + # make sure there are no local locks due to destroy + wait_mds_ost_sync || error "wait_mds_ost_sync()" + wait_delete_completed || error "wait_delete_completed()" + mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir" lfs setstripe $DIR/$tdir/$tfile -i 0 -c 1 cp /etc/hosts $DIR/$tdir/$tfile @@ -925,7 +931,7 @@ test_36() { #bug 16417 rm -f $DIR1/$tdir/file000 kill -USR1 $read_pid wait $read_pid - sleep 1 + wait_delete_completed local after=$($LFS df | awk '{if ($1 ~/^filesystem/) \ {print $5; exit} }') echo "*** cycle($i) *** before($before) after_dd($after_dd)" \ diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index c1e9a9c..ea998e0 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1739,23 +1739,52 @@ sync_all_data() { grep -v 'Found no match' } -wait_delete_completed () { - local TOTALPREV=`lctl get_param -n osc.*.kbytesavail | \ - awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` 
+wait_delete_completed_mds() { + local MAX_WAIT=${1:-20} + local mds2sync="" + local stime=`date +%s` + local etime + local node + local changes + + # find MDS with pending deletions + for node in $(mdts_nodes); do + changes=$(do_node $node "lctl get_param -n osc.*MDT*.sync_*" \ + 2>/dev/null | calc_sum) + if [ -z "$changes" ] || [ $changes -eq 0 ]; then + continue + fi + mds2sync="$mds2sync $node" + done + if [ "$mds2sync" == "" ]; then + return + fi + mds2sync=$(comma_list $mds2sync) + + # sync MDS transactions + do_nodes $mds2sync "lctl set_param -n osd*.*MD*.force_sync 1" + + # wait till all changes are sent and committed by OSTs + # for ldiskfs space is released upon execution, but DMU + # does this upon commit + + local WAIT=0 + while [ "$WAIT" -ne "$MAX_WAIT" ]; do + changes=$(do_nodes $mds2sync "lctl get_param -n osc.*MDT*.sync_*" \ + | calc_sum) + #echo "$node: $changes changes on all" + if [ "$changes" -eq "0" ]; then + etime=`date +%s` + #echo "delete took $((etime - stime)) seconds" + return + fi + sleep 1 + WAIT=$(( WAIT + 1)) + done - local WAIT=0 - local MAX_WAIT=20 - while [ "$WAIT" -ne "$MAX_WAIT" ]; do - sleep 1 - TOTAL=`lctl get_param -n osc.*.kbytesavail | \ - awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` - [ "$TOTAL" -eq "$TOTALPREV" ] && return 0 - echo "Waiting delete completed ... prev: $TOTALPREV current: $TOTAL " - TOTALPREV=$TOTAL - WAIT=$(( WAIT + 1)) - done - echo "Delete is not completed in $MAX_WAIT sec" - return 1 + etime=`date +%s` + echo "Delete is not completed in $((etime - stime)) seconds" + do_nodes $mds2sync "lctl get_param osc.*MDT*.sync_*" } wait_for_host() { @@ -1849,28 +1878,35 @@ wait_mds_ost_sync () { } wait_destroy_complete () { - echo "Waiting for destroy to be done..." 
- # MAX value shouldn't be big as this mean server responsiveness - # never increase this just to make test pass but investigate - # why it takes so long time - local MAX=5 - local WAIT=0 - while [ $WAIT -lt $MAX ]; do - local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight)) - local con=1 - for ((i=0; i<${#RPCs[@]}; i++)); do - [ ${RPCs[$i]} -eq 0 ] && continue - # there are still some destroy RPCs in flight - con=0 - break; - done - sleep 1 - [ ${con} -eq 1 ] && return 0 # done waiting - echo "Waiting $WAIT secs for destroys to be done." - WAIT=$((WAIT + 1)) - done - echo "Destroys weren't done in $MAX sec." - return 1 + echo "Waiting for local destroys to complete" + # MAX value shouldn't be big as this mean server responsiveness + # never increase this just to make test pass but investigate + # why it takes so long time + local MAX=5 + local WAIT=0 + while [ $WAIT -lt $MAX ]; do + local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight)) + local con=1 + local i + + for ((i=0; i<${#RPCs[@]}; i++)); do + [ ${RPCs[$i]} -eq 0 ] && continue + # there are still some destroy RPCs in flight + con=0 + break; + done + sleep 1 + [ ${con} -eq 1 ] && return 0 # done waiting + echo "Waiting ${WAIT}s for local destroys to complete" + WAIT=$((WAIT + 1)) + done + echo "Local destroys weren't done in $MAX sec." + return 1 +} + +wait_delete_completed() { + wait_delete_completed_mds $1 || return $? + wait_destroy_complete } wait_exit_ST () { -- 1.8.3.1