From: Yu Jian Date: Sat, 5 Jan 2013 10:10:11 +0000 (+0800) Subject: LU-1526 tests: Support for MDS-initiated OST_DESTROYs X-Git-Tag: v1_8_9_WC1_RC1~28 X-Git-Url: https://git.whamcloud.com/gitweb?p=fs%2Flustre-release.git;a=commitdiff_plain;h=8190021cfc1050418442bc41f808b1df71acb20f LU-1526 tests: Support for MDS-initiated OST_DESTROYs This patch is backported from commit af5f388 of LU-1303 to support interoperating with 2.4 server. The patch makes sure the tests work with MDSs that destroy OST objects asynchronously on behalf of clients. Signed-off-by: Yu Jian Change-Id: I8d8cb9e3699b6e7f63af106a5f45363f61f3ce7c Reviewed-on: http://review.whamcloud.com/4959 Tested-by: Hudson Reviewed-by: Andreas Dilger Reviewed-by: Li Wei Tested-by: Maloo --- diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 77f360c..b313a07 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -100,6 +100,8 @@ cat $LOG rm -f $OOS sync; sleep 1; sync +wait_delete_completed 300 + if [ $SUCCESS -eq 1 ]; then echo "Success!" rm -f $LOG diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 2ca4a08..12110ac 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -275,6 +275,8 @@ test_14a() { run_test 14a "timeouts waiting for lost client during replay" test_14b() { + wait_mds_ost_sync + wait_delete_completed BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` #lfs setstripe --index=0 --count=1 $MOUNT1 mkdir -p $MOUNT1/$tdir @@ -295,7 +297,7 @@ test_14b() { zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" wait_mds_ost_sync || return 5 - wait_destroy_complete || return 6 + wait_delete_completed || return 6 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 1ac89c3..9f7bd35 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -197,7 +197,7 @@ test_6() { sync # let the delete happen wait_mds_ost_sync || return 4 - wait_destroy_complete || return 5 + wait_delete_completed || return 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account @@ -224,7 +224,7 @@ test_7() { sync # let the delete happen wait_mds_ost_sync || return 4 - wait_destroy_complete || return 5 + wait_delete_completed || return 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account @@ -302,7 +302,7 @@ test_8d() { [ -z "$(lctl get_param -n mdc.${FSNAME}-*.connect_flags|grep einprogress)" \ ] && skip_env "MDS doesn't support EINPROGRESS" && return #define OBD_FAIL_MDS_DQACQ_NET 0x187 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x187" + do_facet mds "lctl set_param fail_loc=0x187" # test the non-intent create path mcreate $TDIR/$tfile & cpid=$! @@ -311,14 +311,14 @@ test_8d() { error "mknod finished incorrectly" return 1 fi - do_facet $SINGLEMDS "lctl set_param fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" wait $cpid || return 2 stat $TDIR/$tfile || error "mknod failed" rm $TDIR/$tfile #define OBD_FAIL_MDS_DQACQ_NET 0x187 - do_facet $SINGLEMDS "lctl set_param fail_loc=0x187" + do_facet mds "lctl set_param fail_loc=0x187" # test the intent create path openfile -f O_RDWR:O_CREAT $TDIR/$tfile & cpid=$! @@ -327,7 +327,7 @@ test_8d() { error "open finished incorrectly" return 3 fi - do_facet $SINGLEMDS "lctl set_param fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" wait $cpid || return 4 stat $TDIR/$tfile || error "open failed" } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 3229b22..62b6a00 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -978,6 +978,7 @@ reset_enospc() { [ "$OSTIDX" ] && list=$(facet_host ost$((OSTIDX + 1))) do_nodes $list lctl set_param fail_loc=0 + sync # initiate all OST_DESTROYs from MDS to OST sleep_maxage } @@ -4232,6 +4233,7 @@ test_101d() { set_read_ahead $old_READAHEAD rm -f $file + wait_delete_completed [ $time_ra_ON -lt $time_ra_OFF ] || error "read-ahead enabled time read (${time_ra_ON}s) is more than @@ -6243,6 +6245,7 @@ test_133c() { $LFS setstripe -c 1 -o 0 ${testdir}/${tfile} sync cancel_lru_locks osc + wait_delete_completed # clear stats. local dev=$(get_mds_mdt_device_proc_path) @@ -6261,6 +6264,7 @@ test_133c() { check_stats_facet ost1 "punch" 1 rm -f ${testdir}/${tfile} || error "file remove failed" + wait_delete_completed check_stats_facet ost1 "destroy" 1 rm -rf $DIR/${tdir} diff --git a/lustre/tests/sanityn.sh b/lustre/tests/sanityn.sh index 9a909ac..1105570 100644 --- a/lustre/tests/sanityn.sh +++ b/lustre/tests/sanityn.sh @@ -319,6 +319,7 @@ run_test 14d "chmod of executing file is still possible ========" test_15() { # bug 974 - ENOSPC echo "PATH=$PATH" sh oos2.sh $MOUNT1 $MOUNT2 + wait_delete_completed grant_error=`dmesg | grep "> available"` [ -z "$grant_error" ] || error "$grant_error" } @@ -602,6 +603,11 @@ run_test 31a "voluntary cancel / blocking ast race==============" test_31b() { remote_ost || { skip "local OST" && return 0; } remote_ost_nodsh && skip "remote OST w/o dsh" && return 0 + + # make sure there is no local locks due to destroy + wait_mds_ost_sync || error "wait_mds_ost_sync()" + wait_delete_completed || error "wait_delete_completed()" + mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir" lfs setstripe $DIR/$tdir/$tfile -i 0 -c 1 cp /etc/hosts $DIR/$tdir/$tfile @@ -834,7 +840,7 @@ test_36() { #bug 16417 rm -f $DIR1/$tdir/file000 kill -USR1 $read_pid wait $read_pid - sleep 1 + wait_delete_completed local after=$($LFS df | awk '{if ($1 ~/^filesystem/) {print $5; exit} }') echo "*** cycle($i) *** before($before):after_dd($after_dd):after($after)" # this free space! not used diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 5f96dd0..9736f4e 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1186,23 +1186,55 @@ wait_update_facet () { wait_update $(facet_active_host $facet) "$@" } -wait_delete_completed () { - local TOTALPREV=`lctl get_param -n osc.*.kbytesavail | \ - awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` - - local WAIT=0 - local MAX_WAIT=20 - while [ "$WAIT" -ne "$MAX_WAIT" ]; do - sleep 1 - TOTAL=`lctl get_param -n osc.*.kbytesavail | \ - awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` - [ "$TOTAL" -eq "$TOTALPREV" ] && return 0 - echo "Waiting delete completed ... prev: $TOTALPREV current: $TOTAL " - TOTALPREV=$TOTAL - WAIT=$(( WAIT + 1)) - done - echo "Delete is not completed in $MAX_WAIT sec" - return 1 +wait_delete_completed_mds() { + [[ $(lustre_version_code mds) -lt $(version_code 2.2.58) ]] && + return 0 + + local MAX_WAIT=${1:-20} + local mds2sync="" + local stime=`date +%s` + local etime + local node + local changes + + # find MDS with pending deletions + for node in $(mdts_nodes); do + changes=$(do_node $node "lctl get_param -n osc.*MDT*.sync_*" \ + 2>/dev/null | calc_sum) + if [ -z "$changes" ] || [ $changes -eq 0 ]; then + continue + fi + mds2sync="$mds2sync $node" + done + if [ "$mds2sync" == "" ]; then + return + fi + mds2sync=$(comma_list $mds2sync) + + # sync MDS transactions + do_nodes $mds2sync "lctl set_param -n osd*.*MD*.force_sync 1" + + # wait till all changes are sent and commmitted by OSTs + # for ldiskfs space is released upon execution, but DMU + # do this upon commit + + local WAIT=0 + while [ "$WAIT" -ne "$MAX_WAIT" ]; do + changes=$(do_nodes $mds2sync "lctl get_param -n osc.*MDT*.sync_*" \ + | calc_sum) + #echo "$node: $changes changes on all" + if [ "$changes" -eq "0" ]; then + etime=`date +%s` + #echo "delete took $((etime - stime)) seconds" + return + fi + sleep 1 + WAIT=$(( WAIT + 1)) + done + + etime=`date +%s` + echo "Delete is not completed in $((etime - stime)) seconds" + do_nodes $mds2sync "lctl get_param osc.*MDT*.sync_*" } wait_for_host() { @@ -1295,28 +1327,35 @@ wait_mds_ost_sync () { } wait_destroy_complete () { - echo "Waiting for destroy to be done..." - # MAX value shouldn't be big as this mean server responsiveness - # never increase this just to make test pass but investigate - # why it takes so long time - local MAX=5 - local WAIT=0 - while [ $WAIT -lt $MAX ]; do - local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight)) - local con=1 - for ((i=0; i<${#RPCs[@]}; i++)); do - [ ${RPCs[$i]} -eq 0 ] && continue - # there are still some destroy RPCs in flight - con=0 - break; - done - sleep 1 - [ ${con} -eq 1 ] && return 0 # done waiting - echo "Waiting $WAIT secs for destroys to be done." - WAIT=$((WAIT + 1)) - done - echo "Destroys weren't done in $MAX sec." - return 1 + echo "Waiting for local destroys to complete" + # MAX value shouldn't be big as this mean server responsiveness + # never increase this just to make test pass but investigate + # why it takes so long time + local MAX=5 + local WAIT=0 + while [ $WAIT -lt $MAX ]; do + local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight)) + local con=1 + local i + + for ((i=0; i<${#RPCs[@]}; i++)); do + [ ${RPCs[$i]} -eq 0 ] && continue + # there are still some destroy RPCs in flight + con=0 + break; + done + sleep 1 + [ ${con} -eq 1 ] && return 0 # done waiting + echo "Waiting ${WAIT}s for local destroys to complete" + WAIT=$((WAIT + 1)) + done + echo "Local destroys weren't done in $MAX sec." + return 1 +} + +wait_delete_completed() { + wait_delete_completed_mds $1 || return $? + wait_destroy_complete } wait_exit_ST () {