From e094fb967b3300b535ac80b2b627d645ab12b5c4 Mon Sep 17 00:00:00 2001 From: tappro Date: Tue, 13 Oct 2009 06:02:55 +0000 Subject: [PATCH] Branch HEAD b=19884,20274 i=vitaly i=rread wait for mds-ost sync and for destroy is done before calculating free space after tests --- lustre/osc/lproc_osc.c | 9 ++++++ lustre/tests/replay-dual.sh | 21 +++++++------- lustre/tests/replay-ost-single.sh | 18 +++++++----- lustre/tests/replay-single.sh | 22 +------------- lustre/tests/test-framework.sh | 60 ++++++++++++++++++++++++++++++++++++--- 5 files changed, 88 insertions(+), 42 deletions(-) diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 35ad7a7..4fe16bc 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -581,6 +581,14 @@ static int osc_wr_lockless_truncate(struct file *file, const char *buffer, count; } +static int osc_rd_destroys_in_flight(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ /* Read handler registered below under the name destroys_in_flight: formats obd->u.cli.cl_destroy_in_flight as an unsigned decimal plus newline into page (at most count bytes) and returns the snprintf() result; start, off and eof are unused. NOTE(review): data is presumably the OSC obd_device - confirm against the lprocfs registration. */ + struct obd_device *obd = data; + return snprintf(page, count, "%u\n", + atomic_read(&obd->u.cli.cl_destroy_in_flight)); +} + static struct lprocfs_vars lprocfs_osc_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "ping", 0, lprocfs_wr_ping, 0, 0, 0222 }, @@ -600,6 +608,7 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = { osc_wr_max_pages_per_rpc, 0 }, { "max_rpcs_in_flight", osc_rd_max_rpcs_in_flight, osc_wr_max_rpcs_in_flight, 0 }, + { "destroys_in_flight", osc_rd_destroys_in_flight, 0, 0 }, { "max_dirty_mb", osc_rd_max_dirty_mb, osc_wr_max_dirty_mb, 0 }, { "cur_dirty_bytes", osc_rd_cur_dirty_bytes, 0, 0 }, { "cur_grant_bytes", osc_rd_cur_grant_bytes, diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index c47082e..11a3122 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -236,7 +236,7 @@ test_13() { kill -USR1 $MULTIPID || return 3 wait $MULTIPID || return 4 - # drop close + # drop close do_facet $SINGLEMDS lctl set_param fail_loc=0x80000115 facet_failover $SINGLEMDS 
do_facet $SINGLEMDS lctl set_param fail_loc=0 @@ -279,17 +279,18 @@ test_14b() { createmany -o $MOUNT1/$tfile-3- 5 umount $MOUNT2 - facet_failover $SINGLEMDS - # expect recovery don't fail due to VBR - df $MOUNT1 || return 1 + fail $SINGLEMDS + wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" # first 25 files should have been replayed unlinkmany $MOUNT1/$tfile- 5 || return 2 unlinkmany $MOUNT1/$tfile-3- 5 || return 3 zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" - # give ost time to process llogs - sleep 3 + + wait_mds_ost_sync || return 4 + wait_destroy_complete || return 5 + AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" [ $AFTERUSED -ne $BEFOREUSED ] && \ @@ -298,7 +299,7 @@ test_14b() { } run_test 14b "delete ost orphans if gap occured in objids due to VBR" -test_15a() { # was test_15 +test_15a() { # was test_15 replay_barrier $SINGLEMDS createmany -o $MOUNT1/$tfile- 25 createmany -o $MOUNT2/$tfile-2- 1 @@ -318,14 +319,14 @@ run_test 15a "timeout waiting for lost client during replay, 1 client completes" test_15c() { replay_barrier $SINGLEMDS for ((i = 0; i < 2000; i++)); do - echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed" + echo "data" > "$MOUNT2/${tfile}-$i" || error "create ${tfile}-$i failed" done - + umount $MOUNT2 facet_failover $SINGLEMDS df $MOUNT || return 1 - + zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" return 0 } diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 4d7c5bb..1bdbcbf 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -124,7 +124,7 @@ iozone_bg () { # need to check iozone output on "complete" local iozonelog=$TMP/${TESTSUITE}.iozone.log rm -f $iozonelog - cat $tmppipe | while read line ; do + cat $tmppipe | while read line ; do echo "$line" echo "$line" >>$iozonelog done; @@ -138,7 +138,7 @@ iozone_bg () { fi rm -f $tmppipe rm -f 
$iozonelog - return $rc + return $rc } test_5() { @@ -158,7 +158,7 @@ test_5() { local pid=$! echo iozone bg pid=$pid - + sleep 8 fail ost1 local rc=0 @@ -187,7 +187,7 @@ test_6() { get_stripe_info client $f sync - sleep 2 # ensure we have a fresh statfs + sleep 2 # ensure we have a fresh statfs sync #define OBD_FAIL_MDS_REINT_NET_REP 0x119 do_facet mds "lctl set_param fail_loc=0x80000119" @@ -196,10 +196,12 @@ test_6() { (( $before > $after_dd )) || return 1 rm -f $f fail ost$((stripe_index + 1)) + wait_recovery_complete ost$((stripe_index + 1)) || error "OST recovery not done" $CHECKSTAT -t file $f && return 2 || true sync # let the delete happen - sleep 5 + wait_mds_ost_sync || return 4 + wait_destroy_complete || return 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account @@ -213,7 +215,7 @@ test_7() { before=`kbytesfree` dd if=/dev/urandom bs=4096 count=1280 of=$f || return 4 sync - sleep 2 # ensure we have a fresh statfs + sleep 2 # ensure we have a fresh statfs sync after_dd=`kbytesfree` log "before: $before after_dd: $after_dd" @@ -221,10 +223,12 @@ test_7() { replay_barrier ost1 rm -f $f fail ost1 + wait_recovery_complete ost1 || error "OST recovery not done" $CHECKSTAT -t file $f && return 2 || true sync # let the delete happen - sleep 5 + wait_mds_ost_sync || return 4 + wait_destroy_complete || return 5 after=`kbytesfree` log "before: $before after: $after" (( $before <= $after + 40 )) || return 3 # take OST logs into account diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 8a182a4..792176d 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -479,27 +479,7 @@ test_20b() { # bug 10480 fail $SINGLEMDS # start orphan recovery df -P $DIR || df -P $DIR || true # reconnect wait_recovery_complete $SINGLEMDS || error "MDS recovery not done" - - # just because recovery is done doesn't mean we've finished - # orphan cleanup. 
Wait for llogs to get synchronized. - echo waiting for orphan cleanup... - while [ true ]; do - local -a sync=($(do_nodes $(comma_list $(osts_nodes)) \ - "$LCTL get_param obdfilter.*.mds_sync" | awk -F= ' {print $2}')) - local con=1 - for ((i=0; i<${#sync[@]}; i++)); do - [ ${sync[$i]} -eq 0 ] && continue - # there is a not finished MDS-OST synchronization - con=0 - break; - done - [ ${con} -eq 1 ] && break - sleep 1 - done - - # let the statfs cache to get old enough. - sleep 1 - + wait_mds_ost_sync || return 3 AFTERUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` log "before $BEFOREUSED, after $AFTERUSED" [ $AFTERUSED -gt $((BEFOREUSED + 20)) ] && \ diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index be4ae46..c0c1bd9 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1013,12 +1013,13 @@ wait_delete_completed () { sleep 1 TOTAL=`lctl get_param -n osc.*.kbytesavail | \ awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` - [ "$TOTAL" -eq "$TOTALPREV" ] && break + [ "$TOTAL" -eq "$TOTALPREV" ] && return 0 echo "Waiting delete completed ... prev: $TOTALPREV current: $TOTAL " TOTALPREV=$TOTAL WAIT=$(( WAIT + 1)) done - echo "Delete completed." + echo "Delete is not completed in $MAX_WAIT sec" + return 1 } wait_for_host() { @@ -1036,12 +1037,12 @@ wait_for() { wait_recovery_complete () { local facet=$1 - # Use default policy if $2 is not passed by caller. + # Use default policy if $2 is not passed by caller. #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) # as we are in process of changing obd_timeout in different ways # let's set MAX longer than that local MAX=${2:-$(( TIMEOUT * 4 ))} - + local var_svc=${facet}_svc local procfile="*.${!var_svc}.recovery_status" local WAIT=0 @@ -1058,6 +1059,57 @@ wait_recovery_complete () { return 1 } +wait_mds_ost_sync () { + # just because recovery is done doesn't mean we've finished + # orphan cleanup. Wait for llogs to get synchronized. 
+    echo "Waiting for orphan cleanup..."
+    # MAX value includes time needed for MDS-OST reconnection
+    local MAX=$(( TIMEOUT * 2 ))
+    local WAIT=0
+    while [ $WAIT -lt $MAX ]; do
+        local -a sync=($(do_nodes $(comma_list $(osts_nodes)) \
+            "$LCTL get_param -n obdfilter.*.mds_sync"))
+        local con=1 i   # i is local so the scan cannot clobber a caller's loop index
+        for ((i=0; i<${#sync[@]}; i++)); do
+            [ ${sync[$i]} -eq 0 ] && continue
+            # there is a not finished MDS-OST synchronization
+            con=0
+            break;
+        done
+        sleep 2 # increase waiting time and cover statfs cache
+        [ ${con} -eq 1 ] && return 0
+        echo "Waiting $WAIT secs for mds-ost sync done."
+        WAIT=$((WAIT + 2))
+    done
+    echo "mds-ost sync not done in $MAX sec."
+    return 1
+}
+
+wait_destroy_complete () { # returns 0 once every osc.*.destroys_in_flight reads 0, 1 after MAX one-second polls
+    echo "Waiting for destroy to be done..."
+    # Keep MAX small: it reflects how responsive the servers are.
+    # Never raise it just to make a test pass; investigate
+    # why the destroys take so long instead.
+    local MAX=5
+    local WAIT=0
+    while [ $WAIT -lt $MAX ]; do
+        local -a RPCs=($($LCTL get_param -n osc.*.destroys_in_flight))
+        local con=1 i   # i is local so the scan cannot clobber a caller's loop index
+        for ((i=0; i<${#RPCs[@]}; i++)); do
+            [ ${RPCs[$i]} -eq 0 ] && continue
+            # there are still some destroy RPCs in flight
+            con=0
+            break;
+        done
+        sleep 1
+        [ ${con} -eq 1 ] && return 0 # done waiting
+        echo "Waiting $WAIT secs for destroys to be done."
+        WAIT=$((WAIT + 1))
+    done
+    echo "Destroys weren't done in $MAX sec."
+    return 1
+}
+
 wait_exit_ST () {
     local facet=$1
 
-- 
1.8.3.1