From 4cfa81cb40f2edfcbe6df8c866defbf22640d725 Mon Sep 17 00:00:00 2001
From: grev
Date: Mon, 2 Feb 2009 12:51:36 +0000
Subject: [PATCH] b=16818 i=Nathan get rid of set_obd_timeout(), improve test_26*

---
Notes (below the "---" separator, so not part of the commit message):
 * This copy was re-flowed from a whitespace-collapsed dump.  Line structure
   was rebuilt from the hunk headers and matches the diffstat; leading
   indentation is reconstructed and may differ from the tree, so use a
   whitespace-tolerant tool (e.g. "patch -l") if exact matching fails.
 * wait_update() polls "TEST" on "node" every 5 sec until its numeric output
   equals "FINAL" or "MAX" sec (4th argument, default 90) have elapsed;
   returns 0 on match, 3 on timeout.  WAIT is now advanced right after the
   sleep so the reported elapsed time includes the first interval.
 * wait_update_facet() now forwards only the arguments after the facet name
   (quoted), instead of passing the facet name through as the TEST command.
 * check_timeout() now echoes $TIMEOUT, the variable it actually compares,
   instead of the unset $timeout.

 lustre/tests/acceptance-small.sh |  2 +-
 lustre/tests/conf-sanity.sh      | 23 -------------
 lustre/tests/recovery-small.sh   | 63 ++++++++++++++++++++--------------
 lustre/tests/test-framework.sh   | 73 ++++++++++++++++++++++++++++------------
 4 files changed, 89 insertions(+), 72 deletions(-)

diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh
index 70b91ec..ccf9cf4 100755
--- a/lustre/tests/acceptance-small.sh
+++ b/lustre/tests/acceptance-small.sh
@@ -53,7 +53,7 @@ setup_if_needed() {
     local MOUNTED=$(mounted_lustre_filesystems)
     if $(echo $MOUNTED | grep -w -q $MOUNT); then
         check_config $MOUNT
-        init_versions_vars
+        init_param_vars
         return
     fi
 
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 1e65b9e..375703d 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -764,29 +764,6 @@ test_26() {
 }
 run_test 26 "MDT startup failure cleans LOV (should return errs)"
 
-wait_update () {
-	local node=$1
-	local TEST=$2
-	local FINAL=$3
-
-	local RESULT
-	local MAX=90
-	local WAIT=0
-	local sleep=5
-	while [ $WAIT -lt $MAX ]; do
-		RESULT=$(do_node $node "$TEST")
-		if [ $RESULT -eq $FINAL ]; then
-			echo "Updated config after $WAIT sec: wanted $FINAL got $RESULT"
-			return 0
-		fi
-		WAIT=$((WAIT + sleep))
-		echo "Waiting $((MAX - WAIT)) secs for config update"
-		sleep $sleep
-	done
-	echo "Config update not seen after $MAX sec: wanted $FINAL got $RESULT"
-	return 3
-}
-
 set_and_check() {
 	local myfacet=$1
 	local TEST=$2
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 54b6816..715990c 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -649,54 +649,65 @@ test_24() { # bug 11710 details correct fsync() behavior
 }
 run_test 24 "fsync error (should return error)"
 
+wait_client_evicted () {
+	local facet=$1
+	local exports=$2
+	local varsvc=${facet}_svc
+
+	wait_update $(facet_host $facet) "lctl get_param -n *.${!varsvc}.num_exports | cut -d' ' -f2" $((exports - 1)) $3
+}
+
 test_26a() { # was test_26 bug 5921 - evict dead exports by pinger
 # this test can only run from a client on a separate node.
 	remote_ost || { skip "local OST" && return 0; }
 	remote_ost_nodsh && skip "remote OST with nodsh" && return 0
 	remote_mds || { skip "local MDS" && return 0; }
-	OST_FILE=obdfilter.${ost1_svc}.num_exports
-	OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
-	OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2`
-	echo starting with $OST_NEXP1 OST exports
+
+	check_timeout || return 1
+
+	local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2)
+
+	echo starting with $OST_NEXP OST exports
 # OBD_FAIL_PTLRPC_DROP_RPC 0x505
 	do_facet client lctl set_param fail_loc=0x505
 	# evictor takes up to 2.25x to evict. But if there's a
 	# race to start the evictor from various obds, the loser
 	# might have to wait for the next ping.
-	echo Waiting for $(($TIMEOUT * 4)) secs
-	sleep $(($TIMEOUT * 4))
-	OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`"
-	OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2`
-	echo ending with $OST_NEXP2 OST exports
+
+	local rc=0
+	wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4))
+	rc=$?
 	do_facet client lctl set_param fail_loc=0x0
-	[ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted"
-	return 0
+	[ $rc -eq 0 ] || error "client not evicted from OST"
 }
 run_test 26a "evict dead exports"
 
 test_26b() { # bug 10140 - evict dead exports by pinger
 	remote_ost_nodsh && skip "remote OST with nodsh" && return 0
 
+	check_timeout || return 1
 	client_df
 	zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2"
-	MDS_FILE=mds.${mds_svc}.num_exports
-	MDS_NEXP1="`do_facet mds lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
-	OST_FILE=obdfilter.${ost1_svc}.num_exports
-	OST_NEXP1="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
-	echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports
+
+	local MDS_NEXP=$(do_facet mds lctl get_param -n mds.${mds_svc}.num_exports | cut -d' ' -f2)
+	local OST_NEXP=$(do_facet ost1 lctl get_param -n obdfilter.${ost1_svc}.num_exports | cut -d' ' -f2)
+
+	echo starting with $OST_NEXP OST and $MDS_NEXP MDS exports
+
 	#force umount a client; exports should get evicted
 	zconf_umount `hostname` $MOUNT2 -f
+
 	# evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict.
-	# But if there's a race to start the evictor from various obds,
-	# the loser might have to wait for the next ping.
-	echo Waiting for $(($TIMEOUT * 3)) secs
-	sleep $(($TIMEOUT * 3))
-	OST_NEXP2="`do_facet ost1 lctl get_param -n $OST_FILE | cut -d' ' -f2`"
-	MDS_NEXP2="`do_facet mds lctl get_param -n $MDS_FILE | cut -d' ' -f2`"
-	echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports
-	[ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted from OST"
-	[ $MDS_NEXP1 -le $MDS_NEXP2 ] && error "client not evicted from MDS"
-	return 0
+	# But if there's a race to start the evictor from various obds,
+	# the loser might have to wait for the next ping.
+	# PING_INTERVAL max(obd_timeout / 4, 1U)
+	# sleep (2*PING_INTERVAL)
+
+	local rc=0
+	wait_client_evicted ost1 $OST_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \
+		error "Client was not evicted by ost" rc=1
+	wait_client_evicted mds $MDS_NEXP $((TIMEOUT * 2 + TIMEOUT * 3 / 4)) || \
+		error "Client was not evicted by mds"
 }
 run_test 26b "evict dead exports"
 
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index deda674..d6775e8 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -511,6 +511,34 @@ cleanup_check() {
     return 0
 }
 
+wait_update () {
+    local node=$1
+    local TEST=$2
+    local FINAL=$3
+    local MAX=${4:-90}
+
+    local RESULT
+    local WAIT=0
+    local sleep=5
+    while [ $WAIT -lt $MAX ]; do
+        sleep $sleep
+        WAIT=$((WAIT + sleep))
+        RESULT=$(do_node $node "$TEST")
+        if [ $RESULT -eq $FINAL ]; then
+            echo "Updated after $WAIT sec: wanted $FINAL got $RESULT"
+            return 0
+        fi
+        echo "Waiting $((MAX - WAIT)) secs for update"
+    done
+    echo "Update not seen after $MAX sec: wanted $FINAL got $RESULT"
+    return 3
+}
+
+wait_update_facet () {
+    local facet=$1
+    wait_update $(facet_host $facet) "${@:2}"
+}
+
 wait_delete_completed () {
     local TOTALPREV=`lctl get_param -n osc.*.kbytesavail | \
                     awk 'BEGIN{total=0}; {total+=$1}; END{print total}'`
@@ -530,14 +558,14 @@ wait_delete_completed () {
 }
 
 wait_for_host() {
-    HOST=$1
+    local HOST=$1
     check_network "$HOST" 900
     while ! do_node $HOST "ls -d $LUSTRE " > /dev/null; do sleep 5; done
 }
 
 wait_for() {
-    facet=$1
-    HOST=`facet_active_host $facet`
+    local facet=$1
+    local HOST=`facet_active_host $facet`
     wait_for_host $HOST
 }
 
@@ -546,8 +574,8 @@ wait_mds_recovery_done () {
     #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
     # as we are in process of changing obd_timeout in different ways
     # let's set MAX longer than that
-    MAX=$(( timeout * 4 ))
-    WAIT=0
+    local MAX=$(( timeout * 4 ))
+    local WAIT=0
     while [ $WAIT -lt $MAX ]; do
         STATUS=`do_facet mds "lctl get_param -n mds.*-MDT*.recovery_status | grep status"`
         echo $STATUS | grep COMPLETE && return 0
@@ -634,8 +662,8 @@ client_reconnect() {
 }
 
 facet_failover() {
-    facet=$1
-    sleep_time=$2
+    local facet=$1
+    local sleep_time=$2
     echo "Failing $facet on node `facet_active_host $facet`"
     shutdown_facet $facet
     [ -n "$sleep_time" ] && sleep $sleep_time
@@ -976,16 +1004,6 @@ remount_client()
     zconf_mount `hostname` $1 || error "mount failed"
 }
 
-set_obd_timeout() {
-    local facet=$1
-    local timeout=$2
-
-    do_facet $facet lsmod | grep -q obdclass || \
-        do_facet $facet "modprobe obdclass"
-
-    do_facet $facet "lctl set_param timeout=$timeout"
-}
-
 writeconf_facet () {
     local facet=$1
     local dev=$2
@@ -1010,7 +1028,6 @@ setupall() {
     echo $WRITECONF | grep -q "writeconf" && \
         writeconf_all
 
-    set_obd_timeout mds $TIMEOUT
     start mds $MDSDEV $MDS_MOUNT_OPTS
     # We started mds, now we should set failover variable properly.
     # Set mdsfailover_HOST if it is not set (the default failnode).
@@ -1018,7 +1035,6 @@ setupall() {
 
     for num in `seq $OSTCOUNT`; do
         DEVNAME=`ostdevname $num`
-        set_obd_timeout ost$num $TIMEOUT
         start ost$num $DEVNAME $OST_MOUNT_OPTS
 
         # We started ost$num, now we should set ost${num}failover variable properly.
@@ -1039,7 +1055,7 @@ setupall() {
         [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT2
     fi
     sleep 5
-    init_versions_vars
+    init_param_vars
 }
 
 mounted_lustre_filesystems() {
@@ -1077,10 +1093,13 @@ init_facets_vars () {
     done
 }
 
-init_versions_vars () {
+init_param_vars () {
     export MDSVER=$(do_facet mds "lctl get_param version" | cut -d. -f1,2)
     export OSTVER=$(do_facet ost1 "lctl get_param version" | cut -d. -f1,2)
     export CLIVER=$(lctl get_param version | cut -d. -f 1,2)
+
+    TIMEOUT=$(do_facet mds "lctl get_param -n timeout")
+    log "Using TIMEOUT=$TIMEOUT"
 }
 
 check_config () {
@@ -1101,6 +1120,16 @@ check_config () {
     fi
 }
 
+check_timeout () {
+    local mdstimeout=$(do_facet mds "lctl get_param -n timeout")
+    local cltimeout=$(lctl get_param -n timeout)
+    echo $TIMEOUT
+    if [ $mdstimeout -ne $TIMEOUT ] || [ $mdstimeout -ne $cltimeout ]; then
+        error "timeouts are wrong! mds: $mdstimeout, client: $cltimeout, TIMEOUT=$TIMEOUT"
+        return 1
+    fi
+}
+
 check_and_setup_lustre() {
     local MOUNTED=$(mounted_lustre_filesystems)
     if [ -z "$MOUNTED" ] || ! $(echo $MOUNTED | grep -w -q $MOUNT); then
@@ -1112,7 +1141,7 @@ check_and_setup_lustre() {
     else
         check_config $MOUNT
         init_facets_vars
-        init_versions_vars
+        init_param_vars
     fi
     if [ "$ONLY" == "setup" ]; then
         exit 0
-- 
1.8.3.1