From af666bef058c5b7997527fc851a84a89375912fb Mon Sep 17 00:00:00 2001 From: Andreas Dilger Date: Wed, 20 Oct 2021 19:47:25 -0600 Subject: [PATCH] LU-12857 tests: allow clients to be IDLE after recovery If clients are not connected to an OST when it fails (connection is IDLE), they do not need to be involved in recovery, so this should not be considered an error when checking the client state. Test-Parameters: trivial testlist=recovery-mds-scale env=SLOW=no Test-Parameters: testlist=conf-sanity Test-Parameters: testlist=replay-dual,replay-single Signed-off-by: Andreas Dilger Change-Id: I6cfeb718acd233378ed1608f22061bc15c3ebbe5 Reviewed-on: https://review.whamcloud.com/45318 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: James Nunez Reviewed-by: Oleg Drokin --- lustre/tests/recovery-mds-scale.sh | 9 ++-- lustre/tests/test-framework.sh | 90 +++++++++++++++++++------------------- 2 files changed, 49 insertions(+), 50 deletions(-) diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh index 134f285..cc78f5f 100644 --- a/lustre/tests/recovery-mds-scale.sh +++ b/lustre/tests/recovery-mds-scale.sh @@ -155,11 +155,10 @@ failover_target() { exit 7 fi - log "Checking clients are in FULL state before doing next failover..." - if ! wait_clients_import_state $NODES_TO_USE $serverfacet FULL; then - echo "Clients import not FULL, please consider to increase \ -SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD!" - fi + want="FULL|IDLE" + log "Checking clients are in $want state before next failover" + wait_clients_import_state $NODES_TO_USE $serverfacet "$want" || + echo "Client import not $want, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD!" log "Starting failover on $serverfacet" facet_failover "$serverfacet" || exit 1 diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 4da8b3f..3d0398f 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -7456,48 +7456,49 @@ get_osc_import_name() { } _wait_import_state () { - local expected=$1 - local CONN_PROC=$2 - local maxtime=${3:-$(max_recovery_time)} - local error_on_failure=${4:-1} - local CONN_STATE - local i=0 + local expected="$1" + local CONN_PROC="$2" + local maxtime=${3:-$(max_recovery_time)} + local err_on_fail=${4:-1} + local CONN_STATE + local i=0 CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2 | uniq) - while ! echo "${CONN_STATE}" | egrep -q "^${expected}\$" ; do - if [ "${expected}" == "DISCONN" ]; then - # for disconn we can check after proc entry is removed - [ "x${CONN_STATE}" == "x" ] && return 0 - # with AT enabled, we can have connect request timeout near of - # reconnect timeout and test can't see real disconnect - [ "${CONN_STATE}" == "CONNECTING" ] && return 0 - fi - if [ $i -ge $maxtime ]; then - [ $error_on_failure -ne 0 ] && \ - error "can't put import for $CONN_PROC into ${expected}" \ - "state after $i sec, have ${CONN_STATE}" - return 1 - fi - sleep 1 - # Add uniq for multi-mount case - CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2 | uniq) - i=$(($i + 1)) - done + while ! echo "${CONN_STATE}" | egrep -q "^${expected}\$" ; do + if [[ "${expected}" == "DISCONN" ]]; then + # for disconn we can check after proc entry is removed + [[ -z "${CONN_STATE}" ]] && return 0 + # with AT, we can have connect request timeout near + # reconnect timeout and test can't see real disconnect + [[ "${CONN_STATE}" == "CONNECTING" ]] && return 0 + fi + if (( $i >= $maxtime )); then + (( $err_on_fail != 0 )) && + error "can't put import for $CONN_PROC into ${expected} state after $i sec, have ${CONN_STATE}" + return 1 + fi + sleep 1 + # Add uniq for multi-mount case + CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | + cut -f2 | uniq) + i=$((i + 1)) + done - log "$CONN_PROC in ${CONN_STATE} state after $i sec" - return 0 + log "$CONN_PROC in ${CONN_STATE} state after $i sec" + return 0 } wait_import_state() { - local state=$1 - local params=$2 - local maxtime=${3:-$(max_recovery_time)} - local error_on_failure=${4:-1} - local param - - for param in ${params//,/ }; do - _wait_import_state $state $param $maxtime $error_on_failure || return - done + local expected="$1" + local params="$2" + local maxtime=${3:-$(max_recovery_time)} + local err_on_fail=${4:-1} + local param + + for param in ${params//,/ }; do + _wait_import_state "$expected" "$param" $maxtime $err_on_fail || + return + done } wait_import_state_mount() { @@ -7505,7 +7506,7 @@ wait_import_state_mount() { return 0 fi - wait_import_state $* + wait_import_state "$@" } # One client request could be timed out because server was not ready @@ -7704,11 +7705,10 @@ do_rpc_nodes () { } wait_clients_import_state () { - local list=$1 - local facet=$2 - local expected=$3 - - local facets=$facet + local list="$1" + local facet="$2" + local expected="$3" + local facets="$facet" if [ "$FAILURE_MODE" = HARD ]; then facets=$(facets_on_host $(facet_active_host $facet)) @@ -7719,11 +7719,11 @@ wait_clients_import_state () { local proc_path case $facet in ost* ) proc_path="osc.$(get_clientosc_proc_path \ - $label).ost_server_uuid" ;; + $label).ost_server_uuid" ;; mds* ) proc_path="mdc.$(get_clientmdc_proc_path \ - $label).mds_server_uuid" ;; + $label).mds_server_uuid" ;; mgs* ) proc_path="mgc.$(get_clientmgc_proc_path \ - $label).mgs_server_uuid" ;; + $label).mgs_server_uuid" ;; *) error "unknown facet!" ;; esac -- 1.8.3.1