From: Elena Gryaznova
Date: Mon, 16 Aug 2010 16:23:10 +0000 (+0400)
Subject: b=20407 TF: "HARD" failovers with multiple targets per server
X-Git-Tag: v1_8_4_51~32
X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=f9feb40ec30de1f6ad01b7205cb5c67273f464b6;p=fs%2Flustre-release.git

b=20407 TF: "HARD" failovers with multiple targets per server
i=Brian.Murrell
i=Li.Wei
---

diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh
index fd1f60f..704741e 100644
--- a/lustre/tests/cfg/local.sh
+++ b/lustre/tests/cfg/local.sh
@@ -66,7 +66,7 @@ if [[ $mds_HOST == $mgs_HOST ]] && [[ $MDSDEV == $MGSDEV ]]; then
     MDS_MKFS_OPTS="--mgs $MDS_MKFS_OPTS"
 else
     MDS_MKFS_OPTS="--mgsnode=$MGSNID $MDS_MKFS_OPTS"
-    mgs_MKFS_OPTS="--mgs --device-size=$MGSSIZE"
+    MGS_MKFS_OPTS="--mgs --device-size=$MGSSIZE"
 fi
 
 MKFSOPT=""
@@ -80,7 +80,7 @@ OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID -
 
 MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:-"-o loop"}
 OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-"-o loop"}
-mgs_MOUNT_OPTS=${mgs_MOUNT_OPTS:-$MDS_MOUNT_OPTS}
+MGS_MOUNT_OPTS=${MGS_MOUNT_OPTS:-$MDS_MOUNT_OPTS}
 
 #client
 MOUNT=${MOUNT:-/mnt/${FSNAME}}
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index ceba457..82d0524 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -85,7 +85,7 @@ reformat_and_config() {
 
 start_mgs () {
     echo "start mgs"
-    start mgs $MGSDEV $mgs_MOUNT_OPTS
+    start mgs $MGSDEV $MGS_MOUNT_OPTS
 }
 
 start_mds() {
diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh
index d1345f1..69f074d 100755
--- a/lustre/tests/insanity.sh
+++ b/lustre/tests/insanity.sh
@@ -13,10 +13,8 @@ init_test_env $@
 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
 
 if [ "$FAILURE_MODE" = "HARD" ]; then
-    mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \
-        echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " && \
-        echo "Except the tests: $CONFIG_EXCEPTIONS" && \
-        ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
+    skip_env "$0: is not functional with FAILURE_MODE = HARD, please use recovery-double-scale, bz20407"
+    exit 0
 fi
 
 #
@@ -194,10 +192,10 @@ test_2() {
 
     echo "Reintegrating OST"
     reboot_facet ost1
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1 || return 2
 
-    wait_for mds
+    wait_for_facet mds
     start mds $MDSDEV $MDS_MOUNT_OPTS || return $?
 
     #Check FS
@@ -270,10 +268,10 @@ test_4() {
     #Reintegration
     echo "Reintegrating OST"
     reboot_facet ost1
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
 
-    wait_for mds
+    wait_for_facet mds
     start mds $MDSDEV $MDS_MOUNT_OPTS
 
     #Check FS
@@ -320,9 +318,9 @@ test_5() {
 
     #Reintegration
     echo "Reintegrating OSTs"
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
-    wait_for ost2
+    wait_for_facet ost2
     start_ost 2
 
     clients_recover_osts ost1
@@ -371,7 +369,7 @@ test_6() {
 
     #Reintegration
     echo "Reintegrating OST/CLIENTs"
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
     reintegrate_clients || return 1
     sleep 5
@@ -491,7 +489,7 @@ test_8() {
     #Reintegration
     echo "Reintegrating CLIENTs/OST"
     reintegrate_clients || return 3
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
     wait $DFPID
     clients_up || return 1
diff --git a/lustre/tests/mmp.sh b/lustre/tests/mmp.sh
index 6986c1a..10b748b 100755
--- a/lustre/tests/mmp.sh
+++ b/lustre/tests/mmp.sh
@@ -346,7 +346,7 @@ mount_after_reboot() {
     if [ "$FAILURE_MODE" = "HARD" ]; then
         shutdown_facet $facet
         reboot_facet $facet
-        wait_for $facet
+        wait_for_facet $facet
     else
         replay_barrier_nodf $facet
     fi
diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh
index bdb765c..f808cc1 100644
--- a/lustre/tests/recovery-double-scale.sh
+++ b/lustre/tests/recovery-double-scale.sh
@@ -76,7 +76,7 @@ reboot_recover_node () {
     # MDS, OST item contains the facet
     case $nodetype in
         MDS|OST )   facet_failover $item
-                    [ "$SERIAL" ] && wait_recovery_complete $item $((timeout * 4)) || true
+                    [ "$SERIAL" ] && wait_recovery_complete $item || true
                     ;;
         clients)    for c in ${item//,/ }; do
                         shutdown_client $c
diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh
index 2ef17cc..0d579bb 100644
--- a/lustre/tests/recovery-mds-scale.sh
+++ b/lustre/tests/recovery-mds-scale.sh
@@ -223,11 +223,10 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
     fi
 
     log "Wait $SERVERFACET recovery complete before doing next failover ...."
-    if [[ $(server_numfailovers $SERVERFACET) != 0 ]]; then
-        if ! wait_recovery_complete $SERVERFACET ; then
-            echo "$SERVERFACET recovery is not completed!"
-            exit 7
-        fi
+
+    if ! wait_recovery_complete $SERVERFACET ; then
+        echo "$SERVERFACET recovery is not completed!"
+        exit 7
     fi
 
     log "Checking clients are in FULL state before doing next failover"
diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh
index 945c4db..bf6b63c 100644
--- a/lustre/tests/recovery-random-scale.sh
+++ b/lustre/tests/recovery-random-scale.sh
@@ -244,7 +244,7 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
     log "Starting failover on $SERVERFACET"
     facet_failover "$SERVERFACET" || exit 1
 
-    if ! wait_recovery_complete $SERVERFACET $((TIMEOUT * 10)); then
+    if ! wait_recovery_complete $SERVERFACET ; then
         echo "$SERVERFACET recovery is not completed!"
         exit 7
     fi
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 987b771..03ec75d 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -13,13 +13,6 @@ init_test_env $@
 
 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
-    CONFIG_EXCEPTIONS="52"
-    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
-    echo "Except the tests: $CONFIG_EXCEPTIONS"
-    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
 # also long tests: 19, 21a, 21e, 21f, 23, 27
 #                                   1  2.5  2.5    4    4  (min)"
 [ "$SLOW" = "no" ] && EXCEPT_SLOW="17  26a  26b     50   51     57"
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index 6cfb8b2..f814369 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -13,13 +13,6 @@ CLEANUP=${CLEANUP:-""}
 MOUNT_2=${MOUNT_2:-"yes"}
 
 . $LUSTRE/tests/test-framework.sh
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
-    CONFIG_EXCEPTIONS="17"
-    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
-    echo "Except the tests: $CONFIG_EXCEPTIONS"
-    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
 init_test_env $@
 
 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 645efd6..f493db8 100644
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -22,13 +22,6 @@ remote_mds_nodsh && log "SKIP: remote MDS with nodsh" && exit 0
 # bug number:
 ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT"
 
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
-    CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
-    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
-    echo "Except the tests: $CONFIG_EXCEPTIONS"
-    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
 #                                  63 min  7 min  AT AT AT AT"
 [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a  44b  65 66 67 68"
@@ -1482,9 +1475,9 @@ test_61d() { # bug 16002
 #define OBD_FAIL_OBD_LLOG_SETUP                          0x605
     stop mgs
     do_facet mgs "lctl set_param fail_loc=0x80000605"
-    start mgs $MGSDEV $mgs_MOUNT_OPTS && error "mgs start should have failed"
+    start mgs $MGSDEV $MGS_MOUNT_OPTS && error "mgs start should have failed"
     do_facet mgs "lctl set_param fail_loc=0"
-    start mgs $MGSDEV $mgs_MOUNT_OPTS || error "cannot restart mgs"
+    start mgs $MGSDEV $MGS_MOUNT_OPTS || error "cannot restart mgs"
 }
 run_test 61d "error in llog_setup should cleanup the llog context correctly"
 
@@ -2222,12 +2215,12 @@ test_87() { #bug 17485
 
     reboot_facet mds
     change_active mds
-    wait_for mds
+    wait_for_facet mds
     mount_facet mds || error "Restart of mds failed"
 
     reboot_facet ost1
     change_active ost1
-    wait_for ost1
+    wait_for_facet ost1
     mount_facet ost1 || error "Restart of ost1 failed"
 
     clients_up
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 7802fc1..4f655b8 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -710,12 +710,18 @@ fi
 fi"
 }
 
-shudown_node_hard () {
+shutdown_node () {
+    local node=$1
+    echo + $POWER_DOWN $node
+    $POWER_DOWN $node
+}
+
+shutdown_node_hard () {
     local host=$1
     local attempts=3
 
     for i in $(seq $attempts) ; do
-        $POWER_DOWN $host
+        shutdown_node $host
        sleep 1
        ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0
        echo "waiting for $host to fail attempts=$attempts"
@@ -730,25 +736,48 @@ shutdown_client() {
     local attempts=3
 
     if [ "$FAILURE_MODE" = HARD ]; then
-        shudown_node_hard $client
+        shutdown_node_hard $client
     else
        zconf_umount_clients $client $mnt -f
     fi
 }
 
+facets_on_host () {
+    local host=$1
+    local facets="$(get_facets OST),$(get_facets MDS)"
+    local affected
+
+    combined_mgs_mds || facets="$facets,mgs"
+
+    for facet in ${facets//,/ }; do
+        if [ $(facet_active_host $facet) == $host ]; then
+            affected="$affected $facet"
+        fi
+    done
+
+    echo $(comma_list $affected)
+}
+
 shutdown_facet() {
     local facet=$1
+
     if [ "$FAILURE_MODE" = HARD ]; then
-        shudown_node_hard $(facet_active_host $facet)
-    elif [ "$FAILURE_MODE" = SOFT ]; then
+        shutdown_node_hard $(facet_active_host $facet)
+    else
         stop $facet
     fi
 }
 
+reboot_node() {
+    local node=$1
+    echo + $POWER_UP $node
+    $POWER_UP $node
+}
+
 reboot_facet() {
-    facet=$1
+    local facet=$1
     if [ "$FAILURE_MODE" = HARD ]; then
-        $POWER_UP `facet_active_host $facet`
+        reboot_node $(facet_active_host $facet)
     else
         sleep 10
     fi
@@ -757,7 +786,7 @@ reboot_facet() {
 boot_node() {
     local node=$1
     if [ "$FAILURE_MODE" = HARD ]; then
-       $POWER_UP $node
+       reboot_node $node
        wait_for_host $node
     fi
 }
@@ -983,39 +1012,68 @@ wait_delete_completed () {
 }
 
 wait_for_host() {
-    local host=$1
-    check_network "$host" 900
-    while ! do_node $host hostname > /dev/null; do sleep 5; done
+    local hostlist=$1
+
+    # we can use "for" here because we are waiting the slowest
+    for host in ${hostlist//,/ }; do
+        check_network "$host" 900
+    done
+    while ! do_nodes $hostlist hostname > /dev/null; do sleep 5; done
 }
 
-wait_for() {
-    local facet=$1
-    local host=`facet_active_host $facet`
-    wait_for_host $host
+wait_for_facet() {
+    local facetlist=$1
+    local hostlist
+
+    for facet in ${facetlist//,/ }; do
+        hostlist=$(expand_list $hostlist $(facet_active_host $facet))
+    done
+    wait_for_host $hostlist
 }
 
-wait_recovery_complete () {
-    local facet=$1
+_wait_recovery_complete () {
+    local param=$1
 
     # Use default policy if $2 is not passed by caller.
     local MAX=${2:-$(max_recovery_time)}
 
-    local var_svc=${facet}_svc
-    local procfile="*.${!var_svc}.recovery_status"
-
     local WAIT=0
     local STATUS=
 
     while [ $WAIT -lt $MAX ]; do
-        STATUS=$(do_facet $facet lctl get_param -n $procfile | grep status)
-        [[ $STATUS = "status: COMPLETE" ]] && return 0
+        STATUS=$(lctl get_param -n $param | grep status)
+        echo $param $STATUS
+        [[ $STATUS = "status: COMPLETE" || $STATUS = "status: INACTIVE" ]] && return 0
         sleep 5
         WAIT=$((WAIT + 5))
-        echo "Waiting $((MAX - WAIT)) secs for $facet recovery done. $STATUS"
+        echo "Waiting $((MAX - WAIT)) secs for $param recovery done. $STATUS"
     done
-    echo "$facet recovery not done in $MAX sec. $STATUS"
+    echo "$param recovery not done in $MAX sec. $STATUS"
     return 1
 }
 
+wait_recovery_complete () {
+    local facet=$1
+
+    # with an assumption that at_max is the same on all nodes
+    local MAX=${2:-$(max_recovery_time)}
+
+    local facets=$facet
+    if [ "$FAILURE_MODE" = HARD ]; then
+        facets=$(facets_on_host $(facet_active_host $facet))
+    fi
+    echo affected facets: $facets
+
+    # we can use "for" here because we are waiting the slowest
+    for facet in ${facets//,/ }; do
+        local var_svc=${facet}_svc
+        local param="*.${!var_svc}.recovery_status"
+
+        local host=$(facet_active_host $facet)
+        do_rpc_nodes $host _wait_recovery_complete $param $MAX
+    done
+}
+
 wait_mds_ost_sync () {
     # just because recovery is done doesn't mean we've finished
     # orphan cleanup. Wait for llogs to get synchronized.
@@ -1161,19 +1219,40 @@ client_reconnect() {
 facet_failover() {
     local facet=$1
     local sleep_time=$2
-    echo "Failing $facet on node `facet_active_host $facet`"
+    local host=$(facet_active_host $facet)
+
+    echo "Failing $facet on node $host"
+
+    local affected=$facet
+
+    if [ "$FAILURE_MODE" = HARD ]; then
+        affected=$(facets_on_host $host)
+    fi
+
     shutdown_facet $facet
+
+    echo affected facets: $affected
+
     [ -n "$sleep_time" ] && sleep $sleep_time
+
     reboot_facet $facet
     clients_up &
-    DFPID=$!
+    local dfpid=$!
     RECOVERY_START_TIME=`date +%s`
-    echo "df pid is $DFPID"
-    change_active $facet
-    local TO=`facet_active_host $facet`
-    echo "Failover $facet to $TO"
-    wait_for $facet
-    mount_facet $facet || error "Restart of $facet failed"
+    echo "df pid is $dfpid"
+
+    change_active $affected
+
+    wait_for_facet $affected
+    # start mgs first if it is affected
+    if ! combined_mgs_mds && list_member $affected mgs; then
+        mount_facet mgs || error "Restart of mgs failed"
+    fi
+    # FIXME; has to be changed to mount all facets concurrently
+    affected=$(exclude_items_from_list $affected mgs)
+    for facet in ${affected//,/ }; do
+        mount_facet $facet || error "Restart of $facet on node $host failed!"
+    done
 }
 
 obd_name() {
@@ -1334,10 +1413,16 @@ facet_active_host() {
 }
 
 change_active() {
-    local facet=$1
+    local facetlist=$1
+    local facet
+
+    facetlist=$(exclude_items_from_list $facetlist mgs)
+
+    for facet in ${facetlist//,/ }; do
     local failover=${facet}failover
-    host=`facet_host $failover`
+    local host=`facet_host $failover`
     [ -z "$host" ] && return
+
     local curactive=`facet_active $facet`
     if [ -z "${curactive}" -o "$curactive" == "$failover" ] ; then
         eval export ${facet}active=$facet
@@ -1347,6 +1432,9 @@ change_active() {
     # save the active host for this facet
     local activevar=${facet}active
     echo "$activevar=${!activevar}" > $TMP/$activevar
+    local TO=`facet_active_host $facet`
+    echo "Failover $facet to $TO"
+    done
 }
 
 do_node() {
@@ -1540,6 +1628,53 @@ combined_mgs_mds () {
     [[ $MDSDEV = $MGSDEV ]] && [[ $mds_HOST = $mgs_HOST ]]
 }
 
+mkfs_opts () {
+    local facet=$1
+
+    local tgt=$(echo $facet | tr -d [:digit:] | tr "[:lower:]" "[:upper:]")
+    local optvar=${tgt}_MKFS_OPTS
+    local opt=${!optvar}
+
+    # FIXME: ! combo mgs/mds + mgsfailover is not supported yet
+    [[ $facet = mgs ]] && echo $opt && return
+
+    # 1.
+    # --failnode options
+    local var=${facet}failover_HOST
+    if [ x"${!var}" != x ] && [ x"${!var}" != x$(facet_host $facet) ] ; then
+        local failnode=$(h2$NETTYPE ${!var})
+        failnode="--failnode=$failnode"
+        # options does not contain
+        # or contains wrong --failnode=
+        if [[ $opt != *${failnode}* ]]; then
+            opt=$(echo $opt | sed 's/--failnode=.* / /')
+            opt="$opt $failnode"
+        fi
+    fi
+
+    # 2.
+    # --mgsnode options
+    # no additional mkfs mds "--mgsnode" option for this configuration
+    if [[ $facet = mds ]] && combined_mgs_mds; then
+        echo $opt
+        return
+    fi
+
+    # additional mkfs "--mgsnode"
+    local mgsnode="--mgsnode=$MGSNID"
+    opt=${opt//$mgsnode }
+    for nid in ${MGSNID//:/ }; do
+        local mgsnode="--mgsnode=$nid"
+        # options does not contain
+        # --mgsnode=$nid
+        if [[ $opt != *${mgsnode}" "* ]]; then
+            opt="$opt --mgsnode=$nid"
+        fi
+    done
+
+    echo $opt
+}
+
 formatall() {
     [ "$FSTYPE" ] && FSTYPE_OPT="--backfstype $FSTYPE"
 
@@ -1549,20 +1684,22 @@ formatall() {
     [ "$CLIENTONLY" ] && return
     echo Formatting mgs, mds, osts
     if ! combined_mgs_mds ; then
-        add mgs $mgs_MKFS_OPTS $FSTYPE_OPT --reformat $MGSDEV || exit 10
+        add mgs $(mkfs_opts mgs) $FSTYPE_OPT --reformat $MGSDEV || exit 10
     fi
 
     if $VERBOSE; then
-        add mds $MDS_MKFS_OPTS $FSTYPE_OPT --reformat $MDSDEV || exit 10
+        add mds $(mkfs_opts mds) $FSTYPE_OPT --reformat $MDSDEV || exit 10
     else
-        add mds $MDS_MKFS_OPTS $FSTYPE_OPT --reformat $MDSDEV > /dev/null || exit 10
+        add mds $(mkfs_opts mds) $FSTYPE_OPT --reformat $MDSDEV > /dev/null || exit 10
     fi
 
+    # the ost-s could have different OST_MKFS_OPTS
+    # because of different failnode-s
     for num in `seq $OSTCOUNT`; do
         if $VERBOSE; then
-            add ost$num $OST_MKFS_OPTS $FSTYPE_OPT --reformat `ostdevname $num` || exit 10
+            add ost$num $(mkfs_opts ost${num}) $FSTYPE_OPT --reformat `ostdevname $num` || exit 10
         else
-            add ost$num $OST_MKFS_OPTS $FSTYPE_OPT --reformat `ostdevname $num` > /dev/null || exit 10
+            add ost$num $(mkfs_opts ost${num}) $FSTYPE_OPT --reformat `ostdevname $num` > /dev/null || exit 10
         fi
     done
 }
@@ -1607,7 +1744,7 @@ setupall() {
         writeconf_all
 
         if ! combined_mgs_mds ; then
-            start mgs $MGSDEV $mgs_MOUNT_OPTS
+            start mgs $MGSDEV $MGS_MOUNT_OPTS
         fi
 
         start mds $MDSDEV $MDS_MOUNT_OPTS
@@ -1692,6 +1829,8 @@ init_facets_vars () {
 
     remote_mds_nodsh || init_facet_vars mds $MDSDEV $MDS_MOUNT_OPTS
 
+    combined_mgs_mds || init_facet_vars mgs $MGSDEV $MGS_MOUNT_OPTS
+
     remote_ost_nodsh && return
 
     for num in `seq $OSTCOUNT`; do
@@ -2076,6 +2215,12 @@ comma_list() {
     echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g'
 }
 
+list_member () {
+    local list=$1
+    local item=$2
+    echo $list | grep -qw $item
+}
+
 # list, excluded are the comma separated lists
 exclude_items_from_list () {
     local list=$1
@@ -2124,19 +2269,22 @@ absolute_path() {
 }
 
 get_facets () {
-    local name=$(echo $1 | tr "[:upper:]" "[:lower:]")
-    local type=$(echo $1 | tr "[:lower:]" "[:upper:]")
+    local types=${1:-"OST MDS MGS"}
     local list=""
-
-    case $type in
-        MDS )   list=mds;;
-        OST )   for ((i=1; i<=$OSTCOUNT; i++)) do
-                    list="$list ${name}$i"
-                done;;
-        * )     error "Invalid facet type"
+    for entry in $types; do
+        local name=$(echo $entry | tr "[:upper:]" "[:lower:]")
+        local type=$(echo $entry | tr "[:lower:]" "[:upper:]")
+
+        case $type in
+            MDS|MGS ) list="$list $name";;
+            OST ) for ((i=1; i<=$OSTCOUNT; i++)) do
+                      list="$list ${name}$i"
+                  done;;
+            * ) error "Invalid facet type"
                 exit 1;;
-    esac
+        esac
+    done
 
     echo $(comma_list $list)
 }
@@ -2644,19 +2792,30 @@ local_mode ()
     $(single_local_node $(comma_list $(nodes_list)))
 }
 
-osts_nodes () {
-    local OSTNODES=$(facet_host ost1)
+facets_nodes () {
+    local facets=$1
+    local nodes
     local NODES_sort
 
-    for num in `seq $OSTCOUNT`; do
-        local myOST=$(facet_host ost$num)
-        OSTNODES="$OSTNODES $myOST"
+    for facet in ${facets//,/ }; do
+        if [ "$FAILURE_MODE" = HARD ]; then
+            nodes="$nodes $(facet_active_host $facet)"
+        else
+            nodes="$nodes $(facet_host $facet)"
+        fi
     done
-    NODES_sort=$(for i in $OSTNODES; do echo $i; done | sort -u)
+    NODES_sort=$(for i in $nodes; do echo $i; done | sort -u)
 
     echo $NODES_sort
 }
 
+osts_nodes () {
+    local facets=$(get_facets OST)
+    local nodes=$(facets_nodes $facets)
+
+    echo $nodes
+}
+
 nodes_list () {
     # FIXME. We need a list of clients
     local myNODES=$HOSTNAME
@@ -2666,7 +2825,7 @@ nodes_list () {
     [ -n "$CLIENTS" ] && myNODES=${CLIENTS//,/ }
 
     if [ "$PDSH" -a "$PDSH" != "no_dsh" ]; then
-        myNODES="$myNODES $(osts_nodes) $mds_HOST"
+        myNODES="$myNODES $(facets_nodes $(get_facets))"
     fi
 
     myNODES_sort=$(for i in $myNODES; do echo $i; done | sort -u)
@@ -3040,7 +3199,7 @@ convert_facet2label() {
 get_clientosc_proc_path() {
     local ost=$1
 
-    echo "{$1}-osc-*"
+    echo "${1}-osc-*"
 }
 
 get_osc_import_name() {
@@ -3057,33 +3216,37 @@ get_osc_import_name() {
     return 0
 }
 
-wait_import_state () {
+_wait_import_state () {
     local expected=$1
     local CONN_PROC=$2
+    local maxtime=${3:-max_recovery_time}
     local CONN_STATE
     local i=0
 
     CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2)
     while [ "${CONN_STATE}" != "${expected}" ]; do
-        if [ "${expected}" == "DISCONN" ]; then
-            # for disconn we can check after proc entry is removed
-            [ "x${CONN_STATE}" == "x" ] && return 0
-            # with AT we can have connect request timeout ~ reconnect timeout
-            # and test can't see real disconnect
-            [ "${CONN_STATE}" == "CONNECTING" ] && return 0
-        fi
-        # disconnect rpc should be wait not more obd_timeout
-        [ $i -ge $(($TIMEOUT * 3 / 2)) ] && \
-            error "can't put import for $CONN_PROC into ${expected} state" && return 1
+        [ $i -ge $maxtime ] && \
+            error "can't put import for $CONN_PROC into ${expected} state after $i sec, have ${CONN_STATE}" && \
+            return 1
         sleep 1
        CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2)
        i=$(($i + 1))
     done
 
-    log "$CONN_PROC now in ${CONN_STATE} state"
+    log "$CONN_PROC in ${CONN_STATE} state after $i sec"
     return 0
 }
 
+wait_import_state() {
+    local state=$1
+    local params=$2
+    local maxtime=${3:-max_recovery_time}
+    local param
+
+    for param in ${params//,/ }; do
+        _wait_import_state $state $param $maxtime || return
+    done
+}
 wait_osc_import_state() {
     local facet=$1
     local ost_facet=$2
@@ -3130,8 +3293,14 @@ wait_clients_import_state () {
     local list=$1
     local facet=$2
     local expected=$3
-    shift
 
+    local facets=$facet
+
+    if [ "$FAILURE_MODE" = HARD ]; then
+        facets=$(facets_on_host $(facet_active_host $facet))
+    fi
+
+    for facet in ${facets//,/ }; do
     local label=$(convert_facet2label $facet)
     local proc_path
     case $facet in
@@ -3139,8 +3308,10 @@ wait_clients_import_state () {
         mds* ) proc_path="mdc.$(get_clientmdc_proc_path $label).mds_server_uuid" ;;
         *) error "unknown facet!" ;;
     esac
+    local params=$(expand_list $params $proc_path)
+    done
 
-    if ! do_rpc_nodes $list wait_import_state $expected $proc_path; then
+    if ! do_rpc_nodes $list wait_import_state $expected $params; then
         error "import is not in ${expected} state"
         return 1
     fi