From: grev
Date: Tue, 24 Feb 2009 19:54:59 +0000 (+0000)
Subject: b=17839
X-Git-Tag: v1_9_162~21
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=852f34ac50727d3a012b9b325f9614b2b4fa7db7

b=17839
i=Brian

cmd3-17 port to acc-sm
---

diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am
index fae5f37..884d9fe 100644
--- a/lustre/tests/Makefile.am
+++ b/lustre/tests/Makefile.am
@@ -18,7 +18,7 @@ noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh
 noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause
 noinst_SCRIPTS += sanity-sec.sh sanity-gss.sh krb5_login.sh setup_kerberos.sh
 noinst_SCRIPTS += recovery-mds-scale.sh run_dd.sh run_tar.sh run_iozone.sh
-noinst_SCRIPTS += run_dbench.sh
+noinst_SCRIPTS += run_dbench.sh recovery-double-scale.sh
 nobase_noinst_SCRIPTS = cfg/local.sh
 nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh
 nobase_noinst_SCRIPTS += racer/dir_create.sh racer/file_create.sh racer/file_list.sh
diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh
index b21af68..710e6f0 100755
--- a/lustre/tests/acceptance-small.sh
+++ b/lustre/tests/acceptance-small.sh
@@ -23,7 +23,7 @@ fi
 [ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\""
 [ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484"
 
-export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY RECOVERY_MDS_SCALE"
+export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY RECOVERY_MDS_SCALE RECOVERY_DOUBLE_SCALE"
 
 if [ "$ACC_SM_ONLY" ]; then
     for O in $TESTSUITE_LIST; do
@@ -440,6 +440,14 @@ if [ "$RECOVERY_MDS_SCALE" != "no" ]; then
 	RECOVERY_MDS_SCALE="done"
 fi
 
+[ "$RECOVERY_DOUBLE_SCALE" != "no" ] && skip_remmds recovery-double-scale && RECOVERY_DOUBLE_SCALE=no && MSKIPPED=1
+[ "$RECOVERY_DOUBLE_SCALE" != "no" ] && skip_remost recovery-double-scale && RECOVERY_DOUBLE_SCALE=no && OSKIPPED=1
+if [ "$RECOVERY_DOUBLE_SCALE" != "no" ]; then
+	title recovery-double-scale
+	bash recovery-double-scale.sh
+	RECOVERY_DOUBLE_SCALE="done"
+fi
+
 RC=$?
 title FINISHED
 echo "Finished at `date` in $((`date +%s` - $STARTTIME))s"
diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh
index ebf5cb8..06b5ca3 100755
--- a/lustre/tests/insanity.sh
+++ b/lustre/tests/insanity.sh
@@ -60,19 +60,6 @@ set_fail_client() {
     echo "fail $FAIL_CLIENT, next is $FAIL_NEXT"
 }
 
-shutdown_client() {
-    client=$1
-    if [ "$FAILURE_MODE" = HARD ]; then
-        $POWER_DOWN $client
-        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
-            echo "waiting for node $client to fail"
-            sleep 1
-        done
-    elif [ "$FAILURE_MODE" = SOFT ]; then
-        zconf_umount $client $MOUNT -f
-    fi
-}
-
 fail_clients() {
     num=$1
diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh
new file mode 100644
index 0000000..d98dc65
--- /dev/null
+++ b/lustre/tests/recovery-double-scale.sh
@@ -0,0 +1,314 @@
+#!/bin/bash
+
+# All pairwise combinations of node failures.
+# Was cmd3-17
+#
+# Author: Chris Cooper
+#
+# Script fails a pair of nodes:
+# -- in parallel by default
+# -- in series if SERIAL is set
+
+LUSTRE=${LUSTRE:-`dirname $0`/..}
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
+. $LUSTRE/tests/test-framework.sh
+
+init_test_env $@
+
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+TESTSUITELOG=${TESTSUITELOG:-$TMP/recovery-double-scale}
+DEBUGLOG=$TESTSUITELOG.debug
+exec 2>$DEBUGLOG
+echo "--- env ---" >&2
+env >&2
+echo "--- env ---" >&2
+set -x
+
+[ -n "$CLIENTS" ] || { skip "$0 Need two or more remote clients" && exit 0; }
+[ $CLIENTCOUNT -ge 3 ] || \
+    { skip "$0 Need two or more remote clients, have $CLIENTCOUNT" && exit 0; }
+
+END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
+LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+
+remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0
+remote_ost_nodsh && skip "remote OST with nodsh" && exit 0
+
+check_timeout || exit 1
+
+build_test_filter
+
+check_and_setup_lustre
+rm -rf $DIR/[df][0-9]*
+
+# The test node needs to be insulated from a Lustre failure as much as
+# possible, so not even loading the Lustre modules is ideal.
+# -- umount lustre
+# -- remove hostname from clients list
+zconf_umount $(hostname) $MOUNT
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
+
+check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
+
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
+
+rm -f $END_RUN_FILE
+
+reboot_recover_node () {
+    # item var contains a pair of clients if nodetype=clients
+    # I would prefer to have a list here
+    local item=$1
+    local nodetype=$2
+    local timeout=$($LCTL get_param -n timeout)
+
+    # MDS, OST item contains the facet
+    case $nodetype in
+        MDS|OST ) facet_failover $item
+                  [ "$SERIAL" ] && wait_recovery_complete $item $((timeout * 4)) || true
+                  ;;
+        clients ) for c in ${item//,/ }; do
+                      shutdown_client $c
+                      boot_node $c
+                  done
+                  start_client_loads $item || return $?
+                  ;;
+        * )       error "reboot_recover_node: nodetype=$nodetype. Must be one of 'MDS', 'OST', or 'clients'."
+                  exit 1;;
+    esac
+}
+
+get_item_type () {
+    local type=$1
+    local excluded=${2:-""}
+
+    local list
+    case $type in
+        MDS )     list=$MDTS;;
+        OST )     list=$OSTS;;
+        clients ) list=$NODES_TO_USE
+                  ;;
+        * )       error "Invalid type=$type. Must be one of 'MDS', 'OST', or 'clients'."
+                  exit 1;;
+    esac
+
+    [ "$excluded" ] && list=$(exclude_items_from_list $list $excluded)
+    # empty list
+    if [ ! "$(echo $list)" ]; then
+        echo
+        return
+    fi
+
+    item=$(get_random_entry $list)
+    if [ "$type" = clients ] ; then
+        item="$item $(get_random_entry $(exclude_items_from_list $list $item))"
+        item=$(comma_list $item)
+    fi
+    echo $item
+}
+
+# failover_pair
+#
+# For the two node types specified, chooses a random node (or pair of
+# client nodes) from each class, fails the chosen nodes in turn, and
+# then restarts Lustre on them.
+failover_pair() {
+    local type1=$1
+    local type2=$2
+    local title=$3
+
+    local client_nodes=""
+    local item1=
+    local item2=
+    local client1=
+    local client2=
+
+    log "
+==== START === $title "
+
+    item1=$(get_item_type $type1)
+    [ "$item1" ] || \
+        { echo "type1=$type1 item1 is empty" && return 0; }
+    item2=$(get_item_type $type2 $item1)
+    [ "$item2" ] || \
+        { echo "type1=$type1 item1=$item1 type2=$type2 item2=$item2 is empty" && return 0; }
+
+    # Check that our client loads are still running. If any have died,
+    # that means they have died outside of recovery, which is unacceptable.
+    log "==== Checking the client loads BEFORE failover -- failure NOT OK"
+
+    # FIXME: need to print a summary on exit
+    if ! check_client_loads $NODES_TO_USE; then
+        exit 4
+    fi
+
+    log "Done checking client loads. Failing type1=$type1 item1=$item1 ... "
+
+    reboot_recover_node $item1 $type1 || return $?
+
+    # Hendrix test17 description:
+    # Introduce a failure, wait at least 5 minutes (for recovery),
+    # introduce a second failure, and wait another 5 minutes.
+
+    # reboot_recover_node waits for recovery according to the SERIAL
+    # setting. If SERIAL is not set we have a true "double failure",
+    # so no sleep is needed between the failures.
+
+    log " Failing type2=$type2 item2=$item2 ... "
+    reboot_recover_node $item2 $type2 || return $?
+
+    # Client loads are allowed to die while in recovery, so we just
+    # restart them.
+    log "==== Checking the client loads AFTER failovers -- ERRORS_OK=$ERRORS_OK"
+    restart_client_loads $NODES_TO_USE $ERRORS_OK || return $?
+    log "Done checking / restarting client loads. PASS"
+    return 0
+}
+
+summary_and_cleanup () {
+    local rc=$?
+    trap 0
+
+    # A non-empty END_RUN_FILE means that some client loads failed.
+    if [ -s $END_RUN_FILE ]; then
+        echo "Found the END_RUN_FILE file: $END_RUN_FILE"
+        cat $END_RUN_FILE
+        local END_RUN_NODE=
+        read END_RUN_NODE < $END_RUN_FILE
+
+        # A client load will end (i.e. fail) if it finds the end-run
+        # file. That does not mean the load on that client actually
+        # failed, though; the first node recorded in END_RUN_FILE is
+        # the one we are really interested in.
+        if [ -n "$END_RUN_NODE" ]; then
+            var=${END_RUN_NODE}_load
+            echo "Client load failed on node $END_RUN_NODE"
+            echo
+            echo "client $END_RUN_NODE load debug output :"
+            local logfile=${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}.debug
+            do_node ${END_RUN_NODE} "set -x; [ -e $logfile ] && cat $logfile " || true
+        fi
+        rc=1
+    fi
+
+    echo $(date +'%F %H:%M:%S') Terminating client loads ...
+    echo "$0" >> $END_RUN_FILE
+    local result=PASS
+    [ $rc -eq 0 ] || result=FAIL
+
+    log "
+Server failover period: $FAILOVER_PERIOD seconds
+Exited after:           $ELAPSED seconds
+Status: $result: rc=$rc"
+
+    # make sure the client loads die
+    do_nodes $NODES_TO_USE "set -x; test -f $TMP/client-load.pid && \
+        { kill -s TERM \$(cat $TMP/client-load.pid) || true; }"
+
+    # and free up the pdshes that started them, if any are still around
+    if [ -n "$CLIENT_LOAD_PIDS" ]; then
+        kill $CLIENT_LOAD_PIDS || true
+        sleep 5
+        kill -9 $CLIENT_LOAD_PIDS || true
+    fi
+    [ $rc -eq 0 ] && zconf_mount $(hostname) $MOUNT
+    exit $rc
+}
+
+trap summary_and_cleanup EXIT TERM INT
+
+#
+# MAIN
+#
+log "-----============= $0 starting =============-----"
+
+START_TS=$(date +%s)
+CURRENT_TS=$START_TS
+ELAPSED=0
+
+# Set SERIAL to serialize the failures: wait for recovery from the
+# first failure before introducing the second.
+SERIAL=${SERIAL:-""}
+ERRORS_OK="yes"
+
+[ "$SERIAL" ] && ERRORS_OK=""
+
+FAILOVER_PERIOD=${FAILOVER_PERIOD:-$((60 * 5))} # 5 minutes
+
+# Start client loads.
+start_client_loads $NODES_TO_USE
+echo client load pids:
+if ! do_nodes $NODES_TO_USE "set -x; echo \$(hostname): && cat $TMP/client-load.pid"; then
+    if [ -e $DEBUGLOG ]; then
+        exec 2<&-
+        cat $DEBUGLOG
+        exit 3
+    fi
+fi
+
+# FIXME: Do we want to have an initial sleep period where the clients
+# just run before introducing a failure?
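+
+# The sequence below walks the pairwise failure matrix (tests 1-9):
+# each failover_pair call picks random victims of the two given node
+# types, fails them back to back (or serially, if SERIAL is set), and
+# then verifies that the surviving client loads are still running.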
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.1
+failover_pair MDS OST     "test 1: failover MDS, then OST =========="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.2
+failover_pair MDS clients "test 2: failover MDS, then 2 clients ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.3
+if [ $MDSCOUNT -gt 1 ]; then
+    failover_pair MDS MDS "test 3: failover MDS, then another MDS =="
+    sleep $FAILOVER_PERIOD
+else
+    skip "$0 : $MDSCOUNT < 2 MDTs, test 3 skipped"
+fi
+
+#CMD_TEST_NUM=17.4
+if [ $OSTCOUNT -gt 1 ]; then
+    failover_pair OST OST "test 4: failover OST, then another OST =="
+    sleep $FAILOVER_PERIOD
+else
+    skip "$0 : $OSTCOUNT < 2 OSTs, test 4 skipped"
+fi
+
+#CMD_TEST_NUM=17.5
+failover_pair OST clients "test 5: failover OST, then 2 clients ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.6
+failover_pair OST MDS     "test 6: failover OST, then MDS =========="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.7
+failover_pair clients MDS "test 7: failover 2 clients, then MDS ===="
+sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.8
+#failover_pair clients OST "test 8: failover 2 clients, then OST ===="
+#sleep $FAILOVER_PERIOD
+
+#CMD_TEST_NUM=17.9
+if [ $CLIENTCOUNT -ge 5 ]; then
+    failover_pair clients clients "test 9: failover 2 clients, then 2 different clients =="
+    sleep $FAILOVER_PERIOD
+fi
+
+log "==== Checking the client loads AFTER all failovers -- failure NOT OK"
+if ! check_client_loads $NODES_TO_USE; then
+    log "Client load failed after failover. Exiting"
+    exit 5
+fi
+
+CURRENT_TS=$(date +%s)
+ELAPSED=$((CURRENT_TS - START_TS))
+
+log "Completed successfully in $ELAPSED seconds"
+
+exit 0
diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh
index 4d6bb7c..598620b 100644
--- a/lustre/tests/recovery-mds-scale.sh
+++ b/lustre/tests/recovery-mds-scale.sh
@@ -47,21 +47,12 @@ rm -rf $DIR/[df][0-9]*
 # -- remove hostname from clients list
 zconf_umount $(hostname) $MOUNT
 NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
-NODES_TO_USE=$(exclude_item_from_list $NODES_TO_USE $(hostname))
+NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $(hostname))
 
 check_progs_installed $NODES_TO_USE ${CLIENT_LOADS[@]}
 
-MDTS=""
-for ((i=1; i<=$MDSCOUNT; i++)) do
-    MDTS="$MDTS mds$i"
-done
-MDTS=$(comma_list $MDTS)
-
-OSTS=""
-for ((i=1; i<=$OSTCOUNT; i++)) do
-    OSTS="$OSTS ost$i"
-done
-OSTS=$(comma_list $OSTS)
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
 
 ERRORS_OK=""    # No application failures should occur during this test.
 FLAVOR=${FLAVOR:-"MDS"}
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index 54f5305..c6428c7 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -436,21 +436,6 @@ test_21a() {
 }
 run_test 21a "commit on sharing"
 
-shutdown_client() {
-    local client=$1
-    local mnt=$2
-
-    if [ "$FAILURE_MODE" = HARD ]; then
-        $POWER_DOWN $client
-        while ping -w 3 -c 1 $client > /dev/null 2>&1; do
-            echo "waiting for node $client to fail"
-            sleep 1
-        done
-    else
-        zconf_umount_clients $client $mnt -f
-    fi
-}
-
 test_21b_sub () {
     local mds=$1
     do_node $CLIENT1 rm -f $MOUNT1/$tfile-*
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 46bf4e7..5b618e0 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -475,7 +475,7 @@ test_20b() { # bug 10480
 
     fail $SINGLEMDS                    # start orphan recovery
     df -P $DIR || df -P $DIR || true   # reconnect
-    wait_mds_recovery_done || error "MDS recovery not done"
+    wait_recovery_complete $SINGLEMDS || error "MDS recovery not done"
 
     # FIXME just because recovery is done doesn't mean we've finished
     # orphan cleanup. Fake it with a sleep for now...
diff --git a/lustre/tests/run_dbench.sh b/lustre/tests/run_dbench.sh
index f82d9dd..45cfceb 100755
--- a/lustre/tests/run_dbench.sh
+++ b/lustre/tests/run_dbench.sh
@@ -33,7 +33,7 @@ trap signaled TERM
 # recovery-mds-scale uses this to signal the client loads to die
 echo $$ >$LOAD_PID_FILE
 
-TESTDIR=$MOUNT/dbench-$(hostname)
+TESTDIR=$MOUNT/d0.dbench-$(hostname)
 
 CONTINUE=true
diff --git a/lustre/tests/run_dd.sh b/lustre/tests/run_dd.sh
index 96a4950..f4f1a54 100755
--- a/lustre/tests/run_dd.sh
+++ b/lustre/tests/run_dd.sh
@@ -31,7 +31,7 @@ trap signaled TERM
 # recovery-mds-scale uses this to signal the client loads to die
 echo $$ >$LOAD_PID_FILE
 
-TESTDIR=$MOUNT/dd-$(hostname)
+TESTDIR=$MOUNT/d0.dd-$(hostname)
 
 CONTINUE=true
 while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
diff --git a/lustre/tests/run_iozone.sh b/lustre/tests/run_iozone.sh
index 2b71118..2d075d7 100755
--- a/lustre/tests/run_iozone.sh
+++ b/lustre/tests/run_iozone.sh
@@ -31,7 +31,7 @@ trap signaled TERM
 # recovery-mds-scale uses this to signal the client loads to die
 echo $$ >$LOAD_PID_FILE
 
-TESTDIR=$MOUNT/iozone-$(hostname)
+TESTDIR=$MOUNT/d0.iozone-$(hostname)
 
 # needed to debug oom problem
 #echo 1 > /proc/sys/vm/vm_gfp_debug
diff --git a/lustre/tests/run_tar.sh b/lustre/tests/run_tar.sh
index 7502c241..5f40e68 100755
--- a/lustre/tests/run_tar.sh
+++ b/lustre/tests/run_tar.sh
@@ -31,7 +31,7 @@ trap signaled TERM
 # recovery-mds-scale uses this to signal the client loads to die
 echo $$ >$LOAD_PID_FILE
 
-TESTDIR=$MOUNT/tar-$(hostname)
+TESTDIR=$MOUNT/d0.tar-$(hostname)
 
 CONTINUE=true
 while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 88dadb1..e081f8d 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -622,11 +622,36 @@ fi
 fi"
 }
 
+shutdown_node_hard () {
+    local host=$1
+    local attempts=3
+
+    for i in $(seq $attempts) ; do
+        $POWER_DOWN $host
+        sleep 1
+        ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0
+        echo "waiting for $host to fail, attempt $i of $attempts"
+        [ $i -lt $attempts ] || \
+            { echo "$host still pingable after power down! attempts=$attempts" && return 1; }
+    done
+}
+
+shutdown_client() {
+    local client=$1
+    local mnt=${2:-$MOUNT}
+    local attempts=3
+
+    if [ "$FAILURE_MODE" = HARD ]; then
+        shutdown_node_hard $client
+    else
+        zconf_umount_clients $client $mnt -f
+    fi
+}
+
 shutdown_facet() {
     local facet=$1
     if [ "$FAILURE_MODE" = HARD ]; then
-        $POWER_DOWN `facet_active_host $facet`
-        sleep 2
+        shutdown_node_hard $(facet_active_host $facet)
     elif [ "$FAILURE_MODE" = SOFT ]; then
         stop $facet
     fi
@@ -661,30 +686,30 @@ check_progs_installed () {
 }
 
 start_client_load() {
-    local list=(${1//,/ })
-    local nodenum=$2
-
-    local numloads=${#CLIENT_LOADS[@]}
-    local testnum=$((nodenum % numloads))
+    local client=$1
+    local var=${client}_load
 
-    do_node ${list[nodenum]} "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
+    do_node $client "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
 BREAK_ON_ERROR=$BREAK_ON_ERROR \
 END_RUN_FILE=$END_RUN_FILE \
 LOAD_PID_FILE=$LOAD_PID_FILE \
 TESTSUITELOG=$TESTSUITELOG \
-run_${CLIENT_LOADS[testnum]}.sh" &
+run_${!var}.sh" &
     CLIENT_LOAD_PIDS="$CLIENT_LOAD_PIDS $!"
-    log "Started client load: ${CLIENT_LOADS[testnum]} on ${list[nodenum]}"
+    log "Started client load: ${!var} on $client"
 
-    eval export ${list[nodenum]}_load=${CLIENT_LOADS[testnum]}
     return 0
 }
 
 start_client_loads () {
     local clients=(${1//,/ })
+    local numloads=${#CLIENT_LOADS[@]}
+    local testnum
 
-    for ((num=0; num < ${#clients[@]}; num++ )); do
-        start_client_load $1 $num
+    for ((nodenum=0; nodenum < ${#clients[@]}; nodenum++ )); do
+        testnum=$((nodenum % numloads))
+        eval export ${clients[nodenum]}_load=${CLIENT_LOADS[testnum]}
+        start_client_load ${clients[nodenum]}
     done
 }
 
@@ -725,13 +750,39 @@ check_client_loads () {
 
     for client in $clients; do
         check_client_load $client
-        rc=$?
+        rc=${PIPESTATUS[0]}
        if [ "$rc" != 0 ]; then
          log "Client load failed on node $client, rc=$rc"
          return $rc
        fi
     done
 }
+
+restart_client_loads () {
+    local clients=${1//,/ }
+    local expectedfail=${2:-""}
+    local client=
+    local rc=0
+
+    for client in $clients; do
+        check_client_load $client
+        rc=${PIPESTATUS[0]}
+        if [ "$rc" != 0 -a "$expectedfail" ]; then
+            start_client_load $client
+            echo "Restarted client load: on $client. Checking ..."
+            check_client_load $client
+            rc=${PIPESTATUS[0]}
+            if [ "$rc" != 0 ]; then
+                log "Client load failed to restart on node $client, rc=$rc"
+                # the failure of one client load means the test fails;
+                # we do not need to check the others
+                return $rc
+            fi
+        elif [ "$rc" != 0 ]; then
+            # load died and failures were not expected
+            return $rc
+        fi
+    done
+}
 # End recovery-scale functions
 
 # verify that lustre actually cleaned up properly
@@ -805,32 +856,39 @@ wait_delete_completed () {
 }
 
 wait_for_host() {
-    local HOST=$1
-    check_network "$HOST" 900
-    while ! do_node $HOST "ls -d $LUSTRE " > /dev/null; do sleep 5; done
+    local host=$1
+    check_network "$host" 900
+    while ! do_node $host "ls -d $LUSTRE " > /dev/null; do sleep 5; done
 }
 
 wait_for() {
     local facet=$1
-    local HOST=`facet_active_host $facet`
-    wait_for_host $HOST
+    local host=`facet_active_host $facet`
+    wait_for_host $host
 }
 
-wait_mds_recovery_done () {
-    local timeout=`do_facet $SINGLEMDS lctl get_param -n timeout`
-#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
-# as we are in process of changing obd_timeout in different ways
-# let's set MAX longer than that
-    local MAX=$(( timeout * 4 ))
+wait_recovery_complete () {
+    local facet=$1
+
+    # Use the default policy if $2 is not passed by the caller.
+    #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2)
+    # As we are in the process of changing obd_timeout in different
+    # ways, let's set MAX longer than that.
+    local MAX=${2:-$(( TIMEOUT * 4 ))}
+
+    local var_svc=${facet}_svc
+    local procfile="*.${!var_svc}.recovery_status"
     local WAIT=0
+    local STATUS=
+
     while [ $WAIT -lt $MAX ]; do
-        STATUS=`do_facet $SINGLEMDS "lctl get_param -n mdt.*-MDT0000.recovery_status | grep status"`
-        echo $STATUS | grep COMPLETE && return 0
+        STATUS=$(do_facet $facet lctl get_param -n $procfile | grep status)
+        [[ $STATUS = "status: COMPLETE" ]] && return 0
         sleep 5
         WAIT=$((WAIT + 5))
-        echo "Waiting $(($MAX - $WAIT)) secs for MDS recovery done"
+        echo "Waiting $((MAX - WAIT)) secs for $facet recovery done. $STATUS"
     done
-    echo "MDS recovery not done in $MAX sec"
+    echo "$facet recovery not done in $MAX sec. $STATUS"
     return 1
 }
 
@@ -919,7 +977,7 @@ facet_failover() {
     DFPID=$!
     echo "df pid is $DFPID"
     change_active $facet
-    TO=`facet_active_host $facet`
+    local TO=`facet_active_host $facet`
     echo "Failover $facet to $TO"
     wait_for $facet
     mount_facet $facet || error "Restart of $facet failed"
@@ -1560,13 +1618,16 @@ comma_list() {
     echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g'
 }
 
-# list is a comma-separated list
-exclude_item_from_list () {
+# list and excluded are comma-separated lists
+exclude_items_from_list () {
     local list=$1
     local excluded=$2
+    local item
 
     list=${list//,/ }
-    list=$(echo " $list " | sed -re "s/\s+$excluded\s+/ /g")
+    for item in ${excluded//,/ }; do
+        list=$(echo " $list " | sed -re "s/\s+$item\s+/ /g")
+    done
     echo $(comma_list $list)
 }
 
@@ -1574,6 +1635,18 @@ absolute_path() {
     (cd `dirname $1`; echo $PWD/`basename $1`)
 }
 
+get_facets () {
+    local name=$(echo $1 | tr "[:upper:]" "[:lower:]")
+    local type=$(echo $1 | tr "[:lower:]" "[:upper:]")
+
+    local list=""
+    local count=${type}COUNT
+    for ((i=1; i <= ${!count}; i++)); do
+        list="$list ${name}$i"
+    done
+    echo $(comma_list $list)
+}
+
 ##################################
 # Adaptive Timeouts funcs
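
A minimal usage sketch of the generalized helpers above (illustrative only, not part of the patch), assuming a configured test-framework environment where MDSCOUNT=2, OSTCOUNT=2, and the per-facet *_svc variables are set:

    . $LUSTRE/tests/test-framework.sh

    MDTS=$(get_facets MDS)                           # -> "mds1,mds2"
    OSTS=$(get_facets OST)                           # -> "ost1,ost2"

    # Drop ost1 before picking a second victim from the same class.
    remaining=$(exclude_items_from_list $OSTS ost1)  # -> "ost2"

    # Wait for ost2 to finish recovery; MAX defaults to 4 * TIMEOUT
    # when no second argument is passed.
    wait_recovery_complete ost2 || echo "ost2 recovery not done"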