From: Elena Gryaznova
Date: Tue, 24 Aug 2010 19:52:29 +0000 (+0400)
Subject: b=20407 TF: "HARD" failovers with multiple targets per server
X-Git-Tag: 2.0.51.0~36
X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=66572cdeaefae2bcc7a3043a9b8de6ab7c37a642

b=20407 TF: "HARD" failovers with multiple targets per server
i=Brian.Murrell
i=Li.Wei
---

diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh
index 4bee2a5..298716a 100644
--- a/lustre/tests/cfg/local.sh
+++ b/lustre/tests/cfg/local.sh
@@ -75,15 +75,13 @@ MKFSOPT=""
 [ "x$L_GETIDENTITY" != "x" ] &&
         MDSOPT=$MDSOPT" --param mdt.identity_upcall=$L_GETIDENTITY"
-MDSn_MKFS_OPTS=$MDS_MKFS_OPTS
 MDS_MKFS_OPTS="--mdt --fsname=$FSNAME --device-size=$MDSSIZE --param sys.timeout=$TIMEOUT $MKFSOPT $MDSOPT $MDS_MKFS_OPTS"
 
 if [[ $mds1_HOST == $mgs_HOST ]] && [[ $MDSDEV1 == $MGSDEV ]]; then
     MDS_MKFS_OPTS="--mgs $MDS_MKFS_OPTS"
 else
     MDS_MKFS_OPTS="--mgsnode=$MGSNID $MDS_MKFS_OPTS"
-    mgs_MKFS_OPTS="--mgs --device-size=$MGSSIZE"
+    MGS_MKFS_OPTS="--mgs --device-size=$MGSSIZE"
 fi
-MDSn_MKFS_OPTS="--mgsnode=$MGSNID --mdt --fsname=$FSNAME --device-size=$MDSSIZE --param sys.timeout=$TIMEOUT $MKFSOPT $MDSOPT $MDSn_MKFS_OPTS"
 
 MKFSOPT=""
 [ "x$OSTJOURNALSIZE" != "x" ] &&
@@ -100,7 +98,7 @@ OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID -
 
 MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:-"-o loop,user_xattr,acl"}
 OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-"-o loop"}
-mgs_MOUNT_OPTS=${mgs_MOUNT_OPTS:-$MDS_MOUNT_OPTS}
+MGS_MOUNT_OPTS=${MGS_MOUNT_OPTS:-$MDS_MOUNT_OPTS}
 
 #client
 MOUNT=${MOUNT:-/mnt/${FSNAME}}
diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh
index 1e7c8a6..f4d8546 100644
--- a/lustre/tests/conf-sanity.sh
+++ b/lustre/tests/conf-sanity.sh
@@ -16,6 +16,12 @@ ONLY=${ONLY:-"$*"}
 ALWAYS_EXCEPT="$CONF_SANITY_EXCEPT"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
 
+if [ "$FAILURE_MODE" = "HARD" ]; then
+    CONFIG_EXCEPTIONS="24a " && \
+    echo "Except the tests: $CONFIG_EXCEPTIONS for FAILURE_MODE=$FAILURE_MODE, bug 23573" && \
+    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
+fi
+
 SRCDIR=`dirname $0`
 PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
 
@@ -94,7 +100,7 @@ reformat_and_config() {
 
 start_mgs () {
     echo "start mgs"
-    start mgs $MGSDEV $mgs_MOUNT_OPTS
+    start mgs $MGSDEV $MGS_MOUNT_OPTS
 }
 
 start_mds() {
diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh
index 497887f..6ceecf8 100755
--- a/lustre/tests/insanity.sh
+++ b/lustre/tests/insanity.sh
@@ -14,10 +14,8 @@ init_logging
 ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
 
 if [ "$FAILURE_MODE" = "HARD" ]; then
-    mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \
-        echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " && \
-        echo "Except the tests: $CONFIG_EXCEPTIONS" && \
-        ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
+    skip_env "$0: is not functional with FAILURE_MODE = HARD, please use recovery-double-scale, bz20407"
+    exit 0
 fi
 
 #
@@ -191,10 +189,10 @@ test_2() {
     echo "Reintegrating OST"
     reboot_facet ost1
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1 || return 2
 
-    wait_for $SINGLEMDS
+    wait_for_facet $SINGLEMDS
     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
 
     #Check FS
@@ -267,10 +265,10 @@ test_4() {
     #Reintegration
     echo "Reintegrating OST"
     reboot_facet ost1
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
-    
+
-    wait_for $SINGLEMDS
+    wait_for_facet $SINGLEMDS
     start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
 
     #Check FS
@@ -317,9 +315,9 @@ test_5() {
 
     #Reintegration
     echo "Reintegrating OSTs"
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
-    wait_for ost2
+    wait_for_facet ost2
     start_ost 2
 
     clients_recover_osts ost1
@@ -368,7 +366,7 @@ test_6() {
 
     #Reintegration
     echo "Reintegrating OST/CLIENTs"
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
     reintegrate_clients || return 1
     sleep 5
@@ -485,7 +483,7 @@ test_8() {
     #Reintegration
     echo "Reintegrating CLIENTs/OST"
     reintegrate_clients || return 3
-    wait_for ost1
+    wait_for_facet ost1
     start_ost 1
     wait $DFPID
     clients_up || return 1
diff --git a/lustre/tests/mmp.sh b/lustre/tests/mmp.sh
index 2ac19df..fe0eafe 100755
--- a/lustre/tests/mmp.sh
+++ b/lustre/tests/mmp.sh
@@ -348,7 +348,7 @@ mount_after_reboot() {
     if [ "$FAILURE_MODE" = "HARD" ]; then
         shutdown_facet $facet
         reboot_facet $facet
-        wait_for $facet
+        wait_for_facet $facet
     else
         replay_barrier_nodf $facet
     fi
diff --git a/lustre/tests/recovery-double-scale.sh b/lustre/tests/recovery-double-scale.sh
index 4c0ea6f..bbf6534 100644
--- a/lustre/tests/recovery-double-scale.sh
+++ b/lustre/tests/recovery-double-scale.sh
@@ -77,7 +77,7 @@ reboot_recover_node () {
     # MDS, OST item contains the facet
     case $nodetype in
         MDS|OST )   facet_failover $item
-                    [ "$SERIAL" ] && wait_recovery_complete $item $((timeout * 4)) || true
+                    [ "$SERIAL" ] && wait_recovery_complete $item || true
                     ;;
         clients)    for c in ${item//,/ }; do
                         # make sure the client loads die
diff --git a/lustre/tests/recovery-mds-scale.sh b/lustre/tests/recovery-mds-scale.sh
index 2d9acb7..a4c874f 100644
--- a/lustre/tests/recovery-mds-scale.sh
+++ b/lustre/tests/recovery-mds-scale.sh
@@ -126,7 +126,7 @@ summary_and_cleanup () {
         fi
         rc=1
     fi
-    
+
     echo $(date +'%F %H:%M:%S') Terminating clients loads ...
     echo "$0" >> $END_RUN_FILE
     local result=PASS
@@ -171,7 +171,7 @@ Status: $result: rc=$rc"
 }
 
 #
-# MAIN 
+# MAIN
 #
 
 log "-----============= $0 starting =============-----"
@@ -203,38 +203,37 @@ CURRENT_TS=$START_TS
 
 while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
 
-    # In order to perform the 
+    # In order to perform the
     # expected number of failovers, we need to account the following :
     # 1) the time that has elapsed during the client load checking
     # 2) time takes for failover
 
     it_time_start=$(date +%s)
-    
+
     SERVERFACET=$(get_random_entry $SERVERS)
     var=${SERVERFACET}_numfailovers
 
-    # Check that our client loads are still running. If any have died, 
-    # that means they have died outside of recovery, which is unacceptable.    
+    # Check that our client loads are still running. If any have died,
+    # that means they have died outside of recovery, which is unacceptable.
 
     log "==== Checking the clients loads BEFORE failover -- failure NOT OK \
-    ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD" 
+    ELAPSED=$ELAPSED DURATION=$DURATION PERIOD=$SERVER_FAILOVER_PERIOD"
 
     if ! check_client_loads $NODES_TO_USE; then
         exit 4
     fi
 
     log "Wait $SERVERFACET recovery complete before doing next failover ...."
-    if [[ $(server_numfailovers $SERVERFACET) != 0 ]]; then
-        if ! wait_recovery_complete $SERVERFACET ; then
-            echo "$SERVERFACET recovery is not completed!"
-            exit 7
-        fi
+
+    if ! wait_recovery_complete $SERVERFACET ; then
+        echo "$SERVERFACET recovery is not completed!"
+        exit 7
     fi
 
     log "Checking clients are in FULL state before doing next failover"
     if ! wait_clients_import_state $NODES_TO_USE $SERVERFACET FULL; then
         echo "Clients import not FULL, please consider to increase SERVER_FAILOVER_PERIOD=$SERVER_FAILOVER_PERIOD !"
-        
+
     fi
 
     log "Starting failover on $SERVERFACET"
@@ -252,10 +251,10 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
     # Increment the number of failovers
     val=$((${!var} + 1))
     eval $var=$val
-    
+
     CURRENT_TS=$(date +%s)
     ELAPSED=$((CURRENT_TS - START_TS))
-    
+
     sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
 
     # keep count the number of itterations when
@@ -269,8 +268,8 @@ This iteration, the load was only applied for sleep=$sleep seconds.
 Estimated max recovery time : $max_recov_time
 Probably the hardware is taking excessively long to boot.
 Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918"
-        [ $reqfail -gt $REQFAIL ] && exit 6 
-    fi  
+        [ $reqfail -gt $REQFAIL ] && exit 6
+    fi
 
     log "$SERVERFACET has failed over ${!var} times, and counting..."
 
@@ -278,7 +277,7 @@ Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug
         break
     fi
 
-    if [ $sleep -gt 0 ]; then 
+    if [ $sleep -gt 0 ]; then
         echo "sleeping $sleep seconds ... "
         sleep $sleep
     fi
diff --git a/lustre/tests/recovery-random-scale.sh b/lustre/tests/recovery-random-scale.sh
index 57fe798..ce1ba18 100644
--- a/lustre/tests/recovery-random-scale.sh
+++ b/lustre/tests/recovery-random-scale.sh
@@ -122,7 +122,7 @@ summary_and_cleanup () {
     # the one we are really interested in.
     if [ -n "$END_RUN_NODE" ]; then
         var=$(client_var_name $END_RUN_NODE)_load
-        echo "Client load failed on node $END_RUN_NODE" 
+        echo "Client load failed on node $END_RUN_NODE"
         echo
         echo "client $END_RUN_NODE load stdout and debug files :
               ${TESTSUITELOG}_run_${!var}.sh-${END_RUN_NODE}
@@ -245,11 +245,11 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
     log "Starting failover on $SERVERFACET"
 
     facet_failover "$SERVERFACET" || exit 1
-    if ! wait_recovery_complete $SERVERFACET $((TIMEOUT * 10)); then
+    if ! wait_recovery_complete $SERVERFACET ; then
         echo "$SERVERFACET recovery is not completed!"
         exit 7
     fi
-    
+
     boot_node $FAIL_CLIENT
     echo "Reintegrating $FAIL_CLIENT"
     zconf_mount $FAIL_CLIENT $MOUNT || exit $?
@@ -268,10 +268,10 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
     # not for all clients.
     if [ -e $END_RUN_FILE ]; then
         read END_RUN_NODE < $END_RUN_FILE
-        [[ $END_RUN_NODE = $FAIL_CLIENT ]] && 
+        [[ $END_RUN_NODE = $FAIL_CLIENT ]] &&
             rm -f $END_RUN_FILE || exit 13
     fi
-    
+
     restart_client_loads $FAIL_CLIENT $ERRORS_OK || exit $?
 
     # Check that not failed clients loads are still running.
@@ -285,7 +285,6 @@ while [ $ELAPSED -lt $DURATION -a ! -e $END_RUN_FILE ]; do
 
     CURRENT_TS=$(date +%s)
     ELAPSED=$((CURRENT_TS - START_TS))
-
     sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
 
     # keep count the number of itterations when
@@ -299,8 +298,8 @@ This iteration, the load was only applied for sleep=$sleep seconds.
 Estimated max recovery time : $max_recov_time
 Probably the hardware is taking excessively long to boot.
 Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), bug 20918"
-        [ $reqfail -gt $REQFAIL ] && exit 6 
-    fi  
+        [ $reqfail -gt $REQFAIL ] && exit 6
+    fi
 
     log " Number of failovers:
 $(numfailovers) and counting..."
 
@@ -309,7 +308,7 @@ $(numfailovers) and counting..."
         break
     fi
 
-    if [ $sleep -gt 0 ]; then 
+    if [ $sleep -gt 0 ]; then
         echo "sleeping $sleep seconds ... "
         sleep $sleep
     fi
diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh
index 562def5..3431fc8 100755
--- a/lustre/tests/recovery-small.sh
+++ b/lustre/tests/recovery-small.sh
@@ -12,13 +12,6 @@ init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
 init_logging
 
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
-    CONFIG_EXCEPTIONS="52"
-    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
-    echo "Except the tests: $CONFIG_EXCEPTIONS"
-    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
 require_dsh_mds || exit 0
 
 # also long tests: 19, 21a, 21e, 21f, 23, 27
diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh
index 44e257a..c92bf35 100755
--- a/lustre/tests/replay-dual.sh
+++ b/lustre/tests/replay-dual.sh
@@ -16,13 +16,6 @@ CLEANUP=${CLEANUP:-""}
 MOUNT_2=${MOUNT_2:-"yes"}
 
 . $LUSTRE/tests/test-framework.sh
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
-    CONFIG_EXCEPTIONS="17"
-    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
-    echo "Except the tests: $CONFIG_EXCEPTIONS"
-    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
 init_test_env $@
 . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
 init_logging
diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh
index 5eb0222..82f9836 100755
--- a/lustre/tests/replay-single.sh
+++ b/lustre/tests/replay-single.sh
@@ -23,13 +23,6 @@ require_dsh_mds || exit 0
 # bug number: 17466 18857
 ALWAYS_EXCEPT="61d 33a 33b $REPLAY_SINGLE_EXCEPT"
 
-if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then
-    CONFIG_EXCEPTIONS="0b 42 47 61a 61c"
-    echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. "
-    echo "Except the tests: $CONFIG_EXCEPTIONS"
-    ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
-fi
-
 # 63 min 7 min AT AT AT AT"
 [ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68"
 
@@ -1538,9 +1531,9 @@ test_61d() { # bug 16002 # bug 17466 # bug 22137
     # OBD_FAIL_OBD_LLOG_SETUP 0x605
     stop mgs
     do_facet mgs "lctl set_param fail_loc=0x80000605"
-    start mgs $MGSDEV $mgs_MOUNT_OPTS && error "mgs start should have failed"
+    start mgs $MGSDEV $MGS_MOUNT_OPTS && error "mgs start should have failed"
     do_facet mgs "lctl set_param fail_loc=0"
-    start mgs $MGSDEV $mgs_MOUNT_OPTS || error "cannot restart mgs"
+    start mgs $MGSDEV $MGS_MOUNT_OPTS || error "cannot restart mgs"
 }
 run_test 61d "error in llog_setup should cleanup the llog context correctly"
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 67bb9f8..a56027f 100644
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -859,12 +859,18 @@ fi
 fi"
 }
 
-shudown_node_hard () {
+shutdown_node () {
+    local node=$1
+    echo + $POWER_DOWN $node
+    $POWER_DOWN $node
+}
+
+shutdown_node_hard () {
     local host=$1
     local attempts=3
 
     for i in $(seq $attempts) ; do
-        $POWER_DOWN $host
+        shutdown_node $host
         sleep 1
         ping -w 3 -c 1 $host > /dev/null 2>&1 || return 0
         echo "waiting for $host to fail attempts=$attempts"
@@ -879,21 +885,44 @@ shutdown_client() {
     local attempts=3
 
     if [ "$FAILURE_MODE" = HARD ]; then
-        shudown_node_hard $client
+        shutdown_node_hard $client
     else
        zconf_umount_clients $client $mnt -f
     fi
 }
 
+facets_on_host () {
+    local host=$1
+    local facets="$(get_facets OST),$(get_facets MDS)"
+    local affected
+
+    combined_mgs_mds || facets="$facets,mgs"
+
+    for facet in ${facets//,/ }; do
+        if [ $(facet_active_host $facet) == $host ]; then
+           affected="$affected $facet"
+        fi
+    done
+
+    echo $(comma_list $affected)
+}
+
 shutdown_facet() {
     local facet=$1
+
     if [ "$FAILURE_MODE" = HARD ]; then
-        shudown_node_hard $(facet_active_host $facet)
-    elif [ "$FAILURE_MODE" = SOFT ]; then
+        shutdown_node_hard $(facet_active_host $facet)
+    else
         stop $facet
     fi
 }
 
+reboot_node() {
+    local node=$1
+    echo + $POWER_UP $node
+    $POWER_UP $node
+}
+
 remount_facet() {
     local facet=$1
 
@@ -902,9 +931,9 @@ remount_facet() {
 }
 
 reboot_facet() {
-    facet=$1
+    local facet=$1
     if [ "$FAILURE_MODE" = HARD ]; then
-        $POWER_UP `facet_active_host $facet`
+        reboot_node $(facet_active_host $facet)
     else
         sleep 10
     fi
@@ -913,7 +942,7 @@ boot_node() {
     local node=$1
     if [ "$FAILURE_MODE" = HARD ]; then
-        $POWER_UP $node
+        reboot_node $node
        wait_for_host $node
     fi
 }
@@ -1137,39 +1166,68 @@ wait_delete_completed () {
 }
 
 wait_for_host() {
-    local host=$1
-    check_network "$host" 900
-    while ! do_node $host hostname > /dev/null; do sleep 5; done
+    local hostlist=$1
+
+    # we can use "for" here because we are waiting the slowest
+    for host in ${hostlist//,/ }; do
+        check_network "$host" 900
+    done
+    while ! do_nodes $hostlist hostname > /dev/null; do sleep 5; done
 }
 
-wait_for() {
-    local facet=$1
-    local host=`facet_active_host $facet`
-    wait_for_host $host
+wait_for_facet() {
+    local facetlist=$1
+    local hostlist
+
+    for facet in ${facetlist//,/ }; do
+        hostlist=$(expand_list $hostlist $(facet_active_host $facet))
+    done
+    wait_for_host $hostlist
 }
 
-wait_recovery_complete () {
-    local facet=$1
+_wait_recovery_complete () {
+    local param=$1
 
     # Use default policy if $2 is not passed by caller.
     local MAX=${2:-$(max_recovery_time)}
 
-    local var_svc=${facet}_svc
-    local procfile="*.${!var_svc}.recovery_status"
 
     local WAIT=0
     local STATUS=
 
     while [ $WAIT -lt $MAX ]; do
-        STATUS=$(do_facet $facet lctl get_param -n $procfile | grep status)
-        [[ $STATUS = "status: COMPLETE" ]] && return 0
+        STATUS=$(lctl get_param -n $param | grep status)
+        echo $param $STATUS
+        [[ $STATUS = "status: COMPLETE" || $STATUS = "status: INACTIVE" ]] && return 0
         sleep 5
         WAIT=$((WAIT + 5))
-        echo "Waiting $((MAX - WAIT)) secs for $facet recovery done. $STATUS"
+        echo "Waiting $((MAX - WAIT)) secs for $param recovery done. $STATUS"
     done
-    echo "$facet recovery not done in $MAX sec. $STATUS"
+    echo "$param recovery not done in $MAX sec. $STATUS"
     return 1
 }
 
+wait_recovery_complete () {
+    local facet=$1
+
+    # with an assumption that at_max is the same on all nodes
+    local MAX=${2:-$(max_recovery_time)}
+
+    local facets=$facet
+    if [ "$FAILURE_MODE" = HARD ]; then
+        facets=$(facets_on_host $(facet_active_host $facet))
+    fi
+    echo affected facets: $facets
+
+    # we can use "for" here because we are waiting the slowest
+    for facet in ${facets//,/ }; do
+        local var_svc=${facet}_svc
+        local param="*.${!var_svc}.recovery_status"
+
+        local host=$(facet_active_host $facet)
+        do_rpc_nodes $host _wait_recovery_complete $param $MAX
+    done
+}
+
 wait_mds_ost_sync () {
     # just because recovery is done doesn't mean we've finished
     # orphan cleanup. Wait for llogs to get synchronized.
@@ -1316,15 +1374,36 @@ client_reconnect() {
 facet_failover() {
     local facet=$1
     local sleep_time=$2
-    echo "Failing $facet on node `facet_active_host $facet`"
+    local host=$(facet_active_host $facet)
+
+    echo "Failing $facet on node $host"
+
+    local affected=$facet
+
+    if [ "$FAILURE_MODE" = HARD ]; then
+        affected=$(facets_on_host $host)
+    fi
+
     shutdown_facet $facet
+
+    echo affected facets: $affected
+
     [ -n "$sleep_time" ] && sleep $sleep_time
+
     reboot_facet $facet
-    change_active $facet
-    local TO=`facet_active_host $facet`
-    echo "Failover $facet to $TO"
-    wait_for $facet
-    mount_facet $facet || error "Restart of $facet failed"
+
+    change_active $affected
+
+    wait_for_facet $affected
+    # start mgs first if it is affected
+    if ! combined_mgs_mds && list_member $affected mgs; then
+        mount_facet mgs || error "Restart of mgs failed"
+    fi
+    # FIXME; has to be changed to mount all facets concurrently
+    affected=$(exclude_items_from_list $affected mgs)
+    for facet in ${affected//,/ }; do
+        mount_facet $facet || error "Restart of $facet on node $host failed!"
+    done
 }
 
 obd_name() {
@@ -1482,10 +1561,16 @@ facet_active_host() {
 }
 
 change_active() {
-    local facet=$1
+    local facetlist=$1
+    local facet
+
+    facetlist=$(exclude_items_from_list $facetlist mgs)
+
+    for facet in ${facetlist//,/ }; do
     local failover=${facet}failover
-    host=`facet_host $failover`
+    local host=`facet_host $failover`
     [ -z "$host" ] && return
+
     local curactive=`facet_active $facet`
     if [ -z "${curactive}" -o "$curactive" == "$failover" ] ; then
         eval export ${facet}active=$facet
@@ -1495,6 +1580,9 @@ change_active() {
     # save the active host for this facet
     local activevar=${facet}active
     echo "$activevar=${!activevar}" > $TMP/$activevar
+    local TO=`facet_active_host $facet`
+    echo "Failover $facet to $TO"
+    done
 }
 
 do_node() {
@@ -1697,20 +1785,60 @@ cleanupall() {
     cleanup_gss
 }
 
-mdsmkfsopts()
-{
-    local nr=$1
-    test $nr = 1 && echo -n $MDS_MKFS_OPTS || echo -n $MDSn_MKFS_OPTS
-}
-
 combined_mgs_mds () {
     [[ $MDSDEV1 = $MGSDEV ]] && [[ $mds1_HOST = $mgs_HOST ]]
 }
 
+mkfs_opts () {
+    local facet=$1
+
+    local tgt=$(echo $facet | tr -d [:digit:] | tr "[:lower:]" "[:upper:]")
+    local optvar=${tgt}_MKFS_OPTS
+    local opt=${!optvar}
+
+    # FIXME: ! combo mgs/mds + mgsfailover is not supported yet
+    [[ $facet = mgs ]] && echo $opt && return
+
+    # 1.
+    # --failnode options
+    local var=${facet}failover_HOST
+    if [ x"${!var}" != x ] && [ x"${!var}" != x$(facet_host $facet) ] ; then
+        local failnode=$(h2$NETTYPE ${!var})
+        failnode="--failnode=$failnode"
+        # options does not contain
+        # or contains wrong --failnode=
+        if [[ $opt != *${failnode}* ]]; then
+            opt=$(echo $opt | sed 's/--failnode=.* / /')
+            opt="$opt $failnode"
+        fi
+    fi
+
+    # 2.
+    # --mgsnode options
+    # no additional mkfs mds "--mgsnode" option for this configuration
+    if [[ $facet = mds ]] && combined_mgs_mds; then
+        echo $opt
+        return
+    fi
+
+    # additional mkfs "--mgsnode"
+    local mgsnode="--mgsnode=$MGSNID"
+    opt=${opt//$mgsnode }
+    for nid in ${MGSNID//:/ }; do
+        local mgsnode="--mgsnode=$nid"
+        # options does not contain
+        # --mgsnode=$nid
+        if [[ $opt != *${mgsnode}" "* ]]; then
+            opt="$opt --mgsnode=$nid"
+        fi
+    done
+
+    echo $opt
+}
+
 formatall() {
     if [ "$IAMDIR" == "yes" ]; then
         MDS_MKFS_OPTS="$MDS_MKFS_OPTS --iam-dir"
-        MDSn_MKFS_OPTS="$MDSn_MKFS_OPTS --iam-dir"
     fi
 
     [ "$FSTYPE" ] && FSTYPE_OPT="--backfstype $FSTYPE"
@@ -1721,24 +1849,26 @@ formatall() {
     [ "$CLIENTONLY" ] && return
     echo Formatting mgs, mds, osts
     if ! combined_mgs_mds ; then
-        add mgs $mgs_MKFS_OPTS $FSTYPE_OPT --reformat $MGSDEV || exit 10
+        add mgs $(mkfs_opts mgs) $FSTYPE_OPT --reformat $MGSDEV || exit 10
     fi
 
     for num in `seq $MDSCOUNT`; do
        echo "Format mds$num: $(mdsdevname $num)"
        if $VERBOSE; then
-           add mds$num `mdsmkfsopts $num` $FSTYPE_OPT --reformat `mdsdevname $num` || exit 9
+           add mds$num $(mkfs_opts mds) $FSTYPE_OPT --reformat $(mdsdevname $num) || exit 10
        else
-           add mds$num `mdsmkfsopts $num` $FSTYPE_OPT --reformat `mdsdevname $num` > /dev/null || exit 9
+           add mds$num $(mkfs_opts mds) $FSTYPE_OPT --reformat $(mdsdevname $num) > /dev/null || exit 10
        fi
     done
 
+    # the ost-s could have different OST_MKFS_OPTS
+    # because of different failnode-s
    for num in `seq $OSTCOUNT`; do
        echo "Format ost$num: $(ostdevname $num)"
        if $VERBOSE; then
-           add ost$num $OST_MKFS_OPTS --reformat `ostdevname $num` || exit 10
+           add ost$num $(mkfs_opts ost${num}) $FSTYPE_OPT --reformat `ostdevname $num` || exit 10
        else
-           add ost$num $OST_MKFS_OPTS --reformat `ostdevname $num` > /dev/null || exit 10
+           add ost$num $(mkfs_opts ost${num}) $FSTYPE_OPT --reformat `ostdevname $num` > /dev/null || exit 10
        fi
    done
 }
@@ -1820,7 +1950,7 @@ setupall() {
         echo $WRITECONF | grep -q "writeconf" && \
             writeconf_all
         if ! combined_mgs_mds ; then
-            start mgs $MGSDEV $mgs_MOUNT_OPTS
+            start mgs $MGSDEV $MGS_MOUNT_OPTS
         fi
 
         for num in `seq $MDSCOUNT`; do
@@ -1929,6 +2059,8 @@ init_facets_vars () {
         done
     fi
 
+    combined_mgs_mds || init_facet_vars mgs $MGSDEV $MGS_MOUNT_OPTS
+
     remote_ost_nodsh && return
 
     for num in `seq $OSTCOUNT`; do
@@ -2350,6 +2482,12 @@ comma_list() {
     echo "$*" | tr -s " " "\n" | sort -b -u | tr "\n" " " | sed 's/ \([^$]\)/,\1/g'
 }
 
+list_member () {
+    local list=$1
+    local item=$2
+    echo $list | grep -qw $item
+}
+
 # list, excluded are the comma separated lists
 exclude_items_from_list () {
     local list=$1
@@ -2360,7 +2498,7 @@ exclude_items_from_list () {
     for item in ${excluded//,/ }; do
         list=$(echo " $list " | sed -re "s/\s+$item\s+/ /g")
     done
-    echo $(comma_list $list) 
+    echo $(comma_list $list)
 }
 
 # list, expand are the comma separated lists
@@ -2398,13 +2536,23 @@ absolute_path() {
 }
 
 get_facets () {
-    local name=$(echo $1 | tr "[:upper:]" "[:lower:]")
-    local type=$(echo $1 | tr "[:lower:]" "[:upper:]")
+    local types=${1:-"OST MDS MGS"}
     local list=""
-    local count=${type}COUNT
-    for ((i=1; i<=${!count}; i++)) do
-        list="$list ${name}$i"
+
+    for entry in $types; do
+        local name=$(echo $entry | tr "[:upper:]" "[:lower:]")
+        local type=$(echo $entry | tr "[:lower:]" "[:upper:]")
+
+        case $type in
+                MGS ) list="$list $name";;
+            MDS|OST ) local count=${type}COUNT
+                      for ((i=1; i<=${!count}; i++)) do
+                          list="$list ${name}$i"
+                      done;;
+                  * ) error "Invalid facet type"
+                      exit 1;;
+        esac
     done
     echo $(comma_list $list)
 }
@@ -3025,19 +3173,30 @@ remote_servers () {
     remote_ost && remote_mds
 }
 
-osts_nodes () {
-    local OSTNODES=$(facet_host ost1)
+facets_nodes () {
+    local facets=$1
+    local nodes
     local NODES_sort
 
-    for num in `seq $OSTCOUNT`; do
-        local myOST=$(facet_host ost$num)
-        OSTNODES="$OSTNODES $myOST"
+    for facet in ${facets//,/ }; do
+        if [ "$FAILURE_MODE" = HARD ]; then
+            nodes="$nodes $(facet_active_host $facet)"
+        else
+            nodes="$nodes $(facet_host $facet)"
+        fi
     done
-    NODES_sort=$(for i in $OSTNODES; do echo $i; done | sort -u)
+    NODES_sort=$(for i in $nodes; do echo $i; done | sort -u)
 
     echo $NODES_sort
 }
 
+osts_nodes () {
+    local facets=$(get_facets OST)
+    local nodes=$(facets_nodes $facets)
+
+    echo $nodes
+}
+
 nodes_list () {
     # FIXME. We need a list of clients
     local myNODES=$HOSTNAME
@@ -3047,7 +3206,7 @@ nodes_list () {
     [ -n "$CLIENTS" ] && myNODES=${CLIENTS//,/ }
 
     if [ "$PDSH" -a "$PDSH" != "no_dsh" ]; then
-        myNODES="$myNODES $(osts_nodes) $(mdts_nodes)"
+        myNODES="$myNODES $(facets_nodes $(get_facets))"
     fi
 
     myNODES_sort=$(for i in $myNODES; do echo $i; done | sort -u)
@@ -3407,7 +3566,7 @@ convert_facet2label() {
 get_clientosc_proc_path() {
     local ost=$1
 
-    echo "{$1}-osc-*"
+    echo "${1}-osc-*"
 }
 
 get_lustre_version () {
@@ -3474,33 +3633,37 @@ get_osc_import_name() {
     return 0
 }
 
-wait_import_state () {
+_wait_import_state () {
     local expected=$1
     local CONN_PROC=$2
+    local maxtime=${3:-max_recovery_time}
     local CONN_STATE
     local i=0
 
     CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2)
     while [ "${CONN_STATE}" != "${expected}" ]; do
-        if [ "${expected}" == "DISCONN" ]; then
-            # for disconn we can check after proc entry is removed
-            [ "x${CONN_STATE}" == "x" ] && return 0
-            # with AT we can have connect request timeout ~ reconnect timeout
-            # and test can't see real disconnect
-            [ "${CONN_STATE}" == "CONNECTING" ] && return 0
-        fi
-        # disconnect rpc should be wait not more obd_timeout
-        [ $i -ge $(($TIMEOUT * 3 / 2)) ] && \
-            error "can't put import for $CONN_PROC into ${expected} state" && return 1
+        [ $i -ge $maxtime ] && \
+            error "can't put import for $CONN_PROC into ${expected} state after $i sec, have ${CONN_STATE}" && \
+            return 1
        sleep 1
        CONN_STATE=$($LCTL get_param -n $CONN_PROC 2>/dev/null | cut -f2)
        i=$(($i + 1))
     done
 
-    log "$CONN_PROC now in ${CONN_STATE} state"
+    log "$CONN_PROC in ${CONN_STATE} state after $i sec"
     return 0
 }
 
+wait_import_state() {
+    local state=$1
+    local params=$2
+    local maxtime=${3:-max_recovery_time}
+    local param
+
+    for param in ${params//,/ }; do
+        _wait_import_state $state $param $maxtime || return
+    done
+}
 wait_osc_import_state() {
     local facet=$1
     local ost_facet=$2
@@ -3548,8 +3711,14 @@ wait_clients_import_state () {
     local list=$1
     local facet=$2
     local expected=$3
-    shift
 
+    local facets=$facet
+
+    if [ "$FAILURE_MODE" = HARD ]; then
+        facets=$(facets_on_host $(facet_active_host $facet))
+    fi
+
+    for facet in ${facets//,/ }; do
     local label=$(convert_facet2label $facet)
     local proc_path
     case $facet in
@@ -3557,8 +3726,10 @@ wait_clients_import_state () {
        mds* ) proc_path="mdc.$(get_clientmdc_proc_path $label).mds_server_uuid" ;;
        *) error "unknown facet!" ;;
     esac
+    local params=$(expand_list $params $proc_path)
+    done
 
-    if ! do_rpc_nodes $list wait_import_state $expected $proc_path; then
+    if ! do_rpc_nodes $list wait_import_state $expected $params; then
         error "import is not in ${expected} state"
         return 1
     fi