- # Increment the number of failovers
- NUM_FAILOVERS=$((NUM_FAILOVERS+1))
- val=$((${!var} + 1))
- eval $var=$val
-
- CURRENT_TS=$(date +%s)
- ELAPSED=$((CURRENT_TS - START_TS))
-
- sleep=$((SERVER_FAILOVER_PERIOD-(CURRENT_TS - it_time_start)))
-
- # keep count the number of itterations when
- # time spend to failover and two client loads check exceeded
- # the value ( SERVER_FAILOVER_PERIOD - MINSLEEP )
- if [ $sleep -lt $MINSLEEP ]; then
- reqfail=$((reqfail +1))
- log "WARNING: failover and two check_client_loads time exceeded SERVER_FAILOVER_PERIOD - MINSLEEP !
-Failed to meet interval $reqfail times ( REQFAIL=$REQFAIL ); have sleep=$sleep"
- [ $reqfail -gt $REQFAIL ] && exit 6
- fi
-
- log "$SERVERFACET has failed over ${!var} times, and counting..."
- if [ $sleep -gt 0 ]; then
- echo "sleeping $sleep seconds ... "
- sleep $sleep
- fi
-done
+ # Increment the number of failovers.
+ val=$((${!var} + 1))
+ eval $var=$val
+
+ current_ts=$(date +%s)
+ ELAPSED=$((current_ts - start_ts))
+
+ sleep=$((SERVER_FAILOVER_PERIOD - (current_ts - it_time_start)))
+
+ # Count the iterations in which the time spent on the failover
+ # and the two client load checks exceeded
+ # ( SERVER_FAILOVER_PERIOD - MINSLEEP ).
+ if [ $sleep -lt $MINSLEEP ]; then
+ reqfail=$((reqfail + 1))
+ log "WARNING: failover and two check_client_loads time exceeded \
+SERVER_FAILOVER_PERIOD - MINSLEEP!
+Failed to load the filesystem with I/O for a minimum period of \
+$MINSLEEP $reqfail times ( REQFAIL=$REQFAIL ).
+This iteration, the load was only applied for sleep=$sleep seconds.
+Estimated max recovery time: $MAX_RECOV_TIME
+Probably the hardware is taking excessively long time to boot.
+Try to increase SERVER_FAILOVER_PERIOD (current is $SERVER_FAILOVER_PERIOD), \
+bug 20918"
+ [ $reqfail -gt $REQFAIL ] && exit 6
+ fi
+
+ log "$serverfacet has failed over ${!var} times, and counting..."
+
+ [ $((ELAPSED + sleep)) -ge $DURATION ] && break
+
+ if [ $sleep -gt 0 ]; then
+ echo "sleeping $sleep seconds... "
+ sleep $sleep
+ fi
+ done
+ exit 0
+}
+
+################################## Main Flow ###################################
+build_test_filter
+
+check_and_setup_lustre
+rm -rf "${DIR:?}"/[Rdfs][0-9]* # :? aborts if DIR is empty or unset
+
+MAX_RECOV_TIME=$(max_recovery_time) # shown in the failover WARNING log
+
+# The test node needs to be insulated from a Lustre failure as much as
+# possible; ideally it would not even load the Lustre modules.
+# -- umount lustre on this node
+# -- remove this node's hostname from the list of client nodes to use
+zconf_umount "$HOSTNAME" "$MOUNT"
+NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+NODES_TO_USE=$(exclude_items_from_list "$NODES_TO_USE" "$HOSTNAME")
+
+check_progs_installed "$NODES_TO_USE" "${CLIENT_LOADS[@]}"
+
+MDTS=$(get_facets MDS)
+OSTS=$(get_facets OST)
+
+# Subtest: fail over a random MDS; the pick is left to failover_target.
+test_failover_mds() {
+ failover_target "MDS"
+}
+run_test failover_mds "failover MDS"
+
+# Subtest: fail over a random OST; the pick is left to failover_target.
+test_failover_ost() {
+ failover_target "OST"
+}
+run_test failover_ost "failover OST"
+# Remount Lustre on this node (it was unmounted before the subtests ran).
+zconf_mount "$HOSTNAME" "$MOUNT" || error "mount $MOUNT on $HOSTNAME failed"
+client_up || error "start client on $HOSTNAME failed"