. ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh}
-ALWAYS_EXCEPT="10"
+ALWAYS_EXCEPT="10 $INSANITY_EXCEPT"
+
+# When SLOW is "no", reset EXCEPT_SLOW to the empty string.
+[ "$SLOW" = "no" ] && EXCEPT_SLOW=""
+
+SETUP=${SETUP:-""}
+CLEANUP=${CLEANUP:-""}
build_test_filter
-assert_env mds_HOST ost1_HOST ost2_HOST client_HOST LIVE_CLIENT
+assert_env mds_HOST MDS_MKFS_OPTS
+assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT
+assert_env LIVE_CLIENT FSNAME
-####
-# Initialize all the ostN_HOST
-NUMOST=2
-if [ "$EXTRA_OSTS" ]; then
- for host in $EXTRA_OSTS; do
- NUMOST=$((NUMOST + 1))
- OST=ost$NUMOST
- eval ${OST}_HOST=$host
- done
-fi
# This can be a regexp, to allow more clients
CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"}
DOWN_NUM=0
}
-gen_config() {
- rm -f $XMLCONFIG
- add_mds mds --dev $MDSDEV --size $MDSSIZE --journal-size $MDSJOURNALSIZE
-
- if [ ! -z "$mdsfailover_HOST" ]; then
- add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE
- fi
-
- add_lov lov1 mds --stripe_sz $STRIPE_BYTES\
- --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0
- for i in `seq $NUMOST`; do
- dev=`printf $OSTDEV $i`
- add_ost ost$i --lov lov1 --dev $dev --size $OSTSIZE \
- --journal-size $OSTJOURNALSIZE
- done
-
-
- add_client client mds --lov lov1 --path $MOUNT
-}
-
-setup() {
- rm -rf logs/*
- for i in `seq $NUMOST`; do
- wait_for ost$i
- start ost$i ${REFORMAT} $OSTLCONFARGS
- done
- [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
- wait_for mds
- start mds $MDSLCONFARGS ${REFORMAT}
- while ! do_node $CLIENTS "ls -d $LUSTRE" > /dev/null; do sleep 5; done
- zconf_mount $CLIENTS $MOUNT
-
-}
-
-cleanup() {
- zconf_umount $CLIENTS $MOUNT
-
- stop mds ${FORCE} $MDSLCONFARGS || :
- for i in `seq $NUMOST`; do
- stop ost$i ${REFORMAT} ${FORCE} $OSTLCONFARGS || :
- done
+# Start the OST facet numbered $1: invokes `start` with the facet name ost$1,
+# the output of `ostdevname $1`, and the contents of $OST_MOUNT_OPTS.
+# The exit status is that of `start`.
+start_ost() {
+ start ost$1 `ostdevname $1` $OST_MOUNT_OPTS
}
trap exit INT
file=$1
for c in $LIVE_CLIENT $FAIL_CLIENTS; do
if echo $DOWN_CLIENTS | grep -q $c; then continue; fi
- $PDSH $c touch $MOUNT/${c}_$file
+ $PDSH $c touch $MOUNT/${c}_$file || return 1
done
}
# do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover"
}
-node_to_ost() {
- node=$1
- retvar=$2
- for i in `seq $NUMOST`; do
- ostvar="ost${i}_HOST"
- if [ "${!ostvar}" == $node ]; then
- eval $retvar=ost${i}
- return 0
- fi
- done
- echo "No ost found for node; $node"
- return 1
-
-}
-
-
-
-if [ "$ONLY" == "cleanup" ]; then
- cleanup
- exit
-fi
-
-if [ -z "$NOSETUP" ]; then
- gen_config
- setup
-fi
-
-if [ ! -z "$EVAL" ]; then
- eval "$EVAL"
- exit $?
-fi
-
-if [ "$ONLY" == "setup" ]; then
- exit 0
-fi
+cleanup_and_setup_lustre
# 9 Different Failure Modes Combinations
echo "Starting Test 17 at `date`"
test_0() {
- echo "Failover MDS"
- facet_failover mds
+ facet_failover $SINGLEMDS
echo "Waiting for df pid: $DFPID"
- wait $DFPID || return 1
+ wait $DFPID || { echo "df returned $?" && return 1; }
- echo "Failing OST1"
- facet_failover ost1
+ facet_failover ost1 || return 4
echo "Waiting for df pid: $DFPID"
- wait $DFPID || return 2
+ wait $DFPID || { echo "df returned $?" && return 2; }
- echo "Failing OST2"
- facet_failover ost2
+ facet_failover ost2 || return 5
echo "Waiting for df pid: $DFPID"
- wait $DFPID || return 3
+ wait $DFPID || { echo "df returned $?" && return 3; }
return 0
}
run_test 0 "Fail all nodes, independently"
echo "Verify Lustre filesystem is up and running"
client_df
- echo "Failing MDS"
- shutdown_facet mds
- reboot_facet mds
+ shutdown_facet $SINGLEMDS
+ reboot_facet $SINGLEMDS
# prepare for MDS failover
- change_active mds
- reboot_facet mds
+ change_active $SINGLEMDS
+ reboot_facet $SINGLEMDS
client_df &
DFPID=$!
sleep 5
- echo "Failing OST"
shutdown_facet ost1
echo "Reintegrating OST"
reboot_facet ost1
wait_for ost1
- start ost1
+ start_ost 1 || return 2
- echo "Failover MDS"
- wait_for mds
- start mds
+ wait_for $SINGLEMDS
+ start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS || return $?
#Check FS
wait $DFPID
echo "Verify Lustre filesystem is up and running"
#MDS Portion
- facet_failover mds
+ facet_failover $SINGLEMDS
wait $DFPID || echo df failed: $?
#Check FS
reintegrate_clients || return 1
client_df || return 3
+ sleep 2 # give it a little time to fully recover before the next test
}
run_test 3 "Thirdb Failure Mode: MDS/CLIENT `date`"
###################################################
echo "Fourth Failure Mode: OST/MDS `date`"
#OST Portion
- echo "Failing OST ost1"
shutdown_facet ost1
#Check FS
echo "Test Lustre stability after OST failure"
- client_df
+ client_df &
+ DFPIDA=$!
+ sleep 5
#MDS Portion
- echo "Failing MDS"
- shutdown_facet mds
- reboot_facet mds
+ shutdown_facet $SINGLEMDS
+ reboot_facet $SINGLEMDS
# prepare for MDS failover
- change_active mds
- reboot_facet mds
+ change_active $SINGLEMDS
+ reboot_facet $SINGLEMDS
client_df &
- DFPID=$!
+ DFPIDB=$!
sleep 5
#Reintegration
echo "Reintegrating OST"
reboot_facet ost1
wait_for ost1
- start ost1
+ start_ost 1
- echo "Failover MDS"
- wait_for mds
- start mds
+ wait_for $SINGLEMDS
+ start $SINGLEMDS `mdsdevname 1` $MDS_MOUNT_OPTS
#Check FS
- wait $DFPID
+ wait $DFPIDA
+ wait $DFPIDB
clients_recover_osts ost1
echo "Test Lustre stability after MDS failover"
client_df || return 1
client_df
#OST Portion
- echo "Failing OST"
shutdown_facet ost1
reboot_facet ost1
#Check FS
echo "Test Lustre stability after OST failure"
- client_df
+ client_df &
+ DFPIDA=$!
+ sleep 5
#OST Portion
- echo "Failing OST"
shutdown_facet ost2
reboot_facet ost2
#Check FS
echo "Test Lustre stability after OST failure"
- client_df
+ client_df &
+ DFPIDB=$!
+ sleep 5
#Reintegration
echo "Reintegrating OSTs"
wait_for ost1
- start ost1
+ start_ost 1
wait_for ost2
- start ost2
+ start_ost 2
clients_recover_osts ost1
clients_recover_osts ost2
sleep $TIMEOUT
+ wait $DFPIDA
+ wait $DFPIDB
client_df || return 2
}
run_test 5 "Fifth Failure Mode: OST/OST `date`"
client_touch testfile || return 2
#OST Portion
- echo "Failing OST"
shutdown_facet ost1
reboot_facet ost1
#Check FS
echo "Test Lustre stability after OST failure"
- client_df
+ client_df &
+ DFPIDA=$!
+ sleep 5
#CLIENT Portion
echo "Failing CLIENTs"
#Check FS
echo "Test Lustre stability after CLIENTs failure"
- client_df
+ client_df &
+ DFPIDB=$!
+ sleep 5
#Reintegration
echo "Reintegrating OST/CLIENTs"
wait_for ost1
- start ost1
+ start_ost 1
reintegrate_clients
sleep 5
+ wait $DFPIDA
+ wait $DFPIDB
echo "Verifying mount"
client_df || return 3
}
client_rm testfile
#MDS Portion
- echo "Failing MDS"
- facet_failover mds
+ facet_failover $SINGLEMDS
#Check FS
echo "Test Lustre stability after MDS failover"
#OST Portion
- echo "Failing OST"
shutdown_facet ost1
reboot_facet ost1
#Check FS
echo "Test Lustre stability after OST failure"
- client_df
- $PDSH $LIVE_CLIENT "ls -l $MOUNT"
- $PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
+ client_df &
+ DFPID=$!
+ sleep 5
+ #non-failout hangs forever here
+ #$PDSH $LIVE_CLIENT "ls -l $MOUNT"
+ #$PDSH $LIVE_CLIENT "rm -f $MOUNT/*_testfile"
#Reintegration
echo "Reintegrating CLIENTs/OST"
reintegrate_clients
wait_for ost1
- start ost1
+ start_ost 1
+ wait $DFPID
client_df || return 1
client_touch testfile2 || return 2
}
run_test 10 "Running Availability for 6 hours..."
-equals_msg "Done, cleaning up"
-cleanup
+equals_msg `basename $0`: test complete, cleaning up
+check_and_cleanup_lustre
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true