From cd964f28831937c91a2473d4d0ad70e5787078eb Mon Sep 17 00:00:00 2001 From: grev Date: Sat, 7 Jun 2008 21:28:16 +0000 Subject: [PATCH] b=15711 i=Tappro t-f and insanity fix to work properly with FAILURE_MODE=HARD --- lustre/tests/Makefile.am | 4 +- lustre/tests/cfg/ncli.sh | 1 + lustre/tests/insanity.sh | 34 +++++++---- lustre/tests/test-framework.sh | 128 +++++++++++++++++++++++++++++++++++------ 4 files changed, 135 insertions(+), 32 deletions(-) diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 60dedf1..db8fa3f 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -12,8 +12,8 @@ noinst_SCRIPTS += conf-sanity.sh insanity.sh lfscktest.sh oos.sh oos2.sh noinst_SCRIPTS += llog-test.sh recovery-small.sh replay-dual.sh sanity-quota.sh noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityN.sh noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause -nobase_noinst_SCRIPTS = cfg/insanity-local.sh -nobase_noinst_SCRIPTS += cfg/local.sh acl/make-tree acl/run cfg/ncli.sh +nobase_noinst_SCRIPTS = cfg/local.sh +nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh nobase_noinst_DATA = acl/cp.test acl/getfacl-noacl.test acl/inheritance.test nobase_noinst_DATA += acl/misc.test acl/permissions.test acl/setfacl.test diff --git a/lustre/tests/cfg/ncli.sh b/lustre/tests/cfg/ncli.sh index 2735b9a..6dfae9b 100644 --- a/lustre/tests/cfg/ncli.sh +++ b/lustre/tests/cfg/ncli.sh @@ -15,3 +15,4 @@ CLIENTCOUNT=$((${#REMOTECLIENTS[@]} + 1)) [ -n "$RCLIENTS" -a "$PDSH" = "no_dsh" ] && \ error "tests for remote clients $RCLIENTS needs pdsh != do_dsh " || true +[ -n "$FUNCTIONS" ] && . $FUNCTIONS || true diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 011e66c..cd4ffaf 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -8,10 +8,17 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ -. ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh} +. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" +if [ "$FAILURE_MODE" = "HARD" ]; then + mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \ + echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " && \ + echo "Except the tests: $CONFIG_EXCEPTIONS" && \ + ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS" +fi + # [ "$SLOW" = "no" ] && EXCEPT_SLOW="" @@ -20,13 +27,17 @@ CLEANUP=${CLEANUP:-""} build_test_filter +SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME} +LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT} +FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS} + assert_env mds_HOST MDS_MKFS_OPTS MDSDEV assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT assert_env LIVE_CLIENT FSNAME # This can be a regexp, to allow more clients -CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"} +CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS`"} DIR=${DIR:-$MOUNT} @@ -69,11 +80,14 @@ reboot_node() { fail_clients() { num=$1 + + log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM" if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then num=$((FAIL_NUM - DOWN_NUM)) fi if [ -z "$num" ] || [ "$num" -le 0 ]; then + log "No clients failed!" return fi @@ -156,15 +170,11 @@ test_0() { echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 1; } - facet_failover ost1 || return 4 - echo "Waiting for df pid: $DFPID" - wait $DFPID || { echo "df returned $?" && return 2; } - - if [ $OSTCOUNT -gt 1 ]; then - facet_failover ost2 || return 5 - echo "Waiting for df pid: $DFPID" - wait $DFPID || { echo "df returned $?" && return 3; } - fi + for i in $(seq $OSTCOUNT) ; do + facet_failover ost$i || return 4 + echo "Waiting for df pid: $DFPID" + wait $DFPID || { echo "df returned $?" && return 3; } + done return 0 } run_test 0 "Fail all nodes, independently" @@ -293,7 +303,7 @@ run_test 4 "Fourth Failure Mode: OST/MDS `date`" ############### Fifth Failure Mode ############### test_5() { - [ $OSTCOUNT -lt 1 ] && skip "$OSTCOUNT < 1, not enough OSTs" && return 0 + [ $OSTCOUNT -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0 echo "Fifth Failure Mode: OST/OST `date`" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 14ee67a..24e09aa 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -66,7 +66,6 @@ print_summary () { init_test_env() { export LUSTRE=`absolute_path $LUSTRE` export TESTSUITE=`basename $0 .sh` - export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} [ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} @@ -114,6 +113,7 @@ init_test_env() { ONLY=${ONLY:-$*} [ "$TESTSUITELOG" ] && rm -f $TESTSUITELOG || true + rm -f $TMP/*active } @@ -330,7 +330,7 @@ zconf_mount() { exit 1 fi - echo "Starting client: $client: $OPTIONS $device $mnt" + echo "Starting client: $client: $OPTIONS $device $mnt" do_node $client mkdir -p $mnt do_node $client mount -t lustre $OPTIONS $device $mnt || return 1 @@ -354,14 +354,30 @@ zconf_umount() { } zconf_mount_clients() { + local OPTIONS local clients=$1 local mnt=$2 - echo "Mounting clients: $clients" - local client - for client in ${clients//,/ }; do - zconf_mount $client $mnt || true - done + + # Only supply -o to mount if we have options + if [ -n "$MOUNTOPT" ]; then + OPTIONS="-o $MOUNTOPT" + fi + local device=$MGSNID:/$FSNAME + if [ -z "$mnt" -o -z "$FSNAME" ]; then + echo Bad zconf mount command: opt=$OPTIONS dev=$device mnt=$mnt + exit 1 + fi + + echo "Starting client $clients: $OPTIONS $device $mnt" + do_nodes $clients mkdir -p $mnt + do_nodes $clients mount -t lustre $OPTIONS $device $mnt || return 1 + + do_nodes $clients "sysctl -w lnet.debug=$PTLDEBUG; + sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }; + sysctl -w lnet.debug_mb=${DEBUG_SIZE};" + + return 0 } zconf_umount_clients() { @@ -369,11 +385,8 @@ zconf_umount_clients() { local mnt=$2 [ "$3" ] && force=-f - echo "Umounting clients: $clients" - local client - for client in ${clients//,/ }; do - zconf_umount $client $mnt $force || true - done + echo "Stopping clients: $clients $mnt (opts:$force)" + do_nodes $clients umount $force $mnt } shutdown_facet() { @@ -650,8 +663,8 @@ facet_active() { local facet=$1 local activevar=${facet}active - if [ -f ./${facet}active ] ; then - source ./${facet}active + if [ -f $TMP/${facet}active ] ; then + source $TMP/${facet}active fi active=${!activevar} @@ -685,7 +698,7 @@ change_active() { fi # save the active host for this facet activevar=${facet}active - echo "$activevar=${!activevar}" > ./$activevar + echo "$activevar=${!activevar}" > $TMP/$activevar } do_node() { @@ -717,6 +730,46 @@ do_node() { return ${PIPESTATUS[0]} } +do_nodes() { + local nodes=$1 + shift + + nodes=${nodes//,/ } + # split list to local and remote + local rnodes=$(echo " $nodes " | sed -re "s/\s+$HOSTNAME\s+/ /g") + + if [ "$(get_node_count $nodes)" != "$(get_node_count $rnodes)" ]; then + do_node $HOSTNAME $@ + fi + + [ -z "$(echo $rnodes)" ] && return 0 + + # This is part from do_node + local myPDSH=$PDSH + + rnodes=$(comma_list $rnodes) + [ -z "$myPDSH" -o "$myPDSH" = "no_dsh" ] && \ + echo "cannot run remote command on $rnodes with $myPDSH" && return 128 + + if $VERBOSE; then + echo "CMD: $rnodes $@" >&2 + $myPDSH $rnodes $LCTL mark "$@" > /dev/null 2>&1 || : + fi + + if [ "$myPDSH" = "rsh" ]; then +# we need this because rsh does not return exit code of an executed command + local command_status="$TMP/cs" + rsh $rnodes ":> $command_status" + rsh $rnodes "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; + cd $RPWD; sh -c \"$@\") || + echo command failed >$command_status" + [ -n "$($myPDSH $rnodes cat $command_status)" ] && return 1 || true + return 0 + fi + $myPDSH $rnodes "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; cd $RPWD; sh -c \"$@\")" | sed -re "s/\w+:\s//g" + return ${PIPESTATUS[0]} +} + do_facet() { facet=$1 shift @@ -730,7 +783,7 @@ add() { shift # make sure its not already running stop ${facet} -f - rm -f ${facet}active + rm -f $TMP/${facet}active do_facet ${facet} $MKFS $* } @@ -759,14 +812,21 @@ stopall() { if [ -n "$CLIENTS" ]; then zconf_umount_clients $CLIENTS $MOUNT "$*" || true - zconf_umount_clients $CLIENTS $MOUNT2 "$*" || true + [ -n "$MOUNT2" ] && { zconf_umount_clients $CLIENTS $MOUNT2 "$*" || true; } fi [ "$CLIENTONLY" ] && return + # The add fn does rm ${facet}active file, this would be enough + # if we use do_facet only after the facet added, but + # currently we use do_facet mds in local.sh stop mds -f + rm -f ${TMP}/mds${num}active + for num in `seq $OSTCOUNT`; do stop ost$num -f + rm -f $TMP/ost${num}active done + return 0 } @@ -826,10 +886,22 @@ setupall() { || do_facet mds "$TUNEFS --writeconf $MDSDEV" set_obd_timeout mds $TIMEOUT start mds $MDSDEV $MDS_MOUNT_OPTS + # We started mds, now we should set failover variable properly. + # Set mdsfailover_HOST if it is not set (the default failnode). + mdsfailover_HOST=$(facet_host mds) + for num in `seq $OSTCOUNT`; do DEVNAME=`ostdevname $num` set_obd_timeout ost$num $TIMEOUT start ost$num $DEVNAME $OST_MOUNT_OPTS + + # We started ost$num, now we should set ost${num}failover variable properly. + # Set ost${num}failover_HOST if it is not set (the default failnode). + varname=ost${num}failover_HOST + if [ -z "${!varname}" ]; then + eval ost${num}failover_HOST=$(facet_host ost${num}) + fi + done fi [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE @@ -1055,6 +1127,15 @@ set_nodes_failloc () { done } +set_nodes_failloc () { + local nodes=$1 + local node + + for node in $nodes ; do + do_node $node sysctl -w lustre.fail_loc=$2 + done +} + cancel_lru_locks() { $LCTL mark "cancel_lru_locks $1 start" for d in `find $LPROC/ldlm/namespaces | egrep -i $1`; do @@ -1164,7 +1245,7 @@ build_test_filter() { eval ONLY_${O}=true done [ "$EXCEPT$ALWAYS_EXCEPT" ] && \ - log "skipping tests: `echo $EXCEPT $ALWAYS_EXCEPT`" + log "excepting tests: `echo $EXCEPT $ALWAYS_EXCEPT`" [ "$EXCEPT_SLOW" ] && \ log "skipping tests SLOW=no: `echo $EXCEPT_SLOW`" for E in $EXCEPT $ALWAYS_EXCEPT; do @@ -1433,6 +1514,17 @@ is_patchless () lctl get_param version | grep -q patchless } +get_node_count() { + local nodes="$@" + echo $nodes | wc -w || true +} + +mixed_ost_devs () { + local nodes=$(osts_nodes) + local osscount=$(get_node_count "$nodes") + [ ! "$OSTCOUNT" = "$osscount" ] +} + check_runas_id_ret() { local myRC=0 local myRUNAS_ID=$1 -- 1.8.3.1