X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Ftests%2Ftest-framework.sh;h=5769f4b9d94ddf7e7b49f2d403d34e962b2d6db8;hb=27443f4a9453a3956fb111252324f5b53e708f0b;hp=1ea2be73a829f9cb5714b937edb193e8dc078702;hpb=9570cac99cb852d8c0ba0694e8b4d33e32539775;p=fs%2Flustre-release.git diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 1ea2be7..5769f4b 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -1,18 +1,25 @@ -#!/bin/sh +#!/bin/bash +# vim:expandtab:shiftwidth=4:softtabstop=4:tabstop=4: +trap 'echo "test-framework exiting on error"' ERR set -e +#set -x -export REFORMAT="" + +export REFORMAT=${REFORMAT:-""} export VERBOSE=false +export GMNALNID=${GMNALNID:-/usr/sbin/gmlndnid} +export CATASTROPHE=${CATASTROPHE:-/proc/sys/lnet/catastrophe} +#export PDSH="pdsh -S -Rssh -w" # eg, assert_env LUSTRE MDSNODES OSTNODES CLIENTS assert_env() { local failed="" for name in $@; do - if [ -z "${!name}" ]; then - echo "$0: $name must be set" - failed=1 - fi + if [ -z "${!name}" ]; then + echo "$0: $name must be set" + failed=1 + fi done [ $failed ] && exit 1 || true } @@ -27,16 +34,36 @@ usage() { init_test_env() { export LUSTRE=`absolute_path $LUSTRE` export TESTSUITE=`basename $0 .sh` - export XMLCONFIG="${TESTSUITE}.xml" export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} - [ -d /r ] && export ROOT=/r + [ -d /r ] && export ROOT=${ROOT:-/r} + export TMP=${TMP:-$ROOT/tmp} - export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests - export LCONF=${LCONF:-"lconf"} - export LMC=${LMC:-"lmc"} - export LCTL=${LCTL:-"lctl"} + export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/utils/gss:$LUSTRE/tests + export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} + [ ! -f "$LCTL" ] && export LCTL=$(which lctl) + export MKFS=${MKFS:-"$LUSTRE/utils/mkfs.lustre"} + [ ! -f "$MKFS" ] && export MKFS=$(which mkfs.lustre) + export TUNEFS=${TUNEFS:-"$LUSTRE/utils/tunefs.lustre"} + [ ! -f "$TUNEFS" ] && export TUNEFS=$(which tunefs.lustre) export CHECKSTAT="${CHECKSTAT:-checkstat} " + export FSYTPE=${FSTYPE:-"ldiskfs"} + export NAME=${NAME:-local} + export LPROC=/proc/fs/lustre + export LGSSD=${LGSSD:-"$LUSTRE/utils/gss/lgssd"} + export LSVCGSSD=${LSVCGSSD:-"$LUSTRE/utils/gss/lsvcgssd"} + export KRB5DIR=${KRB5DIR:-"/usr/kerberos"} + + if [ "$ACCEPTOR_PORT" ]; then + export PORT_OPT="--port $ACCEPTOR_PORT" + fi + + case "x$SEC" in + xkrb5*) + echo "Using GSS/krb5 ptlrpc security flavor" + export USING_KRB5="y" + ;; + esac # Paths on remote nodes, if different export RLUSTRE=${RLUSTRE:-$LUSTRE} @@ -45,192 +72,582 @@ init_test_env() { # command line while getopts "rvf:" opt $*; do - case $opt in - f) CONFIG=$OPTARG;; - r) REFORMAT=--reformat;; - v) VERBOSE=true;; - \?) usage;; - esac + case $opt in + f) CONFIG=$OPTARG;; + r) REFORMAT=--reformat;; + v) VERBOSE=true;; + \?) usage;; + esac done - - # save the name of the config file for the upcall - echo "XMLCONFIG=$LUSTRE/tests/$XMLCONFIG" > $LUSTRE/tests/XMLCONFIG + + shift $((OPTIND - 1)) + ONLY=${ONLY:-$*} +} + +load_module() { + EXT=".ko" + module=$1 + shift + BASE=`basename $module $EXT` + lsmod | grep -q ${BASE} || \ + if [ -f ${LUSTRE}/${module}${EXT} ]; then + insmod ${LUSTRE}/${module}${EXT} $@ + else + # must be testing a "make install" or "rpm" installation + # note failed to load ptlrpc_gss is considered not fatal + if [ "$BASE" == "ptlrpc_gss" ]; then + modprobe $BASE $@ || echo "gss/krb5 is not supported" + else + modprobe $BASE $@ + fi + fi +} + +load_modules() { + if [ -n "$MODPROBE" ]; then + # use modprobe + return 0 + fi + if [ "$HAVE_MODULES" = true ]; then + # we already loaded + return 0 + fi + HAVE_MODULES=true + + echo Loading modules from $LUSTRE + load_module ../lnet/libcfs/libcfs + [ -z "$LNETOPTS" ] && \ + LNETOPTS=$(awk '/^options lnet/ { print $0}' /etc/modprobe.conf | sed 's/^options lnet //g') + echo "lnet options: '$LNETOPTS'" + # note that insmod will ignore anything in modprobe.conf + load_module ../lnet/lnet/lnet $LNETOPTS + LNETLND=${LNETLND:-"socklnd/ksocklnd"} + load_module ../lnet/klnds/$LNETLND + [ "$FSTYPE" = "ldiskfs" ] && load_module ../ldiskfs/ldiskfs/ldiskfs + load_module lvfs/lvfs + load_module obdclass/obdclass + load_module lvfs/fsfilt_$FSTYPE + load_module ptlrpc/ptlrpc + load_module ptlrpc/gss/ptlrpc_gss + # Now, some modules depend on lquota without USE_QUOTA check, + # will fix later. Disable check "$USE_QUOTA" = "yes" temporary. + #[ "$USE_QUOTA" = "yes" ] && load_module quota/lquota + load_module quota/lquota + load_module fid/fid + load_module fld/fld + load_module lmv/lmv + load_module mdc/mdc + load_module osc/osc + load_module lov/lov + load_module mds/mds + load_module mdd/mdd + load_module mdt/mdt + load_module cmm/cmm + load_module osd/osd + load_module ost/ost + load_module obdfilter/obdfilter + load_module llite/lustre + load_module mgc/mgc + load_module mgs/mgs + rm -f $TMP/ogdb-`hostname` + $LCTL modules > $TMP/ogdb-`hostname` + # 'mount' doesn't look in $PATH, just sbin + [ -f $LUSTRE/utils/mount.lustre ] && cp $LUSTRE/utils/mount.lustre /sbin/. || true +} + +RMMOD=rmmod +if [ `uname -r | cut -c 3` -eq 4 ]; then + RMMOD="modprobe -r" +fi + +wait_for_lnet() { + local UNLOADED=0 + local WAIT=0 + local MAX=60 + MODULES=$($LCTL modules | awk '{ print $2 }') + while [ -n "$MODULES" ]; do + sleep 5 + $RMMOD $MODULES > /dev/null 2>&1 || true + MODULES=$($LCTL modules | awk '{ print $2 }') + if [ -z "$MODULES" ]; then + return 0 + else + WAIT=$((WAIT + 5)) + echo "waiting, $((MAX - WAIT)) secs left" + fi + if [ $WAIT -eq $MAX ]; then + echo "LNET modules $MODULES will not unload" + lsmod + return 3 + fi + done +} + +unload_modules() { + lsmod | grep lnet > /dev/null && $LCTL dl && $LCTL dk $TMP/debug + local MODULES=$($LCTL modules | awk '{ print $2 }') + $RMMOD $MODULES > /dev/null 2>&1 || true + # do it again, in case we tried to unload ksocklnd too early + MODULES=$($LCTL modules | awk '{ print $2 }') + [ -n "$MODULES" ] && $RMMOD $MODULES > /dev/null 2>&1 || true + MODULES=$($LCTL modules | awk '{ print $2 }') + if [ -n "$MODULES" ]; then + echo "Modules still loaded: " + echo $MODULES + if [ -e $LPROC ]; then + echo "Lustre still loaded" + cat $LPROC/devices || true + lsmod + return 2 + else + echo "Lustre stopped but LNET is still loaded, waiting..." + wait_for_lnet || return 3 + fi + fi + HAVE_MODULES=false + + LEAK_LUSTRE=$(dmesg | tail -n 30 | grep "obd mem.*leaked" || true) + LEAK_PORTALS=$(dmesg | tail -n 20 | grep "Portals memory leaked" || true) + if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then + echo "$LEAK_LUSTRE" 1>&2 + echo "$LEAK_PORTALS" 1>&2 + mv $TMP/debug $TMP/debug-leak.`date +%s` || true + echo "Memory leaks detected" + return 254 + fi + echo "modules unloaded." + return 0 +} + +check_gss_daemon_facet() { + facet=$1 + dname=$2 + + num=`do_facet $facet ps -o cmd -C $dname | grep $dname | wc -l` + if [ $num -ne 1 ]; then + echo "$num instance of $dname on $facet" + return 1 + fi + return 0 +} + +send_sigint() { + local facet=$1 + shift + do_facet $facet "killall -2 $@ 2>/dev/null || true" +} + +start_gss_daemons() { + # starting on MDT + for num in `seq $MDSCOUNT`; do + do_facet mds$num "$LSVCGSSD -v" + do_facet mds$num "$LGSSD -v" + done + # starting on OSTs + for num in `seq $OSTCOUNT`; do + do_facet ost$num "$LSVCGSSD -v" + done + # starting on client + # FIXME: is "client" the right facet name? + do_facet client "$LGSSD -v" + + # wait daemons entering "stable" status + sleep 5 + + # + # check daemons are running + # + for num in `seq $MDSCOUNT`; do + check_gss_daemon_facet mds$num lsvcgssd + check_gss_daemon_facet mds$num lgssd + done + for num in `seq $OSTCOUNT`; do + check_gss_daemon_facet ost$num lsvcgssd + done + check_gss_daemon_facet client lgssd +} + +stop_gss_daemons() { + for num in `seq $MDSCOUNT`; do + send_sigint mds$num lsvcgssd lgssd + done + for num in `seq $OSTCOUNT`; do + send_sigint ost$num lsvcgssd + done + send_sigint client lgssd +} + +init_krb5_env() { + if [ ! -z $SEC ]; then + MDS_MOUNT_OPTS=$MDS_MOUNT_OPTS,sec=$SEC + OST_MOUNT_OPTS=$OST_MOUNT_OPTS,sec=$SEC + fi + + if [ ! -z $USING_KRB5 ]; then + start_gss_daemons + fi +} + +cleanup_krb5_env() { + if [ ! -z $USING_KRB5 ]; then + stop_gss_daemons + # maybe cleanup credential cache? + fi +} + +mdsdevlabel() { + local num=$1 + local device=`mdsdevname $num` + local label=`do_facet mds$num "e2label ${device}" | grep -v "CMD: "` + echo -n $label +} + +ostdevlabel() { + local num=$1 + local device=`ostdevname $num` + local label=`do_facet ost$num "e2label ${device}" | grep -v "CMD: "` + echo -n $label } # Facet functions +# start facet device options start() { facet=$1 shift - active=`facet_active $facet` - do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \ - --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ - $@ $XMLCONFIG + device=$1 + shift + echo "Starting ${facet}: $@ ${device} ${MOUNT%/*}/${facet}" + do_facet ${facet} mkdir -p ${MOUNT%/*}/${facet} + do_facet ${facet} mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} + RC=${PIPESTATUS[0]} + if [ $RC -ne 0 ]; then + echo mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} + echo Start of ${device} on ${facet} failed ${RC} + else + do_facet ${facet} sync + label=$(do_facet ${facet} "e2label ${device}") + [ -z "$label" ] && echo no label for ${device} && exit 1 + eval export ${facet}_svc=${label} + eval export ${facet}_dev=${device} + eval export ${facet}_opt=\"$@\" + echo Started ${label} + fi + return $RC } stop() { + local running facet=$1 - active=`facet_active $facet` shift - do_facet $facet $LCONF --select ${facet}_svc=${active}_facet \ - --node ${active}_facet --ptldebug $PTLDEBUG --subsystem $SUBSYSTEM \ - $@ --cleanup $XMLCONFIG + HOST=`facet_active_host $facet` + [ -z $HOST ] && echo stop: no host for $facet && return 0 + + running=$(do_facet ${facet} "grep -c ${MOUNT%/*}/${facet}' ' /proc/mounts") || true + if [ ${running} -ne 0 ]; then + echo "Stopping ${MOUNT%/*}/${facet} (opts:$@)" + do_facet ${facet} umount -d $@ ${MOUNT%/*}/${facet} + fi + + # umount should block, but we should wait for unrelated obd's + # like the MGS or MGC to also stop. + local WAIT=0 + local INTERVAL=1 + # conf-sanity 31 takes a long time cleanup + while [ $WAIT -lt 300 ]; do + running=$(do_facet ${facet} "[ -e $LPROC ] && grep ST' ' $LPROC/devices") || true + if [ -z "${running}" ]; then + return 0 + fi + echo "waited $WAIT for${running}" + if [ $INTERVAL -lt 64 ]; then + INTERVAL=$((INTERVAL + INTERVAL)) + fi + sleep $INTERVAL + WAIT=$((WAIT + INTERVAL)) + done + echo "service didn't stop after $WAIT seconds. Still running:" + echo ${running} + exit 1 } zconf_mount() { - mnt=$1 - - [ -d $mnt ] || mkdir $mnt - - if [ -x /sbin/mount.lustre ] ; then - mount -t lustre -o nettype=$NETTYPE \ - `facet_host mds`:/mds_svc/client_facet $mnt - else - # this is so cheating - $LCONF --nosetup --node client_facet $XMLCONFIG - $LUSTRE/utils/llmount `facet_host mds`:/mds_svc/client_facet $mnt \ - -o nettype=$NETTYPE + local OPTIONS + local client=$1 + local mnt=$2 + # Only supply -o to mount if we have options + if [ -n "$MOUNTOPT" ]; then + OPTIONS="-o $MOUNTOPT" + fi + local device=$MGSNID:/$FSNAME + if [ -z "$mnt" -o -z "$FSNAME" ]; then + echo Bad zconf mount command: opt=$OPTIONS dev=$device mnt=$mnt + exit 1 fi + echo "Starting client: $OPTIONS $device $mnt" + do_node $client mkdir -p $mnt + do_node $client mount -t lustre $OPTIONS $device $mnt || return 1 + + do_node $client "sysctl -w lnet.debug=$PTLDEBUG; sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }" [ -d /r ] && $LCTL modules > /r/tmp/ogdb-`hostname` return 0 } zconf_umount() { - mnt=$1 - umount $mnt || : - $LCONF --cleanup --nosetup --node client_facet $XMLCONFIG || : + client=$1 + mnt=$2 + [ "$3" ] && force=-f + local running=$(do_node $client "grep -c $mnt' ' /proc/mounts") || true + if [ $running -ne 0 ]; then + echo "Stopping client $mnt (opts:$force)" + do_node $client umount $force $mnt + fi } shutdown_facet() { facet=$1 if [ "$FAILURE_MODE" = HARD ]; then - $POWER_DOWN `facet_active_host $facet` - sleep 2 + $POWER_DOWN `facet_active_host $facet` + sleep 2 elif [ "$FAILURE_MODE" = SOFT ]; then - stop $facet --force --failover --nomod + stop $facet fi } reboot_facet() { facet=$1 if [ "$FAILURE_MODE" = HARD ]; then - $POWER_UP `facet_active_host $facet` + $POWER_UP `facet_active_host $facet` + else + sleep 10 fi } +# verify that lustre actually cleaned up properly +cleanup_check() { + [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \ + error "LBUG/LASSERT detected" + BUSY=`dmesg | grep -i destruct || true` + if [ "$BUSY" ]; then + echo "$BUSY" 1>&2 + [ -e $TMP/debug ] && mv $TMP/debug $TMP/debug-busy.`date +%s` + exit 205 + fi + LEAK_LUSTRE=`dmesg | tail -n 30 | grep "obd mem.*leaked" || true` + LEAK_PORTALS=`dmesg | tail -n 20 | grep "Portals memory leaked" || true` + if [ "$LEAK_LUSTRE" -o "$LEAK_PORTALS" ]; then + echo "$0: $LEAK_LUSTRE" 1>&2 + echo "$0: $LEAK_PORTALS" 1>&2 + echo "$0: Memory leak(s) detected..." 1>&2 + mv $TMP/debug $TMP/debug-leak.`date +%s` + exit 204 + fi + + [ "`lctl dl 2> /dev/null | wc -l`" -gt 0 ] && lctl dl && \ + echo "$0: lustre didn't clean up..." 1>&2 && return 202 || true + + if [ "`/sbin/lsmod 2>&1 | egrep 'lnet|libcfs'`" ]; then + echo "$0: modules still loaded..." 1>&2 + /sbin/lsmod 1>&2 + return 203 + fi + return 0 +} + wait_for_host() { - HOST=$1 - check_network $HOST 900 - while ! do_node $HOST "$CHECKSTAT -t dir $LUSTRE"; do sleep 5; done + HOST=$1 + check_network "$HOST" 900 + while ! do_node $HOST "ls -d $LUSTRE " > /dev/null; do sleep 5; done } wait_for() { - facet=$1 - HOST=`facet_active_host $facet` - wait_for_host $HOST + facet=$1 + HOST=`facet_active_host $facet` + wait_for_host $HOST } client_df() { # not every config has many clients if [ ! -z "$CLIENTS" ]; then - $PDSH $CLIENTS "df $MOUNT" > /dev/null + $PDSH $CLIENTS "df $MOUNT" > /dev/null + fi +} + +client_reconnect() { + uname -n >> $MOUNT/recon + if [ ! -z "$CLIENTS" ]; then + $PDSH $CLIENTS "df $MOUNT; uname -n >> $MOUNT/recon" > /dev/null fi + echo Connected clients: + cat $MOUNT/recon + ls -l $MOUNT/recon > /dev/null + rm $MOUNT/recon } facet_failover() { facet=$1 - echo "Failing $facet node `facet_active_host $facet`" + echo "Failing $facet on node `facet_active_host $facet`" shutdown_facet $facet reboot_facet $facet client_df & DFPID=$! + echo "df pid is $DFPID" change_active $facet TO=`facet_active_host $facet` - echo "Failover MDS to $TO" + echo "Failover $facet to $TO" wait_for $facet - start $facet + local dev=${facet}_dev + local opt=${facet}_opt + start $facet ${!dev} ${!opt} || error "Restart of $facet failed" +} + +obd_name() { + local facet=$1 } replay_barrier() { local facet=$1 do_facet $facet sync df $MOUNT - do_facet $facet $LCTL --device %${facet}_svc readonly - do_facet $facet $LCTL --device %${facet}_svc notransno - do_facet $facet $LCTL mark "REPLAY BARRIER" - $LCTL mark "REPLAY BARRIER" + local svc=${facet}_svc + do_facet $facet $LCTL --device %${!svc} readonly + do_facet $facet $LCTL --device %${!svc} notransno + do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}" + $LCTL mark "local REPLAY BARRIER on ${!svc}" +} + +replay_barrier_nodf() { + local facet=$1 echo running=${running} + do_facet $facet sync + local svc=${facet}_svc + echo Replay barrier on ${!svc} + do_facet $facet $LCTL --device %${!svc} readonly + do_facet $facet $LCTL --device %${!svc} notransno + do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}" + $LCTL mark "local REPLAY BARRIER on ${!svc}" } mds_evict_client() { - UUID=`cat /proc/fs/lustre/mdc/*_MNT_*/uuid` - do_facet mds "echo $UUID > /proc/fs/lustre/mds/mds_svc/evict_client" + UUID=`cat /proc/fs/lustre/mdc/${mds1_svc}-mdc-*/uuid` + do_facet mds1 "echo $UUID > /proc/fs/lustre/mdt/${mds1_svc}/evict_client" +} + +ost_evict_client() { + UUID=`grep ${ost1_svc}-osc- $LPROC/devices | egrep -v 'MDT' | awk '{print $5}'` + do_facet ost1 "echo $UUID > /proc/fs/lustre/obdfilter/${ost1_svc}/evict_client" } fail() { - local facet=$1 - facet_failover $facet + facet_failover $* || error "failover: $?" df $MOUNT || error "post-failover df: $?" } fail_abort() { local facet=$1 - stop $facet --force --failover --nomod + stop $facet change_active $facet - start $facet - do_facet $facet lctl --device %${facet}_svc abort_recovery + local svc=${facet}_svc + local dev=${facet}_dev + local opt=${facet}_opt + start $facet ${!dev} ${!opt} + do_facet $facet lctl --device %${!svc} abort_recovery df $MOUNT || echo "first df failed: $?" + sleep 1 df $MOUNT || error "post-failover df: $?" } do_lmc() { - $LMC -m ${XMLCONFIG} $@ + echo There is no lmc. This is mountconf, baby. + exit 1 } -h2tcp() { - if [ "$1" = "client" ]; then echo \'*\'; else - echo $1 +h2gm () { + if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else + ID=`$PDSH $1 $GMNALNID -l | cut -d\ -f2` + echo $ID"@gm" + fi +} + +h2name_or_ip() { + if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else + echo $1"@$2" + fi +} + +h2ptl() { + if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else + ID=`xtprocadmin -n $1 2>/dev/null | egrep -v 'NID' | awk '{print $1}'` + if [ -z "$ID" ]; then + echo "Could not get a ptl id for $1..." + exit 1 + fi + echo $ID"@ptl" fi } +declare -fx h2ptl + +h2tcp() { + h2name_or_ip "$1" "tcp" +} declare -fx h2tcp h2elan() { - if [ "$1" = "client" ]; then echo \'*\'; else - echo $1 | sed 's/[^0-9]*//g' - fi + if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else + if type __h2elan >/dev/null 2>&1; then + ID=$(__h2elan $1) + else + ID=`echo $1 | sed 's/[^0-9]*//g'` + fi + echo $ID"@elan" + fi } declare -fx h2elan -facet_host() { - local facet=$1 - varname=${facet}_HOST - echo -n ${!varname} +h2openib() { + h2name_or_ip "$1" "openib" } +declare -fx h2openib -facet_nid() { - facet=$1 - HOST=`facet_host $facet` - if [ -z "$HOST" ]; then - echo "The env variable ${facet}_HOST must be set." - exit 1 - fi - echo `h2$NETTYPE $HOST` +h2o2ib() { + h2name_or_ip "$1" "o2ib" +} +declare -fx h2o2ib + +facet_host() { + local facet=$1 + varname=${facet}_HOST + if [ -z "${!varname}" ]; then + if [ "${facet:0:3}" == "ost" ]; then + eval ${facet}_HOST=${ost_HOST} + fi + fi + echo -n ${!varname} } facet_active() { local facet=$1 local activevar=${facet}active + + if [ -f ./${facet}active ] ; then + source ./${facet}active + fi + active=${!activevar} if [ -z "$active" ] ; then - echo -n ${facet} + echo -n ${facet} else - echo -n ${active} + echo -n ${active} fi } facet_active_host() { local facet=$1 local active=`facet_active $facet` - echo `facet_host $active` + if [ "$facet" == client ]; then + hostname + else + echo `facet_host $active` + fi } change_active() { @@ -252,74 +669,139 @@ change_active() { do_node() { HOST=$1 shift - + local myPDSH=$PDSH + if [ "$HOST" = "$(hostname)" ]; then + myPDSH="no_dsh" + fi if $VERBOSE; then - echo "CMD: $HOST $@" + echo "CMD: $HOST $@" >&2 + $myPDSH $HOST $LCTL mark "$@" > /dev/null 2>&1 || : fi - $PDSH $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests; cd $RPWD; sh -c \"$@\")" + $myPDSH $HOST "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; cd $RPWD; sh -c \"$@\")" | sed "s/^${HOST}: //" + return ${PIPESTATUS[0]} } + do_facet() { facet=$1 shift HOST=`facet_active_host $facet` + [ -z $HOST ] && echo No host defined for facet ${facet} && exit 1 do_node $HOST $@ } -add_facet() { +add() { local facet=$1 shift - echo "add facet $facet: `facet_host $facet`" - do_lmc --add node --node ${facet}_facet $@ --timeout $TIMEOUT - do_lmc --add net --node ${facet}_facet --nid `facet_nid $facet` \ - --nettype $NETTYPE + # make sure its not already running + stop ${facet} -f + rm -f ${facet}active + do_facet ${facet} $MKFS $* } -add_mds() { - facet=$1 - shift - rm -f ${facet}active - add_facet $facet --lustre_upcall $UPCALL --ptldebug $PTLDEBUG \ - --subsystem $SUBSYSTEM - do_lmc --add mds --node ${facet}_facet --mds ${facet}_svc $* +ostdevname() { + num=$1 + DEVNAME=OSTDEV$num + #if $OSTDEVn isn't defined, default is $OSTDEVBASE + num + eval DEVPTR=${!DEVNAME:=${OSTDEVBASE}${num}} + echo -n $DEVPTR } -add_mdsfailover() { - facet=$1 - shift - add_facet ${facet}failover --lustre_upcall $UPCALL - do_lmc --add mds --node ${facet}failover_facet --mds ${facet}_svc $* +mdsdevname() { + num=$1 + DEVNAME=MDSDEV$num + #if $MDSDEVn isn't defined, default is $MDSDEVBASE + num + eval DEVPTR=${!DEVNAME:=${MDSDEVBASE}${num}} + echo -n $DEVPTR } -add_ost() { - facet=$1 - shift - rm -f ${facet}active - add_facet $facet - do_lmc --add ost --node ${facet}_facet --ost ${facet}_svc $* +######## +## MountConf setup + +stopall() { + # make sure we are using the primary server, so test-framework will + # be able to clean up properly. + activemds=`facet_active mds1` + if [ $activemds != "mds1" ]; then + fail mds1 + fi + + # assume client mount is local + grep " $MOUNT " /proc/mounts && zconf_umount `hostname` $MOUNT $* + grep " $MOUNT2 " /proc/mounts && zconf_umount `hostname` $MOUNT2 $* + for num in `seq $MDSCOUNT`; do + stop mds$num -f + done + for num in `seq $OSTCOUNT`; do + stop ost$num -f + done + return 0 } -add_ostfailover() { - facet=$1 - shift - add_facet ${facet}failover - do_lmc --add ost --failover --node ${facet}failover_facet --ost ${facet}_svc $* +cleanupall() { + stopall $* + unload_modules + cleanup_krb5_env } -add_lov() { - lov=$1 - mds_facet=$2 - shift; shift - do_lmc --add lov --mds ${mds_facet}_svc --lov $lov $* - +mdsmkfsopts() +{ + local nr=$1 + test $nr = 1 && echo -n $MDS_MKFS_OPTS || echo -n $MDSn_MKFS_OPTS } -add_client() { - facet=$1 - mds=$2 - shift; shift - add_facet $facet --lustre_upcall $UPCALL - do_lmc --add mtpt --node ${facet}_facet --mds ${mds}_svc $* +formatall() { + [ "$FSTYPE" ] && FSTYPE_OPT="--backfstype $FSTYPE" + + stopall + # We need ldiskfs here, may as well load them all + load_modules + [ "$CLIENTONLY" ] && return + echo "Formatting mdts, osts" + for num in `seq $MDSCOUNT`; do + echo "Format mds$num: $(mdsdevname $num)" + if $VERBOSE; then + add mds$num `mdsmkfsopts $num` $FSTYPE_OPT --reformat `mdsdevname $num` || exit 9 + else + add mds$num `mdsmkfsopts $num` $FSTYPE_OPT --reformat `mdsdevname $num` > /dev/null || exit 9 + fi + done + for num in `seq $OSTCOUNT`; do + echo "Format ost$num: $(ostdevname $num)" + if $VERBOSE; then + add ost$num $OST_MKFS_OPTS --reformat `ostdevname $num` || exit 10 + else + add ost$num $OST_MKFS_OPTS --reformat `ostdevname $num` > /dev/null || exit 10 + fi + done +} + +mount_client() { + grep " $1 " /proc/mounts || zconf_mount `hostname` $* +} + +setupall() { + load_modules + init_krb5_env + if [ -z "$CLIENTONLY" ]; then + echo "Setup mdts, osts" + for num in `seq $MDSCOUNT`; do + DEVNAME=$(mdsdevname $num) + echo $REFORMAT | grep -q "reformat" \ + || do_facet mds$num "$TUNEFS --writeconf $DEVNAME" + start mds$num $DEVNAME $MDS_MOUNT_OPTS + done + for num in `seq $OSTCOUNT`; do + DEVNAME=$(ostdevname $num) + start ost$num $DEVNAME $OST_MOUNT_OPTS + done + fi + [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE + mount_client $MOUNT + if [ "$MOUNT_2" ]; then + mount_client $MOUNT2 + fi + sleep 5 } @@ -327,33 +809,33 @@ add_client() { # General functions check_network() { - local NETWORK=0 - local WAIT=0 - local MAX=$2 - while [ $NETWORK -eq 0 ]; do - ping -c 1 -w 3 $1 > /dev/null - if [ $? -eq 0 ]; then - NETWORK=1 - else - WAIT=$((WAIT + 5)) - echo "waiting for $1, $((MAX - WAIT)) secs left" - sleep 5 - fi - if [ $WAIT -gt $MAX ]; then - echo "Network not available" - exit 1 - fi - done + local NETWORK=0 + local WAIT=0 + local MAX=$2 + while [ $NETWORK -eq 0 ]; do + ping -c 1 -w 3 $1 > /dev/null + if [ $? -eq 0 ]; then + NETWORK=1 + else + WAIT=$((WAIT + 5)) + echo "waiting for $1, $((MAX - WAIT)) secs left" + sleep 5 + fi + if [ $WAIT -gt $MAX ]; then + echo "Network not available" + exit 1 + fi + done } check_port() { - while( !($DSH2 $1 "netstat -tna | grep -q $2") ) ; do - sleep 9 - done + while( !($DSH2 $1 "netstat -tna | grep -q $2") ) ; do + sleep 9 + done } no_dsh() { - shift - eval $@ + shift + eval $@ } comma_list() { @@ -363,7 +845,7 @@ comma_list() { } absolute_path() { - (cd `dirname $1`; echo $PWD/`basename $1`) + (cd `dirname $1`; echo $PWD/`basename $1`) } ################################## @@ -372,72 +854,117 @@ absolute_path() { drop_request() { # OBD_FAIL_MDS_ALL_REQUEST_NET RC=0 - do_facet mds "echo 0x123 > /proc/sys/lustre/fail_loc" + do_facet mds sysctl -w lustre.fail_loc=0x123 do_facet client "$1" || RC=$? - do_facet mds "echo 0 > /proc/sys/lustre/fail_loc" + do_facet mds sysctl -w lustre.fail_loc=0 return $RC } drop_reply() { # OBD_FAIL_MDS_ALL_REPLY_NET RC=0 - do_facet mds "echo 0x122 > /proc/sys/lustre/fail_loc" + do_facet mds sysctl -w lustre.fail_loc=0x122 + do_facet client "$@" || RC=$? + do_facet mds sysctl -w lustre.fail_loc=0 + return $RC +} + +drop_reint_reply() { +# OBD_FAIL_MDS_REINT_NET_REP + RC=0 + do_facet mds sysctl -w lustre.fail_loc=0x119 do_facet client "$@" || RC=$? - do_facet mds "echo 0 > /proc/sys/lustre/fail_loc" + do_facet mds sysctl -w lustre.fail_loc=0 return $RC } pause_bulk() { #define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 RC=0 - do_facet ost "echo 0x214 > /proc/sys/lustre/fail_loc" + do_facet ost1 sysctl -w lustre.fail_loc=0x214 do_facet client "$1" || RC=$? do_facet client "sync" - do_facet ost "echo 0 > /proc/sys/lustre/fail_loc" + do_facet ost1 sysctl -w lustre.fail_loc=0 return $RC } drop_ldlm_cancel() { #define OBD_FAIL_LDLM_CANCEL 0x304 RC=0 - do_facet client "echo 0x304 > /proc/sys/lustre/fail_loc" + do_facet client sysctl -w lustre.fail_loc=0x304 do_facet client "$@" || RC=$? - do_facet client "echo 0 > /proc/sys/lustre/fail_loc" + do_facet client sysctl -w lustre.fail_loc=0 return $RC } drop_bl_callback() { #define OBD_FAIL_LDLM_BL_CALLBACK 0x305 RC=0 - do_facet client "echo 0x305 > /proc/sys/lustre/fail_loc" + do_facet client sysctl -w lustre.fail_loc=0x305 + do_facet client "$@" || RC=$? + do_facet client sysctl -w lustre.fail_loc=0 + return $RC +} + +drop_ldlm_reply() { +#define OBD_FAIL_LDLM_REPLY 0x30c + RC=0 + do_facet mds sysctl -w lustre.fail_loc=0x30c do_facet client "$@" || RC=$? - do_facet client "echo 0 > /proc/sys/lustre/fail_loc" + do_facet mds sysctl -w lustre.fail_loc=0 return $RC } +clear_failloc() { + facet=$1 + pause=$2 + sleep $pause + echo "clearing fail_loc on $facet" + do_facet $facet "sysctl -w lustre.fail_loc=0" +} + cancel_lru_locks() { - for d in /proc/fs/lustre/ldlm/namespaces/$1*; do - if [ -f $d/lru_size ]; then - echo clear > $d/lru_size - grep [0-9] $d/lock_unused_count - fi - done + $LCTL mark "cancel_lru_locks $1 start" + for d in `find $LPROC/ldlm/namespaces | egrep -i $1`; do + [ -f $d/lru_size ] && echo clear > $d/lru_size + [ -f $d/lock_unused_count ] && grep [1-9] $d/lock_unused_count /dev/null + done + $LCTL mark "cancel_lru_locks $1 stop" +} + + +pgcache_empty() { + for a in /proc/fs/lustre/llite/*/dump_page_cache; do + if [ `wc -l $a | awk '{print $1}'` -gt 1 ]; then + echo there is still data in page cache $a ? + cat $a; + return 1; + fi + done + return 0 } ################################## # Test interface error() { - echo "${TESTSUITE}: **** FAIL:" $@ + sysctl -w lustre.fail_loc=0 2> /dev/null || true + log "${TESTSUITE}: **** FAIL:" $@ + $LCTL dk $TMP/lustre-log-$TESTNAME.log + log "FAIL: $TESTNAME $@" + $LCTL dk $TMP/lustrefail_${TESTSUITE}_${TESTNAME}.$(date +%s) exit 1 } build_test_filter() { - for O in $ONLY; do - eval ONLY_${O}=true - done - for E in $EXCEPT $ALWAYS_EXCEPT; do - eval EXCEPT_${E}=true - done + [ "$ONLY" ] && log "only running test `echo $ONLY`" + for O in $ONLY; do + eval ONLY_${O}=true + done + [ "$EXCEPT$ALWAYS_EXCEPT" ] && \ + log "skipping tests: `echo $EXCEPT $ALWAYS_EXCEPT`" + for E in $EXCEPT $ALWAYS_EXCEPT; do + eval EXCEPT_${E}=true + done } _basetest() { @@ -449,53 +976,95 @@ basetest() { } run_test() { - export base=`basetest $1` - if [ ! -z "$ONLY" ]; then - testname=ONLY_$1 - if [ ${!testname}x != x ]; then - run_one $1 "$2" - return $? - fi - testname=ONLY_$base - if [ ${!testname}x != x ]; then - run_one $1 "$2" - return $? - fi - echo -n "." - return 0 - fi - testname=EXCEPT_$1 + export base=`basetest $1` + if [ ! -z "$ONLY" ]; then + testname=ONLY_$1 if [ ${!testname}x != x ]; then - echo "skipping excluded test $1" - return 0 + run_one $1 "$2" + return $? fi - testname=EXCEPT_$base + testname=ONLY_$base if [ ${!testname}x != x ]; then - echo "skipping excluded test $1 (base $base)" - return 0 + run_one $1 "$2" + return $? fi - run_one $1 "$2" - - return $? + echo -n "." + return 0 + fi + testname=EXCEPT_$1 + if [ ${!testname}x != x ]; then + log "skipping excluded test $1" + return 0 + fi + testname=EXCEPT_$base + if [ ${!testname}x != x ]; then + log "skipping excluded test $1 (base $base)" + return 0 + fi + run_one $1 "$2" + + return $? } EQUALS="======================================================================" equals_msg() { - msg="$@" + msg="$@" + + local suffixlen=$((${#EQUALS} - ${#msg})) + [ $suffixlen -lt 5 ] && suffixlen=5 + printf '===== %s %.*s\n' "$msg" $suffixlen $EQUALS +} + +log() { + echo "$*" + lsmod | grep lnet > /dev/null || load_modules + $LCTL mark "$*" 2> /dev/null || true +} + +pass() { + echo PASS $@ +} - local suffixlen=$((${#EQUALS} - ${#msg})) - [ $suffixlen -lt 5 ] && suffixlen=5 - printf '===== %s %.*s\n' "$msg" $suffixlen $EQUALS +check_mds() { + FFREE=`cat /proc/fs/lustre/mds/*/filesfree` + FTOTAL=`cat /proc/fs/lustre/mds/*/filestotal` + [ $FFREE -ge $FTOTAL ] && error "files free $FFREE > total $FTOTAL" || true } run_one() { testnum=$1 message=$2 - tfile=f$base - tdir=d$base + tfile=f${testnum} + tdir=d${base} # Pretty tests run faster. equals_msg $testnum: $message + BEFORE=`date +%s` + log "== test $testnum: $message ============ `date +%H:%M:%S` ($BEFORE)" + #check_mds + export TESTNAME=test_$testnum test_${testnum} || error "test_$testnum failed with $?" + #check_mds + [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \ + error "LBUG/LASSERT detected" + pass "($((`date +%s` - $BEFORE))s)" + unset TESTNAME +} + +canonical_path() { + (cd `dirname $1`; echo $PWD/`basename $1`) +} + +######################## +# helper functions + +osc_to_ost() +{ + osc=$1 + ost=`echo $1 | awk -F_ '{print $3}'` + if [ -z $ost ]; then + ost=`echo $1 | sed 's/-osc.*//'` + fi + echo $ost }