#!/bin/bash
-# Simple function used by run_*.sh scripts
+# functions used by other scripts
assert_env() {
local failed=""
print_opts IOR ior_THREADS ior_DURATION MACHINEFILE
- test_mkdir -p $testdir
+ client_load_mkdir $testdir
# mpi_run uses mpiuser
chmod 0777 $testdir
rm -rf $testdir
}
+
+# Create the working directory for a client load.
+#
+# Arguments: $1 - directory to create
+# Globals:   LFS, RECOVERY_SCALE_ENABLE_REMOTE_DIRS,
+#            RECOVERY_SCALE_ENABLE_STRIPED_DIRS,
+#            client_load_SETSTRIPEPARAMS (all read only)
+# Returns:   0 on success, 1 as soon as any step fails
+#
+# If "$LFS df" lists at most one MDT for the parent directory, or is_lustre
+# fails for the parent, a plain mkdir is done and nothing else.  Otherwise
+# the directory is created with "$LFS mkdir" on a randomly chosen MDT index
+# (plus a random stripe count when striped dirs are enabled), or with a
+# plain mkdir when both the remote and striped switches are false.  The
+# resulting directory layout and file layout are then printed with
+# getdirstripe/getstripe, after applying $client_load_SETSTRIPEPARAMS when
+# that variable is non-empty.
+client_load_mkdir () {
+	local dir=$1
+	local parent=$(dirname "$dir")
+	# Keep the scratch values local so they neither leak into the caller
+	# nor carry a stale value over from a previous invocation.
+	local mdt_idx
+	local stripe_count_opt=""
+
+	local mdtcount=$($LFS df "$parent" 2> /dev/null | grep -c MDT)
+	if [ "$mdtcount" -le 1 ] || ! is_lustre "$parent"; then
+		mkdir "$dir" || return 1
+		return 0
+	fi
+
+	mdt_idx=$((RANDOM % mdtcount))
+	# The switches hold the command names "true"/"false" and are executed.
+	if $RECOVERY_SCALE_ENABLE_STRIPED_DIRS; then
+		# stripe_count in range [1,mdtcount]
+		# $LFS mkdir treats stripe_count 0 and 1 the same
+		stripe_count_opt="-c$((RANDOM % mdtcount + 1))"
+	fi
+
+	if $RECOVERY_SCALE_ENABLE_REMOTE_DIRS ||
+	   $RECOVERY_SCALE_ENABLE_STRIPED_DIRS; then
+		# $stripe_count_opt is deliberately unquoted: when empty it
+		# must expand to no argument at all.
+		$LFS mkdir -i$mdt_idx $stripe_count_opt "$dir" ||
+			return 1
+	else
+		mkdir "$dir" || return 1
+	fi
+	$LFS getdirstripe "$dir" || return 1
+
+	if [ -n "$client_load_SETSTRIPEPARAMS" ]; then
+		# Unquoted on purpose: the variable carries a list of options.
+		$LFS setstripe $client_load_SETSTRIPEPARAMS "$dir" ||
+			return 1
+	fi
+	$LFS getstripe "$dir" || return 1
+}
+
+# Check a log file for an out-of-space failure.
+#
+# Arguments: $1 - path of the log file to scan
+# Returns:   0 if the file contains at least one "No space left on device"
+#            line that does not also contain the word "grep"; non-zero
+#            otherwise (including when the file cannot be read).
+enospc_detected () {
+	# The path is quoted so that an empty or unset argument is passed to
+	# grep as a (nonexistent) file name; unquoted it would disappear and
+	# grep would block reading stdin.
+	# The second grep drops matching lines that mention "grep" itself --
+	# presumably traced grep commands echoed into the log; TODO confirm.
+	grep "No space left on device" "$1" | grep -qv grep
+}
init_changelog
END_RUN_FILE=${DIR}/$tdir/run LOAD_PID_FILE=${DIR}/$tdir/pid \
- MOUNT=${DIR}/$tdir run_iozone.sh &
+ MOUNT=${DIR}/$tdir RECOVERY_SCALE_ENABLE_REMOTE_DIRS=false \
+ RECOVERY_SCALE_ENABLE_STRIPED_DIRS=false run_iozone.sh &
sleep 30
child_pid=$(pgrep iozone)
stop_procs $child_pid
init_test_env "$@"
init_logging
+init_stripe_dir_params RECOVERY_SCALE_ENABLE_REMOTE_DIRS \
+ RECOVERY_SCALE_ENABLE_STRIPED_DIRS
+
ALWAYS_EXCEPT="$PARALLEL_SCALE_EXCEPT "
# bug number for skipped test: LU-9429
ALWAYS_EXCEPT+=" parallel_grouplock "
echo Victim facets "${victims[@]}"
fi
+init_stripe_dir_params RACER_ENABLE_REMOTE_DIRS \
+ RACER_ENABLE_STRIPED_DIRS
+
if ((MDSCOUNT > 1)); then
- (( $MDS1_VERSION >= $(version_code 2.5.0) )) &&
- RACER_ENABLE_REMOTE_DIRS=${RACER_ENABLE_REMOTE_DIRS:-true}
- (( $MDS1_VERSION >= $(version_code 2.8.0) )) &&
- RACER_ENABLE_STRIPED_DIRS=${RACER_ENABLE_STRIPED_DIRS:-true}
(( $MDS1_VERSION >= $(version_code 2.13.57) )) &&
RACER_ENABLE_MIGRATION=${RACER_ENABLE_MIGRATION:-true}
(( $MDS1_VERSION >= $(version_code 2.15.55.45) )) &&
RACER_ENABLE_FALLOCATE=false
check_set_fallocate || RACER_ENABLE_FALLOCATE=false
-RACER_ENABLE_REMOTE_DIRS=${RACER_ENABLE_REMOTE_DIRS:-false}
-RACER_ENABLE_STRIPED_DIRS=${RACER_ENABLE_STRIPED_DIRS:-false}
RACER_ENABLE_MIGRATION=${RACER_ENABLE_MIGRATION:-false}
RACER_ENABLE_SNAPSHOT=${RACER_ENABLE_SNAPSHOT:-true}
RACER_ENABLE_FILE_MIGRATE=${RACER_ENABLE_FILE_MIGRATE:-true}
END_RUN_FILE=${END_RUN_FILE:-$SHARED_DIRECTORY/end_run_file}
LOAD_PID_FILE=${LOAD_PID_FILE:-$TMP/client-load.pid}
+init_stripe_dir_params RECOVERY_SCALE_ENABLE_REMOTE_DIRS \
+ RECOVERY_SCALE_ENABLE_STRIPED_DIRS
+
reboot_recover_node () {
# item var contains a pair of clients if nodetype=clients
# I would prefer to have a list here
# -- remove hostname from clients list
zconf_umount $HOSTNAME $MOUNT
NODES_TO_USE=${NODES_TO_USE:-$CLIENTS}
+log "Using NODES_TO_USE: $NODES_TO_USE and CLIENTS: $CLIENTS"
NODES_TO_USE=$(exclude_items_from_list $NODES_TO_USE $HOSTNAME)
+log "Using remote NODES_TO_USE: $NODES_TO_USE HOSTNAME=$HOSTNAME"
check_progs_installed $NODES_TO_USE "${CLIENT_LOADS[@]}"
ERRORS_OK="" # No application failures should occur during this test.
+init_stripe_dir_params RECOVERY_SCALE_ENABLE_REMOTE_DIRS \
+ RECOVERY_SCALE_ENABLE_STRIPED_DIRS
+
check_and_setup_lustre
rm -rf $DIR/[Rdfs][0-9]*
# but not for other clients.
ERRORS_OK="yes"
+init_stripe_dir_params RECOVERY_SCALE_ENABLE_REMOTE_DIRS \
+ RECOVERY_SCALE_ENABLE_STRIPED_DIRS
+
numfailovers () {
local facet
local var
if [ -e $END_RUN_FILE ]; then
local end_run_node
read end_run_node < $END_RUN_FILE
- [[ $end_run_node = $fail_client ]] &&
- rm -f $END_RUN_FILE || exit 13
+ if [[ $end_run_node = $fail_client ]]; then
+ rm -f $END_RUN_FILE
+ else
+ echo "failure is expected on FAIL CLIENT \
+ $fail_client, not on $end_run_node"
+ exit 13
+ fi
fi
restart_client_loads $fail_client $ERRORS_OK || exit $?
LOG=$TESTLOG_PREFIX.$(basename $0 .sh)_stdout.$(hostname -s).log
DEBUGLOG=$(echo $LOG | sed 's/\(.*\)stdout/\1debug/')
+# Print all arguments on stdout and terminate the script with exit status 17.
+error () {
+ echo "$@"
+ exit 17
+}
+
mkdir -p ${LOG%/*}
rm -f $LOG $DEBUGLOG
TESTDIR=${TESTDIR:-$MOUNT/d0.ior-$(hostname)}
-CONTINUE=true
-while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+while [ ! -e "$END_RUN_FILE" ]; do
echoerr "$(date +'%F %H:%M:%S'): IOR run starting"
- mkdir -p $TESTDIR
+ rm -rf $TESTDIR
+ client_load_mkdir $TESTDIR
+ if [ $? -ne 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): failed to create $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
+ break
+ fi
+
# need this only if TESTDIR is not default
chmod -R 777 $TESTDIR
sync
- run_ior fpp $TESTDIR 1>$LOG &
+ run_ior fpp $TESTDIR 1>>$LOG &
load_pid=$!
wait $load_pid
if [ ${PIPESTATUS[0]} -eq 0 ]; then
echoerr "$(date +'%F %H:%M:%S'): IOR succeeded"
cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): IOR run finished"
else
+ enospc_detected $DEBUGLOG &&
+ echoerr "$(date +'%F %H:%M:%S'): IOR ENOSPC, ignored" &&
+ continue
+
echoerr "$(date +'%F %H:%M:%S'): IOR failed"
if [ -z "$ERRORS_OK" ]; then
echo $(hostname) >> $END_RUN_FILE
fi
- if [ $BREAK_ON_ERROR ]; then
- # break
- CONTINUE=false
- fi
fi
done
TESTDIR=$MOUNT/d0.dbench-$(hostname)
-CONTINUE=true
-
-while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+while [ ! -e "$END_RUN_FILE" ]; do
echoerr "$(date +'%F %H:%M:%S'): dbench run starting"
- mkdir -p $TESTDIR
+ rm -rf $TESTDIR
+ client_load_mkdir $TESTDIR
+ if [ $? -ne 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): failed to create $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
+ break
+ fi
sync
rundbench -D $TESTDIR 2 1>$LOG &
if [ ${PIPESTATUS[0]} -eq 0 ]; then
echoerr "$(date +'%F %H:%M:%S'): dbench succeeded"
cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): dbench run finished"
else
+ enospc_detected $DEBUGLOG &&
+ echoerr "$(date +'%F %H:%M:%S'):"\
+ "dbench ENOSPC, ignored" &&
+ continue
+
echoerr "$(date +'%F %H:%M:%S'): dbench failed"
if [ -z "$ERRORS_OK" ]; then
echo $(hostname) >> $END_RUN_FILE
fi
-
- if [ $BREAK_ON_ERROR ]; then
- # break
- CONTINUE=false
- fi
fi
done
TESTDIR=$MOUNT/d0.dd-$(hostname)
-CONTINUE=true
-while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+while [ ! -e "$END_RUN_FILE" ]; do
echoerr "$(date +'%F %H:%M:%S'): dd run starting"
- mkdir -p $TESTDIR
- $LFS setstripe -c -1 $TESTDIR
+ rm -rf $TESTDIR
+ client_load_mkdir $TESTDIR
+ if [ $? -ne 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): failed to create $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
+ break
+ fi
cd $TESTDIR
sync
# suppress dd xfer stat to workaround buggy coreutils/gettext
# combination in RHEL5 and OEL5, see BZ 21264
FREE_SPACE=$(df -P $TESTDIR | awk '/:/ { print $4 }')
- BLKS=$((FREE_SPACE * 9 / 40 / CLIENT_COUNT))
+ BLKS=$((FREE_SPACE / 4 / CLIENT_COUNT))
echoerr "Total free disk space is $FREE_SPACE, 4k blocks to dd is $BLKS"
df $TESTDIR || true
if [ $? -eq 0 ]; then
echoerr "$(date +'%F %H:%M:%S'): dd succeeded"
cd $TMP
- rm -rf $TESTDIR
- echoerr "$(date +'%F %H:%M:%S'): dd run finished"
else
+ enospc_detected $DEBUGLOG &&
+ echoerr "$(date +'%F %H:%M:%S'): dd ENOSPC, ignored" &&
+ continue
echoerr "$(date +'%F %H:%M:%S'): dd failed"
if [ -z "$ERRORS_OK" ]; then
echo $(hostname) >> $END_RUN_FILE
fi
- if [ $BREAK_ON_ERROR ]; then
- # break
- CONTINUE=false
- fi
fi
done
TESTDIR=$MOUNT/d0.iozone-$(hostname)
-CONTINUE=true
-while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+while [ ! -e "$END_RUN_FILE" ]; do
echoerr "$(date +'%F %H:%M:%S'): iozone run starting"
- mkdir -p $TESTDIR
+ rm -rf $TESTDIR
+ client_load_mkdir $TESTDIR
+ if [ $? -ne 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): failed to create $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
+ break
+ fi
cd $TESTDIR
sync
if [ ${PIPESTATUS[0]} -eq 0 ]; then
echoerr "$(date +'%F %H:%M:%S'): iozone succeeded"
cd $TMP
- rm -rf $TESTDIR
- if [ -d $TESTDIR ]; then
- echoerr "$(date +'%F %H:%M:%S'): failed to remove \
- $TESTDIR"
- echo $(hostname) >> $END_RUN_FILE
- CONTINUE=false
- fi
- echoerr "$(date +'%F %H:%M:%S'): iozone run finished"
else
+ enospc_detected $DEBUGLOG &&
+ echoerr "$(date +'%F %H:%M:%S'):"\
+ "iozone ENOSPC, ignored" &&
+ continue
+
echoerr "$(date +'%F %H:%M:%S'): iozone failed"
if [ -z "$ERRORS_OK" ]; then
echo $(hostname) >> $END_RUN_FILE
fi
- if [ $BREAK_ON_ERROR ]; then
- # break
- CONTINUE=false
- fi
fi
done
return ${PIPESTATUS[1]}
}
-CONTINUE=true
-while [ ! -e "$END_RUN_FILE" ] && $CONTINUE; do
+while [ ! -e "$END_RUN_FILE" ]; do
echoerr "$(date +'%F %H:%M:%S'): tar run starting"
- mkdir -p $TESTDIR
+ rm -rf $TESTDIR
+ client_load_mkdir $TESTDIR
+ if [ $? -ne 0 ]; then
+ echoerr "$(date +'%F %H:%M:%S'): failed to create $TESTDIR"
+ echo $(hostname) >> $END_RUN_FILE
+ break
+ fi
cd $TESTDIR
sync
- USAGE=$(du -s /etc | awk '{print $1}')
$LCTL set_param llite.*.lazystatfs=0
df $TESTDIR || true
sleep 2
- FREE_SPACE=$(df $TESTDIR | awk '/:/ { print $4 }')
- AVAIL=$((FREE_SPACE * 9 / 10 / CLIENT_COUNT))
- if [ $AVAIL -lt $USAGE ]; then
- echoerr "no enough free disk space: need $USAGE, avail $AVAIL"
- echo $(hostname) >> $END_RUN_FILE
- break
- fi
do_tar & wait $!
RC=$?
if [ $RC -eq 0 ]; then
echoerr "$(date +'%F %H:%M:%S'): tar succeeded"
cd $TMP
- rm -rf $TESTDIR & wait $!
- echoerr "$(date +'%F %H:%M:%S'): tar run finished"
else
+ enospc_detected $DEBUGLOG &&
+ echoerr "$(date +'%F %H:%M:%S'): tar ENOSPC, ignored" &&
+ continue
+
echoerr "$(date +'%F %H:%M:%S'): tar failed"
if [ -z "$ERRORS_OK" ]; then
echo $(hostname) >> $END_RUN_FILE
fi
- if [ $BREAK_ON_ERROR ]; then
- # break
- CONTINUE=false
- fi
fi
done
eval export ${var}=$load
do_node $client "PATH=$PATH MOUNT=$MOUNT ERRORS_OK=$ERRORS_OK \
- BREAK_ON_ERROR=$BREAK_ON_ERROR \
END_RUN_FILE=$END_RUN_FILE \
LOAD_PID_FILE=$LOAD_PID_FILE \
TESTLOG_PREFIX=$TESTLOG_PREFIX \
DBENCH_LIB=$DBENCH_LIB \
DBENCH_SRC=$DBENCH_SRC \
CLIENT_COUNT=$((CLIENTCOUNT - 1)) \
+ RECOVERY_SCALE_ENABLE_REMOTE_DIRS=$RECOVERY_SCALE_ENABLE_REMOTE_DIRS \
+ RECOVERY_SCALE_ENABLE_STRIPED_DIRS=$RECOVERY_SCALE_ENABLE_STRIPED_DIRS \
LFS=$LFS \
LCTL=$LCTL \
FSNAME=$FSNAME \
+ MPI_USER=$MPI_USER \
MPIRUN=$MPIRUN \
MPIRUN_OPTIONS=\\\"$MPIRUN_OPTIONS\\\" \
MACHINEFILE_OPTION=\\\"$MACHINEFILE_OPTION\\\" \
for host in ${hostlist//,/ }; do
check_network "$host" 900
done
- while ! do_nodes $hostlist hostname > /dev/null; do sleep 5; done
+ while ! do_nodes $hostlist hostname; do sleep 5; done
}
wait_for_facet() {
local host=$(facet_active_host $facet)
hostlist=$(expand_list $hostlist $host)
+ local fhost=$(facet_host $facet)
+ local ffhost=$(facet_failover_host $facet)
+ echo "facet: $facet facet_host: $fhost facet_failover_host: $ffhost"
if [ $(facet_host $facet) = \
$(facet_failover_host $facet) ]; then
waithostlist=$(expand_list $waithostlist $host)
for host in ${hostlist//,/ }; do
reboot_node $host
done
- echo "$(date +'%H:%M:%S (%s)') $hostlist rebooted"
+ echo "$(date +'%H:%M:%S (%s)') $hostlist rebooted; waithostlist: $waithostlist"
# We need to wait the rebooted hosts in case if
# facet_HOST == facetfailover_HOST
if ! [ -z "$waithostlist" ]; then
stack_trap "restore_layout $dir $layout" EXIT
}
+# Give the remote-directory and striped-directory switches their defaults.
+#
+# Arguments: $1 - NAME of the variable holding the "remote dirs" switch
+#            $2 - NAME of the variable holding the "striped dirs" switch
+# Globals:   MDSCOUNT, MDS1_VERSION (read); the two named variables (written)
+#
+# A variable that already has a non-empty value is left as it is.  An unset
+# or empty one becomes:
+#   MDSCOUNT > 1 and MDS1_VERSION >= 2.8.0  -> remote=true,  striped=true
+#   MDSCOUNT > 1 and MDS1_VERSION >= 2.5.0  -> remote=true,  striped=false
+#   anything else                           -> remote=false, striped=false
+init_stripe_dir_params() {
+	local varremote=$1
+	local varstriped=$2
+	local remote_default=false
+	local striped_default=false
+	# An empty MDS1_VERSION is compared as 0 instead of breaking the
+	# arithmetic expression; either way the defaults end up false.
+	local mds_version=${MDS1_VERSION:-0}
+
+	if ((MDSCOUNT > 1)); then
+		if ((mds_version >= $(version_code 2.8.0))); then
+			remote_default=true
+			striped_default=true
+		elif ((mds_version >= $(version_code 2.5.0))); then
+			remote_default=true
+		fi
+	fi
+
+	# printf -v assigns through the variable name without re-parsing the
+	# value as shell code, which eval would do.
+	printf -v "$varremote" '%s' "${!varremote:-$remote_default}"
+	printf -v "$varstriped" '%s' "${!varstriped:-$striped_default}"
+}
+
verify_yaml_layout() {
local src=$1
local dst=$2