rsize_iterator="*=2"
## which tests to run (first must be write)
-# remount) not really a test; just remount to uncache everything
+# clear_cache) not really a test; just uncache everything
# *write*) write
# *) read
#tests=(write rewrite read reread rewrite_again)
-tests=(write rewrite remount read reread)
+tests=(write rewrite clear_cache read reread)
# total # bytes written/read by any client node
min_per_client_size=4G
file_per_task=1
# the binaries
-IOR="/home/ericb/ior/src/C/IOR"
-llmount=/home/ericb/lustre/utils/llmount
+IOR=/usr/local/sbin/IOR
+llmount=llmount
pdsh=pdsh
# the result file prefix (date/time + hostname makes unique)
-#rslt=/home/ericb/ior_survey_`date +%F@%R`_`uname -n`
-rslt=/home/ericb/ior_survey
+rslt=/tmp/ior_survey_`date +%F@%R`_`uname -n`
# where lustre is mounted on the clients
lustre=/mnt/lustre
# basename of the test file(s)
testfile=${lustre}/ior_survey_testfile
-# how to unmount and remount the F/S on a client (to clear the cache)
-# change this depending on lustre config (network type, MDS etc)
-remount="umount $lustre && $llmount -o nettype=elan mdev6:/ll_mds/client $lustre"
-
# pdsh args required to instantiate all instances of IOR in parallel
# the chosen module must support '-n <procs-per-node>'
# -R<module>, -f<fanout> etc
tmpf=${workf}_tmp
echo -n > $tmpf
- if [ "$test" = "remount" ]; then
- echo "=> $remount" >> $tmpf
+ if [ "$test" = "clear_cache" ]; then
+ clear_cache='for LRU in /proc/fs/lustre/ldlm/namespaces/*/lru_size; do echo clear > $LRU; done' # handed to pdsh for the test clients; a semicolon directly after do is a shell syntax error
+ echo "=> $clear_cache" >> $tmpf
$pdsh -S -b -w "$test_clients" >> $tmpf 2>&1 \
- "$remount"
+ "$clear_cache"
status=$?
echo "Completion Status: $status" >> $tmpf
# ...or...
# echo_client instances (set 'client_names')
# ... use 'host:name' for obd instances on other nodes.
-ost_names=(ost{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16})
+
+# allow the OST names to be passed in as a whitespace-separated string (ost_names_str)
+ost_names_str=${ost_names_str:-""}
+if [ -n "$ost_names_str" ]; then
+ declare -a ost_names
+ count=0
+ for name in $ost_names_str; do
+ ost_names[$count]=$name
+ count=$((count+1))
+ done
+else
+ ost_names=(ost{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16})
+fi
+
#client_names=(ns8:ECHO_ns8 ns9:ECHO_ns9)
+client_names_str=${client_names_str:-""}
+if [ -n "$client_names_str" ]; then
+ # make sure we unset ost_names so that our client_names get noticed...
+ unset ost_names
+ declare -a client_names
+ count=0
+ for name in $client_names_str; do
+ client_names[$count]=$name
+ count=$((count+1))
+ done
+fi
# result file prefix (date/time + hostname makes unique)
# NB ensure path to it exists
-rslt=/home_nfs/eeb/obdfilter_survey_`date +%F@%R`_`uname -n`
+rslt=${rslt:-"/tmp/obdfilter_survey_`date +%F@%R`_`uname -n`"}
# lustre root (if running with own source tree)
-lustre_root=/home_nfs/eeb/lustre
+lustre_root=${lustre_root:-"/home_nfs/eeb/lustre"}
# what tests to run (first must be write)
-#tests=(write rewrite read reread rewrite_again)
-tests=(write rewrite read)
+tests_str=${tests_str:-""}
+if [ -n "$tests_str" ]; then
+ declare -a tests
+ count=0
+ for name in $tests_str; do
+ tests[$count]=$name
+ count=$((count+1))
+ done
+else
+ #tests=(write rewrite read reread rewrite_again)
+ tests=(write rewrite read)
+fi
# total size (MBytes) per obd instance
# large enough to avoid cache effects
# and to make test startup/shutdown overhead insignificant
-size=16384
+size=${size:-16384}
# record size (KBytes)
-rszlo=1024
-rszhi=1024
+rszlo=${rszlo:-1024}
+rszhi=${rszhi:-1024}
# number of objects per OST
-nobjlo=1
-nobjhi=512
+nobjlo=${nobjlo:-1}
+nobjhi=${nobjhi:-512}
# threads per OST (1024 max)
-thrlo=1
-thrhi=64
+thrlo=${thrlo:-1}
+thrhi=${thrhi:-64}
# restart from here iff all are defined
restart_rsz=
restart_nobj=1
# machine's page size (K)
-PAGE_SIZE=64
+# PAGE_SIZE may be preset by the caller; otherwise ask getconf for the page
+# size in bytes and convert it to KBytes.  This avoids depending on python,
+# whose python2-only 'print' statement fails under python3.
+if [ -z "$PAGE_SIZE" ]; then
+	if pagesize_bytes=$(getconf PAGESIZE 2>/dev/null); then
+		PAGE_SIZE=$((pagesize_bytes / 1024))
+	fi
+fi
+# fall back to 4K if the page size could not be determined
+PAGE_SIZE=${PAGE_SIZE:-4}
# max buffer_mem (total_threads * buffer size)
# (to avoid lctl ENOMEM problems)
verify=1
if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then
+ echo "tests: ${tests[@]}"
echo "First test must be 'write'" 1>&2
exit 1
fi
rsltf="${rslt}.summary"
workf="${rslt}.detail"
cmdsf="${rslt}.script"
+vmstatf="${rslt}.vmstat"
echo -n > $rsltf
echo -n > $workf
+declare -a vmstatpids
+
+# a little trick: passing lustre_root=" " (a single space) on the command line unsets it, so lctl is looked up via PATH below
+if [ "$lustre_root" == " " ]; then
+ unset lustre_root
+fi
+
if [ -z "$lustre_root" ]; then
- lctl=lctl
+ lctl=$(which lctl)
else
lctl=${lustre_root}/utils/lctl
fi
local nobj=$3
local rfile=$4
remote_shell $host $lctl --device $devno create $nobj > $rfile 2>&1
- n=(`awk < $rfile \
- '/is object id/ {obj=strtonum($6);\
- first=!not_first; not_first=1;\
- if (first) first_obj=obj;
- else if (obj != prev + 1) exit;\
- prev=obj; n++}\
- END {printf "%d %d\n", first_obj, n}'`)
- if ((n[1] != nobj)); then
- echo "ERROR"
- else
- echo ${n[0]}
+ first=0
+ prev=0
+ count=0
+ error=0
+ while read line; do
+ echo "$line" | grep -q 'is object id'
+ if [ $? -ne 0 ]; then
+ continue
+ fi
+ if [ $first -eq 0 ]; then
+ first=$(echo $line | awk '{print $6}')
+ first=$(printf "%d" $first)
+ prev=$first
+ count=1
+ else
+ obj=$(echo $line | awk '{print $6}')
+ obj=$(printf "%d" $obj)
+ diff=$((obj - (prev+1)))
+ if [ $diff -ne 0 ]; then
+ error=1
+ fi
+ prev=$obj
+ count=$((count+1))
+ fi
+ done < $rfile
+ if [ $nobj -ne $count ]; then
+ echo "ERROR: $nobj != $count" >&2
+ cat $rfile >&2
+ echo "ERROR"
+ elif [ $error -ne 0 ]; then
+ echo "ERROR: non contiguous objs found" >&2
+ echo "ERROR"
+ else
+ echo $first
fi
}
done
fi
+# get vmstat started
# disable portals debug and get obdecho loaded on all relevant hosts
unique_hosts=(`unique ${host_names[@]}`)
+pidcount=0
for host in ${unique_hosts[@]}; do
remote_shell $host "echo 0 > /proc/sys/portals/debug"
+ host_vmstatf=${vmstatf}_${host}
+ echo -n > $host_vmstatf
+ remote_shell $host "vmstat 5 >> $host_vmstatf" &
+ pid=$!
+ vmstatpids[$pidcount]=$pid
+ pidcount=$((pidcount+1))
do_unload_obdecho[$host]=0
if obdecho_loaded $host; then
- continue
+ continue
fi
load_obdecho $host
if obdecho_loaded $host; then
- do_unload_obdecho[$host]=1
- continue
- fi
+ do_unload_obdecho[$host]=1
+ continue
+ fi
echo "Can't load obdecho on $host" 1>&2
exit 1
done
done
# run tests
for test in ${tests[@]}; do
+ declare -a pidarray
+ for host in ${unique_hosts[@]}; do
+ echo "starting run for test: $test rsz: $rsz threads: $thr objects: $nobj" >> ${vmstatf}_${host}
+ done
print_summary -n "$test "
# create per-host script files
for host in ${unique_hosts[@]}; do
"$lctl > $tmpfi 2>&1 \\
--threads $thr -$snap $devno \\
test_brw $count `testname2type $test` q $pages ${thr}t${first_obj} &"
- done
- for host in ${unique_hosts[@]}; do
- echo "wait" >> ${cmdsf}_${host}
- done
- # timed run of all the per-host script files
- t0=`date +%s.%N`
- for host in ${unique_hosts[@]}; do
- remote_shell $host bash ${cmdsf}_${host}&
- done
- wait
- t1=`date +%s.%N`
- # clean up per-host script files
- for host in ${unique_hosts[@]}; do
- rm ${cmdsf}_${host}
- done
+ done
+ pidcount=0
+ for host in ${unique_hosts[@]}; do
+ echo "wait" >> ${cmdsf}_${host}
+ pidarray[$pidcount]=0
+ pidcount=$((pidcount+1))
+ done
+ # timed run of all the per-host script files
+ t0=`date +%s.%N`
+ pidcount=0
+ for host in ${unique_hosts[@]}; do
+ remote_shell $host bash ${cmdsf}_${host} &
+ pidarray[$pidcount]=$!
+ pidcount=$((pidcount+1))
+ done
+ pidcount=0
+ for host in ${unique_hosts[@]}; do
+ wait ${pidarray[$pidcount]}
+ pidcount=$((pidcount+1))
+ done
+ #wait
+ t1=`date +%s.%N`
+ # clean up per-host script files
+ for host in ${unique_hosts[@]}; do
+ rm ${cmdsf}_${host}
+ done
# compute bandwidth from total data / elapsed time
str=`awk "BEGIN {printf \"%7.2f \",\
$total_size / (( $t1 - $t0 ) * 1024)}"`
done
# unload any obdecho modules we loaded
+pidcount=0
for host in ${unique_hosts[@]}; do
+ remote_shell $host "killall vmstat" &
+ pid=$!
+ kill -term ${vmstatpids[$pidcount]}
+ kill -kill ${vmstatpids[$pidcount]} 2>/dev/null
+ wait $pid
+ pidcount=$((pidcount+1))
if ((${do_unload_obdecho[$host]})); then
- unload_obdecho $host
+ unload_obdecho $host
fi
done
+
+exit 0