#!/bin/bash

######################################################################
# customize per survey

# specify obd instances to exercise
# these can be either...
# obdfilter instances (set 'ost_names')
# ...or...
# echo_client instances (set 'client_names')
# ... use 'host:name' for obd instances on other nodes.
ost_names=(ost{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16})
#client_names=(ns8:ECHO_ns8 ns9:ECHO_ns9)

# result file prefix (date/time + hostname makes unique)
# NB ensure path to it exists
rslt=/home_nfs/eeb/obdfilter_survey_$(date +%F@%R)_$(uname -n)

# lustre root (if running with own source tree)
lustre_root=/home_nfs/eeb/lustre

# what tests to run (first must be write)
#tests=(write rewrite read reread rewrite_again)
tests=(write rewrite read)

# total size (MBytes) per obd instance
# large enough to avoid cache effects
# and to make test startup/shutdown overhead insignificant
size=16384

# record size (KBytes)
rszlo=1024
rszhi=1024

# number of objects per OST
nobjlo=1
nobjhi=512

# threads per OST (1024 max)
thrlo=1
thrhi=64

# restart from here iff all are defined
restart_rsz=
restart_thr=1
restart_nobj=1

# machine's page size (K)
PAGE_SIZE=64

# max buffer_mem (total_threads * buffer size)
# (to avoid lctl ENOMEM problems)
max_buffer_mem=$((1024*1024))

# how to run commands on other nodes
# You need to make this work on your cluster if you have specified
# non-local obd instances above
#
# Usage:     custom_remote_shell HOST COMMAND [ARG...]
# Arguments: HOST    - node to run on
#            COMMAND - remaining arguments are joined with spaces and
#                      interpreted by the remote shell
# Outputs:   whatever the remote command writes to stdout/stderr
# Returns:   exit status of the remote shell transport (ssh)
custom_remote_shell () {
    # keep everything local so the caller's own 'host' (used as a loop
    # variable elsewhere in this script) is never clobbered
    local host=$1
    shift
    local cmds="$*"
    local here
    here=$(pwd)

    # escape the directory for the remote shell so a path containing
    # whitespace or shell metacharacters still reaches 'cd' as one word
    local remote_dir
    printf -v remote_dir '%q' "$here"

    # Hop on to the remote node, chdir to 'here' and run the given
    # commands. One of the following will probably work.
    ssh "$host" "cd $remote_dir; $cmds"
    #rsh "$host" "cd $remote_dir; $cmds"
    # we have to remove the leading `uname -n`: from pdsh output lines
    #pdsh -w "$host" "cd $remote_dir; $cmds" | sed 's/^[^:]*://'
}

#####################################################################
# leave the rest of this alone unless you know what you're doing...
# binaries lsmod="/sbin/lsmod" modprobe="/sbin/modprobe" insmod="/sbin/insmod" rmmod="/sbin/rmmod" # lctl::test_brw bandwidth snapshot interval (seconds) snap=1 # check file contents? verify=1 if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then echo "First test must be 'write'" 1>&2 exit 1 fi rsltf="${rslt}.summary" workf="${rslt}.detail" cmdsf="${rslt}.script" echo -n > $rsltf echo -n > $workf if [ -z "$lustre_root" ]; then lctl=lctl else lctl=${lustre_root}/utils/lctl fi remote_shell () { host=$1 shift cmds="$*" if [ "$host" = "localhost" -o "$host" = `uname -n` ]; then eval "$cmds" else custom_remote_shell $host "$cmds" fi } obdecho_loaded() { local host=$1 remote_shell $host $lsmod | grep obdecho > /dev/null 2>&1 } load_obdecho () { local host=$1 if [ -z "$lustre_root" ]; then remote_shell $host $modprobe obdecho elif [ -f ${lustre_root}/obdecho/obdecho.ko ]; then remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.ko else remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.o fi } unload_obdecho () { local host=$1 remote_shell $host $rmmod obdecho } get_devno () { local host=$1 local type=$2 local name=$3 remote_shell $host $lctl device_list | \ awk "{if (\$2 == \"UP\" && \$3 == \"$type\" && \$4 == \"$name\") {\ print \$1; exit}}" } get_ec_devno () { local host=$1 local client_name="$2" local ost_name="$3" if [ -z "$client_name" ]; then if [ -z "$ost_name" ]; then echo "client and ost name both null" 1>&2 return fi client_name=${ost_name}_echo_client fi ec=`get_devno $host echo_client $client_name` if [ -n "$ec" ]; then echo $ec $client_name 0 return fi if [ -z "$ost_name" ]; then echo "no echo client and ost_name not set" 1>&2 return fi ost=`get_devno $host obdfilter $ost_name` if [ -z "$ost" ]; then echo "OST $ost_name not setup" 1>&2 return fi remote_shell $host "$lctl <&2 return fi echo $ec $client_name 1 } teardown_ec_devno () { local host=$1 local client_name=$2 remote_shell $host "$lctl < $rfile 2>&1 first=0 prev=0 count=0 error=0 
while read line; do echo "$line" | grep -q 'is object id' if [ $? -ne 0 ]; then continue fi if [ $first -eq 0 ]; then first=$(echo $line | awk '{print $6}') first=$(printf "%d" $first) prev=$first count=1 else obj=$(echo $line | awk '{print $6}') obj=$(printf "%d" $obj) diff=$((obj - (prev+1))) if [ $diff -ne 0 ]; then error=1 fi prev=$obj count=$((count+1)) fi done < $rfile if [ $nobj -ne $count ]; then echo "ERROR: $nobj != $count" >&2 cat $rfile >&2 echo "ERROR" elif [ $error -ne 0 ]; then echo "ERROR: non contiguous objs found" >&2 echo "ERROR" else echo $first fi } destroy_objects () { local host=$1 local devno=$2 local obj0=$3 local nobj=$4 local rfile=$5 remote_shell $host $lctl --device $devno destroy $obj0 $nobj > $rfile 2>&1 } get_stats () { local rfile=$1 awk < $rfile \ '/^Selected device [0-9]+$/ {n = 0; next}\ /error/ {n = -1; exit}\ /^[0-9]+\/[0-9]+ Total: [0-9]+\.[0-9]+\/second$/ {n++; v=strtonum($3); \ if (n == 1 || v < min) min = v;\ if (n == 1 || v > max) max = v;\ next}\ {if (n != 0) {n = -1; exit}}\ END {printf "%d %f %f\n", n, min, max}' } get_global_stats () { local rfile=$1 awk < $rfile 'BEGIN {n = 0;}\ {n++; if (n == 1) {err = $1; min = $2; max = $3} else\ {if ($1 < err) err = $1;\ if ($2 < min) min = $2;\ if ($3 > max) max = $3}}\ END {if (n == 0) err = 0;\ printf "%d %f %f\n", err, min, max}' } testname2type () { # 'x' disables data check if ((verify)); then x="" else x="x" fi case $1 in *write*) echo "w$x";; *) echo "r$x";; esac } print_summary () { if [ "$1" = "-n" ]; then minusn=$1; shift else minusn="" fi echo $minusn "$*" >> $rsltf echo $minusn "$*" } unique () { echo "$@" | xargs -n1 echo | sort -u } split_hostname () { local name=$1 case $name in *:*) host=`echo $name | sed 's/:.*$//'` name=`echo $name | sed 's/[^:]*://'` ;; *) host=localhost ;; esac echo "$host $name" } # split out hostnames from client/ost names ndevs=${#client_names[@]} if ((ndevs != 0)); then if ((${#ost_names[@]} != 0)); then echo "Please specify client_names 
or ost_names, but not both" 1>&2 exit 1 fi for ((i=0; i&2 exit 1 fi for ((i=0; i /proc/sys/portals/debug" do_unload_obdecho[$host]=0 if obdecho_loaded $host; then continue fi load_obdecho $host if obdecho_loaded $host; then do_unload_obdecho[$host]=1 continue fi echo "Can't load obdecho on $host" 1>&2 exit 1 done # get all the echo_client device numbers and names for ((i=0; i $str" >> $workf print_summary -n "$str" if ((total_thr * actual_rsz > max_buffer_mem)); then print_summary "Too much buffer space" continue fi # create the objects tmpf="${workf}_tmp" for ((idx=0; idx < ndevs; idx++)); do host=${host_names[$idx]} devno=${devnos[$idx]} client_name="${host}:${client_names[$idx]}" echo "=============> Create $nobj on $client_name" >> $workf first_obj=`create_objects $host $devno $nobj $tmpf` cat $tmpf >> $workf rm $tmpf if [ $first_obj = "ERROR" ]; then print_summary "created object #s on $client_name not contiguous" exit 1 fi first_objs[$idx]=$first_obj done # run tests for test in ${tests[@]}; do print_summary -n "$test " # create per-host script files for host in ${unique_hosts[@]}; do echo -n > ${cmdsf}_${host} done for ((idx=0; idx < ndevs; idx++)); do host=${host_names[$idx]} devno=${devnos[$idx]} tmpfi="${tmpf}_$idx" first_obj=${first_objs[$idx]} echo >> ${cmdsf}_${host} \ "$lctl > $tmpfi 2>&1 \\ --threads $thr -$snap $devno \\ test_brw $count `testname2type $test` q $pages ${thr}t${first_obj} &" done for host in ${unique_hosts[@]}; do echo "wait" >> ${cmdsf}_${host} done # timed run of all the per-host script files t0=`date +%s.%N` for host in ${unique_hosts[@]}; do remote_shell $host bash ${cmdsf}_${host}& done wait t1=`date +%s.%N` # clean up per-host script files for host in ${unique_hosts[@]}; do rm ${cmdsf}_${host} done # compute bandwidth from total data / elapsed time str=`awk "BEGIN {printf \"%7.2f \",\ $total_size / (( $t1 - $t0 ) * 1024)}"` print_summary -n "$str" # collect/check individual OST stats echo -n > $tmpf for ((idx=0; idx < ndevs; 
idx++)); do client_name="${host_names[$idx]}:${client_names[$idx]}" tmpfi="${tmpf}_$idx" echo "=============> $test $client_name" >> $workf cat $tmpfi >> $workf get_stats $tmpfi >> $tmpf rm $tmpfi done # compute/display global min/max stats echo "=============> $test global" >> $workf cat $tmpf >> $workf stats=(`get_global_stats $tmpf`) rm $tmpf if ((stats[0] <= 0)); then if ((stats[0] < 0)); then str=`printf "%17s " ERROR` else str=`printf "%17s " SHORT` fi else str=`awk "BEGIN {printf \"[%7.2f,%7.2f] \",\ (${stats[1]} * $actual_rsz)/1024,\ (${stats[2]} * $actual_rsz)/1024; exit}"` fi print_summary -n "$str" done print_summary "" # destroy objects we created for ((idx=0; idx < ndevs; idx++)); do host=${host_names[$idx]} devno=${devnos[$idx]} client_name="${host}:${client_names[$idx]}" first_obj=${first_objs[$idx]} echo "=============> Destroy $nobj on $client_name" >> $workf destroy_objects $host $devno $first_obj $nobj $tmpf cat $tmpf >> $workf rm $tmpf done done done done # tear down any echo clients we created for ((i=0; i